/* $NetBSD: secmodel_suser.c,v 1.58 2024/03/01 22:01:03 andvar Exp $ */
/*-
* Copyright (c) 2006 Elad Efrat <elad@NetBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This file contains kauth(9) listeners needed to implement the traditional
* NetBSD superuser access restrictions.
*
* There are two main resources a request can be issued to: user-owned and
 * system-owned. For the former, traditional Unix access checks are done, as
* well as superuser checks. If needed, the request context is examined before
* a decision is made. For the latter, usually only superuser checks are done
* as normal users are not allowed to access system resources.
*/
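/*
 * Illustrative sketch (not part of this file): kernel code does not call
 * these listeners directly.  It issues a request on a kauth(9) scope and
 * every listener registered on that scope, including the ones below, gets
 * to vote.  For example, the reboot path asks the system scope roughly
 * like this:
 *
 *	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_REBOOT,
 *	    0, NULL, NULL, NULL);
 *	if (error)
 *		return error;
 *
 * With this model loaded, secmodel_suser_system_cb() below answers
 * KAUTH_RESULT_ALLOW for the superuser and defers otherwise.
 */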
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: secmodel_suser.c,v 1.58 2024/03/01 22:01:03 andvar Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/mount.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <secmodel/secmodel.h>
#include <secmodel/suser/suser.h>
MODULE(MODULE_CLASS_SECMODEL, suser, NULL);
static kauth_listener_t l_generic, l_system, l_process, l_network, l_machdep,
l_device, l_vnode;
static secmodel_t suser_sm;
SYSCTL_SETUP(sysctl_security_suser_setup, "secmodel_suser sysctl")
{
const struct sysctlnode *rnode;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "models", NULL,
NULL, 0, NULL, 0,
CTL_SECURITY, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "suser", NULL,
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "name", NULL,
NULL, 0, __UNCONST(SECMODEL_SUSER_NAME), 0,
CTL_CREATE, CTL_EOL);
}
void
secmodel_suser_init(void)
{
}
void
secmodel_suser_start(void)
{
l_generic = kauth_listen_scope(KAUTH_SCOPE_GENERIC,
secmodel_suser_generic_cb, NULL);
l_system = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
secmodel_suser_system_cb, NULL);
l_process = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
secmodel_suser_process_cb, NULL);
l_network = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
secmodel_suser_network_cb, NULL);
l_machdep = kauth_listen_scope(KAUTH_SCOPE_MACHDEP,
secmodel_suser_machdep_cb, NULL);
l_device = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
secmodel_suser_device_cb, NULL);
l_vnode = kauth_listen_scope(KAUTH_SCOPE_VNODE,
secmodel_suser_vnode_cb, NULL);
}
void
secmodel_suser_stop(void)
{
kauth_unlisten_scope(l_generic);
kauth_unlisten_scope(l_system);
kauth_unlisten_scope(l_process);
kauth_unlisten_scope(l_network);
kauth_unlisten_scope(l_machdep);
kauth_unlisten_scope(l_device);
kauth_unlisten_scope(l_vnode);
}
static bool
suser_isroot(kauth_cred_t cred)
{
return kauth_cred_geteuid(cred) == 0;
}
static int
suser_eval(const char *what, void *arg, void *ret)
{
int error = 0;
if (strcasecmp(what, "is-root") == 0) {
kauth_cred_t cred = arg;
bool *bp = ret;
*bp = suser_isroot(cred);
} else {
error = ENOENT;
}
return error;
}
static int
suser_modcmd(modcmd_t cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MODULE_CMD_INIT:
error = secmodel_register(&suser_sm,
SECMODEL_SUSER_ID, SECMODEL_SUSER_NAME,
NULL, suser_eval, NULL);
if (error != 0)
printf("suser_modcmd::init: secmodel_register "
"returned %d\n", error);
secmodel_suser_init();
secmodel_suser_start();
break;
case MODULE_CMD_FINI:
secmodel_suser_stop();
error = secmodel_deregister(suser_sm);
if (error != 0)
printf("suser_modcmd::fini: secmodel_deregister "
"returned %d\n", error);
break;
case MODULE_CMD_AUTOUNLOAD:
error = EPERM;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Generic
* Responsibility: Superuser access
*/
int
secmodel_suser_generic_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_GENERIC_ISSUSER:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: System
* Responsibility: Superuser access
*/
int
secmodel_suser_system_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
enum kauth_system_req req;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
req = (enum kauth_system_req)(uintptr_t)arg0;
switch (action) {
case KAUTH_SYSTEM_CPU:
switch (req) {
case KAUTH_REQ_SYSTEM_CPU_SETSTATE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_DEVMAPPER:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_FS_QUOTA:
switch (req) {
case KAUTH_REQ_SYSTEM_FS_QUOTA_GET:
case KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF:
case KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE:
case KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_SYSVIPC:
switch (req) {
case KAUTH_REQ_SYSTEM_SYSVIPC_BYPASS:
case KAUTH_REQ_SYSTEM_SYSVIPC_SHM_LOCK:
case KAUTH_REQ_SYSTEM_SYSVIPC_SHM_UNLOCK:
case KAUTH_REQ_SYSTEM_SYSVIPC_MSGQ_OVERSIZE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_MOUNT:
switch (req) {
case KAUTH_REQ_SYSTEM_MOUNT_DEVICE:
case KAUTH_REQ_SYSTEM_MOUNT_GET:
case KAUTH_REQ_SYSTEM_MOUNT_NEW:
case KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT:
case KAUTH_REQ_SYSTEM_MOUNT_UPDATE:
case KAUTH_REQ_SYSTEM_MOUNT_UMAP:
if (isroot) {
result = KAUTH_RESULT_ALLOW;
break;
}
break;
default:
break;
}
break;
case KAUTH_SYSTEM_MQUEUE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_PSET:
switch (req) {
case KAUTH_REQ_SYSTEM_PSET_ASSIGN:
case KAUTH_REQ_SYSTEM_PSET_BIND:
case KAUTH_REQ_SYSTEM_PSET_CREATE:
case KAUTH_REQ_SYSTEM_PSET_DESTROY:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_TIME:
switch (req) {
case KAUTH_REQ_SYSTEM_TIME_ADJTIME:
case KAUTH_REQ_SYSTEM_TIME_NTPADJTIME:
case KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS:
case KAUTH_REQ_SYSTEM_TIME_SYSTEM:
case KAUTH_REQ_SYSTEM_TIME_RTCOFFSET:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_SEMAPHORE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_SYSCTL:
switch (req) {
case KAUTH_REQ_SYSTEM_SYSCTL_ADD:
case KAUTH_REQ_SYSTEM_SYSCTL_DELETE:
case KAUTH_REQ_SYSTEM_SYSCTL_DESC:
case KAUTH_REQ_SYSTEM_SYSCTL_MODIFY:
case KAUTH_REQ_SYSTEM_SYSCTL_PRVT:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_SWAPCTL:
case KAUTH_SYSTEM_ACCOUNTING:
case KAUTH_SYSTEM_REBOOT:
case KAUTH_SYSTEM_CHROOT:
case KAUTH_SYSTEM_FILEHANDLE:
case KAUTH_SYSTEM_MKNOD:
case KAUTH_SYSTEM_SETIDCORE:
case KAUTH_SYSTEM_MODULE:
case KAUTH_SYSTEM_FS_RESERVEDSPACE:
case KAUTH_SYSTEM_MAP_VA_ZERO:
case KAUTH_SYSTEM_FS_EXTATTR:
case KAUTH_SYSTEM_FS_SNAPSHOT:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_DEBUG:
break;
case KAUTH_SYSTEM_CHSYSFLAGS:
/* Deprecated. */
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_VERIEXEC:
switch (req) {
case KAUTH_REQ_SYSTEM_VERIEXEC_ACCESS:
case KAUTH_REQ_SYSTEM_VERIEXEC_MODIFY:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_LFS:
switch (req) {
case KAUTH_REQ_SYSTEM_LFS_MARKV:
case KAUTH_REQ_SYSTEM_LFS_BMAPV:
case KAUTH_REQ_SYSTEM_LFS_SEGCLEAN:
case KAUTH_REQ_SYSTEM_LFS_SEGWAIT:
case KAUTH_REQ_SYSTEM_LFS_FCNTL:
if (isroot)
				result = KAUTH_RESULT_ALLOW;
			break;
		default:
break;
}
break;
case KAUTH_SYSTEM_INTR:
switch (req) {
case KAUTH_REQ_SYSTEM_INTR_AFFINITY:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_KERNADDR:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Process
* Responsibility: Superuser access
*/
int
secmodel_suser_process_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_PROCESS_SIGNAL:
case KAUTH_PROCESS_KTRACE:
case KAUTH_PROCESS_PROCFS:
case KAUTH_PROCESS_PTRACE:
case KAUTH_PROCESS_SCHEDULER_GETPARAM:
case KAUTH_PROCESS_SCHEDULER_SETPARAM:
case KAUTH_PROCESS_SCHEDULER_GETAFFINITY:
case KAUTH_PROCESS_SCHEDULER_SETAFFINITY:
case KAUTH_PROCESS_SETID:
case KAUTH_PROCESS_KEVENT_FILTER:
case KAUTH_PROCESS_NICE:
case KAUTH_PROCESS_FORK:
case KAUTH_PROCESS_CORENAME:
case KAUTH_PROCESS_STOPFLAG:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_PROCESS_CANSEE: {
unsigned long req;
req = (unsigned long)arg1;
switch (req) {
case KAUTH_REQ_PROCESS_CANSEE_ARGS:
case KAUTH_REQ_PROCESS_CANSEE_ENTRY:
case KAUTH_REQ_PROCESS_CANSEE_OPENFILES:
case KAUTH_REQ_PROCESS_CANSEE_EPROC:
case KAUTH_REQ_PROCESS_CANSEE_KPTR:
if (isroot) {
result = KAUTH_RESULT_ALLOW;
break;
}
break;
case KAUTH_REQ_PROCESS_CANSEE_ENV:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
}
case KAUTH_PROCESS_RLIMIT: {
enum kauth_process_req req;
req = (enum kauth_process_req)(uintptr_t)arg1;
switch (req) {
case KAUTH_REQ_PROCESS_RLIMIT_SET:
case KAUTH_REQ_PROCESS_RLIMIT_GET:
case KAUTH_REQ_PROCESS_RLIMIT_BYPASS:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
}
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Network
* Responsibility: Superuser access
*/
int
secmodel_suser_network_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
enum kauth_network_req req;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
req = (enum kauth_network_req)(uintptr_t)arg0;
switch (action) {
case KAUTH_NETWORK_ALTQ:
switch (req) {
case KAUTH_REQ_NETWORK_ALTQ_AFMAP:
case KAUTH_REQ_NETWORK_ALTQ_BLUE:
case KAUTH_REQ_NETWORK_ALTQ_CBQ:
case KAUTH_REQ_NETWORK_ALTQ_CDNR:
case KAUTH_REQ_NETWORK_ALTQ_CONF:
case KAUTH_REQ_NETWORK_ALTQ_FIFOQ:
case KAUTH_REQ_NETWORK_ALTQ_HFSC:
case KAUTH_REQ_NETWORK_ALTQ_JOBS:
case KAUTH_REQ_NETWORK_ALTQ_PRIQ:
case KAUTH_REQ_NETWORK_ALTQ_RED:
case KAUTH_REQ_NETWORK_ALTQ_RIO:
case KAUTH_REQ_NETWORK_ALTQ_WFQ:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_BIND:
switch (req) {
case KAUTH_REQ_NETWORK_BIND_PORT:
case KAUTH_REQ_NETWORK_BIND_PRIVPORT:
case KAUTH_REQ_NETWORK_BIND_ANYADDR:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_FIREWALL:
switch (req) {
case KAUTH_REQ_NETWORK_FIREWALL_FW:
case KAUTH_REQ_NETWORK_FIREWALL_NAT:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_FORWSRCRT:
case KAUTH_NETWORK_ROUTE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_NETWORK_INTERFACE:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_GET:
case KAUTH_REQ_NETWORK_INTERFACE_SET:
case KAUTH_REQ_NETWORK_INTERFACE_GETPRIV:
case KAUTH_REQ_NETWORK_INTERFACE_SETPRIV:
case KAUTH_REQ_NETWORK_INTERFACE_FIRMWARE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_BRIDGE:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_BRIDGE_GETPRIV:
case KAUTH_REQ_NETWORK_INTERFACE_BRIDGE_SETPRIV:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_PPP:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_PPP_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_PVC:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_PVC_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_SLIP:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_SLIP_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_TUN:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_TUN_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_IPV6:
switch (req) {
case KAUTH_REQ_NETWORK_IPV6_HOPBYHOP:
case KAUTH_REQ_NETWORK_IPV6_JOIN_MULTICAST:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_NFS:
switch (req) {
case KAUTH_REQ_NETWORK_NFS_EXPORT:
case KAUTH_REQ_NETWORK_NFS_SVC:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_SMB:
switch (req) {
case KAUTH_REQ_NETWORK_SMB_SHARE_ACCESS:
case KAUTH_REQ_NETWORK_SMB_SHARE_CREATE:
case KAUTH_REQ_NETWORK_SMB_VC_ACCESS:
case KAUTH_REQ_NETWORK_SMB_VC_CREATE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_WG:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_WG_GETPRIV:
case KAUTH_REQ_NETWORK_INTERFACE_WG_SETPRIV:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_SOCKET:
switch (req) {
case KAUTH_REQ_NETWORK_SOCKET_DROP:
case KAUTH_REQ_NETWORK_SOCKET_OPEN:
case KAUTH_REQ_NETWORK_SOCKET_RAWSOCK:
case KAUTH_REQ_NETWORK_SOCKET_SETPRIV:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_NETWORK_SOCKET_CANSEE:
if (isroot) {
result = KAUTH_RESULT_ALLOW;
break;
}
break;
default:
break;
}
break;
case KAUTH_NETWORK_IPSEC:
switch (req) {
case KAUTH_REQ_NETWORK_IPSEC_BYPASS:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Machdep
* Responsibility: Superuser access
*/
int
secmodel_suser_machdep_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_MACHDEP_CPU_UCODE_APPLY:
case KAUTH_MACHDEP_IOPERM_GET:
case KAUTH_MACHDEP_LDT_GET:
case KAUTH_MACHDEP_LDT_SET:
case KAUTH_MACHDEP_MTRR_GET:
case KAUTH_MACHDEP_CACHEFLUSH:
case KAUTH_MACHDEP_IOPERM_SET:
case KAUTH_MACHDEP_IOPL:
case KAUTH_MACHDEP_MTRR_SET:
case KAUTH_MACHDEP_NVRAM:
case KAUTH_MACHDEP_UNMANAGEDMEM:
case KAUTH_MACHDEP_PXG:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_MACHDEP_SVS_DISABLE:
/* Deprecated. */
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Device
* Responsibility: Superuser access
*/
int
secmodel_suser_device_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_DEVICE_BLUETOOTH_SETPRIV:
case KAUTH_DEVICE_BLUETOOTH_SEND:
case KAUTH_DEVICE_BLUETOOTH_RECV:
case KAUTH_DEVICE_TTY_OPEN:
case KAUTH_DEVICE_TTY_PRIVSET:
case KAUTH_DEVICE_TTY_STI:
case KAUTH_DEVICE_TTY_VIRTUAL:
case KAUTH_DEVICE_RND_ADDDATA:
case KAUTH_DEVICE_RND_ADDDATA_ESTIMATE:
case KAUTH_DEVICE_RND_GETPRIV:
case KAUTH_DEVICE_RND_SETPRIV:
case KAUTH_DEVICE_WSCONS_KEYBOARD_BELL:
case KAUTH_DEVICE_WSCONS_KEYBOARD_KEYREPEAT:
case KAUTH_DEVICE_NVMM_CTL:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_DEVICE_BLUETOOTH_BCSP:
case KAUTH_DEVICE_BLUETOOTH_BTUART: {
enum kauth_device_req req;
req = (enum kauth_device_req)(uintptr_t)arg0;
switch (req) {
case KAUTH_REQ_DEVICE_BLUETOOTH_BCSP_ADD:
case KAUTH_REQ_DEVICE_BLUETOOTH_BTUART_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
}
case KAUTH_DEVICE_GPIO_PINSET:
/*
* root can access gpio pins, secmodel_securelevel can veto
* this decision.
*/
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return (result);
}
int
secmodel_suser_vnode_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
if (isroot) {
/* Superuser can execute only if the file's executable. */
if ((action & KAUTH_VNODE_EXECUTE) == 0 ||
(action & KAUTH_VNODE_IS_EXEC))
result = KAUTH_RESULT_ALLOW;
}
return (result);
}
/* $NetBSD: subr_pcq.c,v 1.20 2023/02/24 11:02:27 riastradh Exp $ */
/*-
* Copyright (c) 2009, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Lockless producer/consumer queue.
*
* Summary of the producer algorithm in pcq_put (may run many in
* parallel with each other and with a consumer):
*
* P1. initialize an item
*
* P2. atomic_cas(&pcq->pcq_pc) loop to advance the producer
* pointer, reserving a space at c (fails if not enough space)
*
* P3. atomic_store_release(&pcq->pcq_items[c], item) to publish
* the item in the space it reserved
*
* Summary of the consumer algorithm in pcq_get (must be serialized by
* caller with other consumers, may run in parallel with any number of
* producers):
*
* C1. atomic_load_relaxed(&pcq->pcq_pc) to get the consumer
* pointer and a snapshot of the producer pointer, which may
* point to null items or point to initialized items (fails if
* no space reserved for published items yet)
*
* C2. atomic_load_consume(&pcq->pcq_items[c]) to get the next
* unconsumed but potentially published item (fails if item
* not published yet)
*
* C3. pcq->pcq_items[c] = NULL to consume the next unconsumed but
* published item
*
* C4. membar_producer
*
* C5. atomic_cas(&pcq->pcq_pc) loop to advance the consumer
* pointer
*
* C6. use the item
*
* Note that there is a weird bare membar_producer which is not matched
* by membar_consumer. This is one of the rare cases of a memory
* barrier on one side that is not matched by a memory barrier on
* another side, but the ordering works out, with a somewhat more
* involved proof.
*
* Some properties that need to be proved:
*
* Theorem 1. For pcq_put call that leads into pcq_get:
* Initializing item at P1 is dependency-ordered before usage of
* item at C6, so items placed by pcq_put can be safely used by
* the caller of pcq_get.
*
* Proof sketch.
*
* Assume load/store P2 synchronizes with load/store C1
* (if not, pcq_get fails in `if (p == c) return NULL').
*
* Assume store-release P3 synchronizes with load-consume
* C2 (if not, pcq_get fails in `if (item == NULL) return
* NULL').
*
* Then:
*
* - P1 is sequenced before store-release P3
* - store-release P3 synchronizes with load-consume C2
* - load-consume C2 is dependency-ordered before C6
*
* Hence transitively, P1 is dependency-ordered before C6,
* QED.
*
* Theorem 2. For pcq_get call followed by pcq_put: Nulling out
* location at store C3 happens before placing a new item in the
* same location at store P3, so items are not lost.
*
* Proof sketch.
*
* Assume load/store C5 synchronizes with load/store P2
 * (otherwise pcq_put starts over its CAS loop or fails).
*
* Then:
*
* - store C3 is sequenced before membar_producer C4
* - membar_producer C4 is sequenced before load/store C5
* - load/store C5 synchronizes with load/store P2 at &pcq->pcq_pc
* - P2 is sequenced before store-release P3
*
* Hence transitively, store C3 happens before
* store-release P3, QED.
*/
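/*
 * Illustrative usage sketch (assumed names, not part of this file): any
 * number of producers may call pcq_put() concurrently, but all pcq_get()
 * and pcq_peek() calls must be serialized by the caller, typically by
 * consuming from a single softint or thread.  Because pcq_get() may
 * return NULL while an item is still being published (step C2 above),
 * producers must also post a notification:
 *
 *	// producer, any context
 *	if (pcq_put(sc->sc_queue, item))
 *		softint_schedule(sc->sc_si);	// wake the lone consumer
 *	else
 *		drop(item);			// queue full
 *
 *	// consumer, inside the softint handler (naturally serialized)
 *	while ((item = pcq_get(sc->sc_queue)) != NULL)
 *		process(item);
 *
 * sc_queue, sc_si, drop() and process() are placeholders.
 */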
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_pcq.c,v 1.20 2023/02/24 11:02:27 riastradh Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/pcq.h>
/*
* Internal producer-consumer queue structure. Note: providing a separate
* cache-line both for pcq_t::pcq_pc and pcq_t::pcq_items.
*/
struct pcq {
u_int pcq_nitems;
uint8_t pcq_pad1[COHERENCY_UNIT - sizeof(u_int)];
volatile uint32_t pcq_pc;
uint8_t pcq_pad2[COHERENCY_UNIT - sizeof(uint32_t)];
void * volatile pcq_items[];
};
/*
* Producer (p) - stored in the lower 16 bits of pcq_t::pcq_pc.
* Consumer (c) - in the higher 16 bits.
*
* We have a limitation of 16 bits i.e. 0xffff items in the queue.
* The PCQ_MAXLEN constant is set accordingly.
*/
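/*
 * Worked example (illustrative): with pcq_pc == 0x00070003, pcq_split()
 * yields p == 3 and c == 7, and pcq_combine(3, 7) rebuilds 0x00070003.
 * Both indices wrap back to 0 in pcq_advance() once they reach
 * pcq_nitems.
 */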
static inline void
pcq_split(uint32_t v, u_int *p, u_int *c)
{
*p = v & 0xffff;
*c = v >> 16;
}
static inline uint32_t
pcq_combine(u_int p, u_int c)
{
return p | (c << 16);
}
static inline u_int
pcq_advance(pcq_t *pcq, u_int pc)
{
if (__predict_false(++pc == pcq->pcq_nitems)) {
return 0;
}
return pc;
}
/*
* pcq_put: place an item at the end of the queue.
*/
bool
pcq_put(pcq_t *pcq, void *item)
{
uint32_t v, nv;
u_int op, p, c;
KASSERT(item != NULL);
do {
v = atomic_load_relaxed(&pcq->pcq_pc);
pcq_split(v, &op, &c);
p = pcq_advance(pcq, op);
if (p == c) {
/* Queue is full. */
return false;
}
nv = pcq_combine(p, c);
} while (atomic_cas_32(&pcq->pcq_pc, v, nv) != v);
/*
* Ensure that the update to pcq_pc is globally visible before the
* data item. See pcq_get(). This also ensures that any changes
* that the caller made to the data item are globally visible
* before we put it onto the list.
*/
atomic_store_release(&pcq->pcq_items[op], item);
/*
* Synchronization activity to wake up the consumer will ensure
* that the update to pcq_items[] is visible before the wakeup
* arrives. So, we do not need an additional memory barrier here.
*/
return true;
}
/*
* pcq_peek: return the next item from the queue without removal.
*/
void *
pcq_peek(pcq_t *pcq)
{
const uint32_t v = atomic_load_relaxed(&pcq->pcq_pc);
u_int p, c;
pcq_split(v, &p, &c);
/* See comment on race below in pcq_get(). */
return (p == c) ? NULL : atomic_load_consume(&pcq->pcq_items[c]);
}
/*
* pcq_get: remove and return the next item for consumption or NULL if empty.
*
* => The caller must prevent concurrent gets from occurring.
*/
void *
pcq_get(pcq_t *pcq)
{
uint32_t v, nv;
u_int p, c;
void *item;
v = atomic_load_relaxed(&pcq->pcq_pc);
pcq_split(v, &p, &c);
if (p == c) {
/* Queue is empty: nothing to return. */
return NULL;
}
item = atomic_load_consume(&pcq->pcq_items[c]);
if (item == NULL) {
/*
* Raced with sender: we rely on a notification (e.g. softint
* or wakeup) being generated after the producer's pcq_put(),
* causing us to retry pcq_get() later.
*/
return NULL;
}
/*
* We have exclusive access to this slot, so no need for
* atomic_store_*.
*/
pcq->pcq_items[c] = NULL;
c = pcq_advance(pcq, c);
nv = pcq_combine(p, c);
/*
* Ensure that update to pcq_items[c] becomes globally visible
* before the update to pcq_pc. If it were reordered to occur
* after it, we could in theory wipe out a modification made
* to pcq_items[c] by pcq_put().
*
* No need for load-before-store ordering of membar_release
* because the only load we need to ensure happens first is the
* load of pcq->pcq_items[c], but that necessarily happens
* before the store to pcq->pcq_items[c] to null it out because
* it is at the same memory location. Yes, this is a bare
* membar_producer with no matching membar_consumer.
*/
membar_producer();
while (__predict_false(atomic_cas_32(&pcq->pcq_pc, v, nv) != v)) {
v = atomic_load_relaxed(&pcq->pcq_pc);
pcq_split(v, &p, &c);
c = pcq_advance(pcq, c);
nv = pcq_combine(p, c);
}
return item;
}
pcq_t *
pcq_create(size_t nitems, km_flag_t kmflags)
{
pcq_t *pcq;
KASSERT(nitems > 0);
KASSERT(nitems <= PCQ_MAXLEN);
pcq = kmem_zalloc(offsetof(pcq_t, pcq_items[nitems]), kmflags);
if (pcq != NULL) {
pcq->pcq_nitems = nitems;
}
return pcq;
}
void
pcq_destroy(pcq_t *pcq)
{
kmem_free(pcq, offsetof(pcq_t, pcq_items[pcq->pcq_nitems]));
}
size_t
pcq_maxitems(pcq_t *pcq)
{
return pcq->pcq_nitems;
}
/* $NetBSD: syscall.c,v 1.22 2023/10/05 19:41:06 ad Exp $ */
/*-
* Copyright (c) 1998, 2000, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: syscall.c,v 1.22 2023/10/05 19:41:06 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/signal.h>
#include <sys/ktrace.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscall_stats.h>
#include <uvm/uvm_extern.h>
#include <machine/cpu.h>
#include <machine/psl.h>
#include <machine/userret.h>
#include "opt_dtrace.h"
#ifndef __x86_64__
int x86_copyargs(void *, void *, size_t);
#endif
void syscall_intern(struct proc *);
static void syscall(struct trapframe *);
void
md_child_return(struct lwp *l)
{
struct trapframe *tf = l->l_md.md_regs;
X86_TF_RAX(tf) = 0;
X86_TF_RFLAGS(tf) &= ~PSL_C;
userret(l);
}
/*
* Process the tail end of a posix_spawn() for the child.
*/
void
cpu_spawn_return(struct lwp *l)
{
userret(l);
}
/*
* syscall(frame):
* System call request from POSIX system call gate interface to kernel.
* Like trap(), argument is call by reference.
*/
#ifdef KDTRACE_HOOKS
void syscall(struct trapframe *);
#else
static
#endif
void
syscall(struct trapframe *frame)
{
const struct sysent *callp;
struct proc *p;
struct lwp *l;
int error;
register_t code, rval[2];
#ifdef __x86_64__
/* Verify that the syscall args will fit in the trapframe space */
CTASSERT(offsetof(struct trapframe, tf_arg9) >=
sizeof(register_t) * (2 + SYS_MAXSYSARGS - 1));
#define args (&frame->tf_rdi)
#else
register_t args[2 + SYS_MAXSYSARGS];
#endif
l = curlwp;
p = l->l_proc;
code = X86_TF_RAX(frame) & (SYS_NSYSENT - 1);
callp = p->p_emul->e_sysent + code;
SYSCALL_COUNT(syscall_counts, code);
SYSCALL_TIME_SYS_ENTRY(l, syscall_times, code);
#ifdef __x86_64__
/*
* The first 6 syscall args are passed in rdi, rsi, rdx, r10, r8 and r9
* (rcx gets copied to r10 in the libc stub because the syscall
* instruction overwrites %cx) and are together in the trap frame
* with space following for 4 more entries.
*/
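	/*
	 * Worked example (descriptive only): for a syscall taking eight
	 * register_t arguments, args 0..5 arrive in the registers saved
	 * in the trap frame above, while args 6 and 7 sit on the user
	 * stack just above the return address.  That is why the copyin
	 * below starts at tf_rsp + 1 and copies sy_argsize - 6 * 8 bytes
	 * into tf_arg6.
	 */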
if (__predict_false(callp->sy_argsize > 6 * 8)) {
error = copyin((register_t *)frame->tf_rsp + 1,
&frame->tf_arg6, callp->sy_argsize - 6 * 8);
if (error != 0)
goto bad;
}
#else
if (callp->sy_argsize) {
error = x86_copyargs((char *)frame->tf_esp + sizeof(int), args,
callp->sy_argsize);
if (__predict_false(error != 0))
goto bad;
}
#endif
	error = sy_invoke(callp, l, args, rval, code);

	if (__predict_true(error == 0)) {
		X86_TF_RAX(frame) = rval[0];
X86_TF_RDX(frame) = rval[1];
X86_TF_RFLAGS(frame) &= ~PSL_C; /* carry bit */
} else {
switch (error) {
case ERESTART:
/*
* The offset to adjust the PC by depends on whether we
* entered the kernel through the trap or call gate.
* We saved the instruction size in tf_err on entry.
*/
X86_TF_RIP(frame) -= frame->tf_err;
break;
case EJUSTRETURN:
/* nothing to do */
break;
default:
bad:
X86_TF_RAX(frame) = error;
X86_TF_RFLAGS(frame) |= PSL_C; /* carry bit */
break;
}
}
SYSCALL_TIME_SYS_EXIT(l);
userret(l);
}
void
syscall_intern(struct proc *p)
{
p->p_md.md_syscall = syscall;
}
/* $NetBSD: in6_pcb.c,v 1.177 2022/11/04 09:04:27 ozaki-r Exp $ */
/* $KAME: in6_pcb.c,v 1.84 2001/02/08 18:02:08 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6_pcb.c,v 1.177 2022/11/04 09:04:27 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/domain.h>
#include <sys/once.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip6.h>
#include <netinet/portalgo.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/scope6_var.h>
#include "faith.h"
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <netinet/tcp_vtw.h>
const struct in6_addr zeroin6_addr;
#define IN6PCBHASH_PORT(table, lport) \
&(table)->inpt_porthashtbl[ntohs(lport) & (table)->inpt_porthash]
#define IN6PCBHASH_BIND(table, laddr, lport) \
&(table)->inpt_bindhashtbl[ \
(((laddr)->s6_addr32[0] ^ (laddr)->s6_addr32[1] ^ \
(laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3]) + ntohs(lport)) & \
(table)->inpt_bindhash]
#define IN6PCBHASH_CONNECT(table, faddr, fport, laddr, lport) \
&(table)->inpt_bindhashtbl[ \
((((faddr)->s6_addr32[0] ^ (faddr)->s6_addr32[1] ^ \
(faddr)->s6_addr32[2] ^ (faddr)->s6_addr32[3]) + ntohs(fport)) + \
(((laddr)->s6_addr32[0] ^ (laddr)->s6_addr32[1] ^ \
(laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3]) + \
ntohs(lport))) & (table)->inpt_bindhash]
int ip6_anonportmin = IPV6PORT_ANONMIN;
int ip6_anonportmax = IPV6PORT_ANONMAX;
int ip6_lowportmin = IPV6PORT_RESERVEDMIN;
int ip6_lowportmax = IPV6PORT_RESERVEDMAX;
void
in6pcb_init(struct inpcbtable *table, int bindhashsize, int connecthashsize)
{
inpcb_init(table, bindhashsize, connecthashsize);
table->inpt_lastport = (in_port_t)ip6_anonportmax;
}
/*
* Bind address from sin6 to inp.
*/
static int
in6pcb_bind_addr(struct inpcb *inp, struct sockaddr_in6 *sin6, struct lwp *l)
{
int error;
int s;
/*
* We should check the family, but old programs
* incorrectly fail to initialize it.
*/
if (sin6->sin6_family != AF_INET6)
return EAFNOSUPPORT;
#ifndef INET
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr))
return EADDRNOTAVAIL;
#endif
if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0)
return error;
s = pserialize_read_enter();
	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
error = EINVAL;
goto out;
}
if (sin6->sin6_addr.s6_addr32[3]) {
struct sockaddr_in sin;
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
bcopy(&sin6->sin6_addr.s6_addr32[3],
&sin.sin_addr, sizeof(sin.sin_addr));
if (!IN_MULTICAST(sin.sin_addr.s_addr)) {
struct ifaddr *ifa;
ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
if (ifa == NULL &&
(inp->inp_flags & IN6P_BINDANY) == 0) {
error = EADDRNOTAVAIL;
goto out;
}
}
}
} else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
// succeed
} else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
struct ifaddr *ifa = NULL;
if ((inp->inp_flags & IN6P_FAITH) == 0) {
ifa = ifa_ifwithaddr(sin6tosa(sin6));
if (ifa == NULL &&
(inp->inp_flags & IN6P_BINDANY) == 0) {
error = EADDRNOTAVAIL;
goto out;
}
}
/*
 * Binding to an anycast address might accidentally
 * cause a packet to be sent with an anycast source
 * address, so we forbid it.
 *
 * We should allow binding to a deprecated address,
 * since the application dares to use it.
 * But can we assume that applications are careful enough
 * to check whether the address is deprecated or not?
 * Maybe, as a safeguard, we should have a setsockopt
 * flag to control the bind(2) behavior against
 * deprecated addresses (default: forbid bind(2)).
*/
if (ifa &&
ifatoia6(ifa)->ia6_flags &
(IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED)) {
error = EADDRNOTAVAIL;
goto out;
}
}
in6p_laddr(inp) = sin6->sin6_addr;
error = 0;
out:
pserialize_read_exit(s);
return error;
}
/*
* Bind port from sin6 to inp.
*/
static int
in6pcb_bind_port(struct inpcb *inp, struct sockaddr_in6 *sin6, struct lwp *l)
{
struct inpcbtable *table = inp->inp_table;
struct socket *so = inp->inp_socket;
int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
int error;
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 ||
	    (so->so_options & SO_ACCEPTCONN) == 0))
wild = 1;
if (sin6->sin6_port != 0) {
enum kauth_network_req req;
#ifndef IPNOPRIVPORTS
if (ntohs(sin6->sin6_port) < IPV6PORT_RESERVED)
req = KAUTH_REQ_NETWORK_BIND_PRIVPORT;
else
#endif /* IPNOPRIVPORTS */
req = KAUTH_REQ_NETWORK_BIND_PORT;
error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_BIND,
req, so, sin6, NULL);
if (error)
return EACCES;
}
if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 * allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if (so->so_options & (SO_REUSEADDR | SO_REUSEPORT))
reuseport = SO_REUSEADDR|SO_REUSEPORT;
}
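	/*
	 * Illustrative consequence (sketch, userland view): with the
	 * widening above, two sockets that each set SO_REUSEADDR, e.g.
	 *
	 *	int on = 1;
	 *	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
	 *
	 * may both bind(2) the same multicast group and port, which the
	 * conflict checks below would otherwise reject with EADDRINUSE.
	 */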
	if (sin6->sin6_port != 0) {
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
#ifdef INET
struct inpcb *t;
struct vestigial_inpcb vestige;
t = inpcb_lookup_local(table,
*(struct in_addr *)&sin6->sin6_addr.s6_addr32[3],
sin6->sin6_port, wild, &vestige);
if (t && (reuseport & t->inp_socket->so_options) == 0)
return EADDRINUSE;
if (!t
&& vestige.valid
&& !(reuseport && vestige.reuse_port))
return EADDRINUSE;
#else
return EADDRNOTAVAIL;
#endif
}
{
struct inpcb *t;
struct vestigial_inpcb vestige;
t = in6pcb_lookup_local(table, &sin6->sin6_addr,
sin6->sin6_port, wild, &vestige);
if (t && (reuseport & t->inp_socket->so_options) == 0)
return EADDRINUSE;
if (!t
&& vestige.valid
&& !(reuseport && vestige.reuse_port))
return EADDRINUSE;
}
}
if (sin6->sin6_port == 0) {
int e;
e = in6pcb_set_port(sin6, inp, l);
if (e != 0)
return e;
} else {
inp->inp_lport = sin6->sin6_port;
inpcb_set_state(inp, INP_BOUND);
}
	LIST_REMOVE(inp, inp_lhash);
	LIST_INSERT_HEAD(IN6PCBHASH_PORT(table, inp->inp_lport),
inp, inp_lhash);
return 0;
}
int
in6pcb_bind(void *v, struct sockaddr_in6 *sin6, struct lwp *l)
{
struct inpcb *inp = v;
struct sockaddr_in6 lsin6;
int error;
if (inp->inp_af != AF_INET6)
return EINVAL;
/*
 * If we already have a local port or a local address, it means we're
 * already bound.
 */
	if (inp->inp_lport || !(IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) ||
	    (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) &&
	    in6p_laddr(inp).s6_addr32[3] == 0)))
return EINVAL;
if (NULL != sin6) {
/* We were provided a sockaddr_in6 to use. */
if (sin6->sin6_len != sizeof(*sin6))
return EINVAL;
} else {
/* We always bind to *something*, even if it's "anything". */
lsin6 = *((const struct sockaddr_in6 *)
inp->inp_socket->so_proto->pr_domain->dom_sa_any);
sin6 = &lsin6;
}
/* Bind address. */
error = in6pcb_bind_addr(inp, sin6, l);
if (error)
return error;
/* Bind port. */
error = in6pcb_bind_port(inp, sin6, l);
if (error) {
/*
* Reset the address here to "any" so we don't "leak" the
* inpcb.
*/
in6p_laddr(inp) = in6addr_any;
return error;
}
#if 0
in6p_flowinfo(inp) = 0; /* XXX */
#endif
return 0;
}
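/*
 * Illustrative caller sketch (assumption, simplified): a transport
 * protocol's bind hook typically hands the user-supplied name straight
 * to in6pcb_bind() with the socket already locked by its caller:
 *
 *	struct inpcb *inp = sotoinpcb(so);
 *
 *	error = in6pcb_bind(inp, (struct sockaddr_in6 *)nam, l);
 *
 * Passing a NULL sockaddr binds to the wildcard address and an
 * anonymous port, as the late-bind path above does.
 */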
/*
* Connect from a socket to a specified address.
* Both address and port must be specified in argument sin6.
 * If we don't have a local address for this socket yet,
* then pick one.
*/
int
in6pcb_connect(void *v, struct sockaddr_in6 *sin6, struct lwp *l)
{
struct inpcb *inp = v;
struct in6_addr *in6a = NULL;
struct in6_addr ia6;
struct ifnet *ifp = NULL; /* outgoing interface */
int error = 0;
int scope_ambiguous = 0;
#ifdef INET
struct in6_addr mapped;
#endif
struct sockaddr_in6 tmp;
struct vestigial_inpcb vestige;
struct psref psref;
int bound;
(void)&in6a; /* XXX fool gcc */
if (inp->inp_af != AF_INET6)
return EINVAL;
if (sin6->sin6_len != sizeof(*sin6))
return EINVAL;
if (sin6->sin6_family != AF_INET6)
return EAFNOSUPPORT;
if (sin6->sin6_port == 0)
return EADDRNOTAVAIL;
if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) &&
inp->inp_socket->so_type == SOCK_STREAM)
return EADDRNOTAVAIL;
if (sin6->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0)
return error;
/* sanity check for mapped address case */
	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
			return EINVAL;
		if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)))
			in6p_laddr(inp).s6_addr16[5] = htons(0xffff);
		if (!IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)))
			return EINVAL;
} else
{
if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)))
return EINVAL;
}
/* protect *sin6 from overwrites */
tmp = *sin6;
sin6 = &tmp;
bound = curlwp_bind();
/* Source address selection. */
if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) &&
in6p_laddr(inp).s6_addr32[3] == 0) {
#ifdef INET
struct sockaddr_in sin;
struct in_ifaddr *ia4;
struct psref _psref;
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
memcpy(&sin.sin_addr, &sin6->sin6_addr.s6_addr32[3],
sizeof(sin.sin_addr));
ia4 = in_selectsrc(&sin, &inp->inp_route,
inp->inp_socket->so_options, NULL, &error, &_psref);
if (ia4 == NULL) {
if (error == 0)
error = EADDRNOTAVAIL;
curlwp_bindx(bound);
return error;
}
memset(&mapped, 0, sizeof(mapped));
mapped.s6_addr16[5] = htons(0xffff);
memcpy(&mapped.s6_addr32[3], &IA_SIN(ia4)->sin_addr,
sizeof(IA_SIN(ia4)->sin_addr));
ia4_release(ia4, &_psref);
in6a = &mapped;
#else
curlwp_bindx(bound);
return EADDRNOTAVAIL;
#endif
} else {
/*
* XXX: in6_selectsrc might replace the bound local address
* with the address specified by setsockopt(IPV6_PKTINFO).
* Is it the intended behavior?
*/
error = in6_selectsrc(sin6, in6p_outputopts(inp),
in6p_moptions(inp), &inp->inp_route, &in6p_laddr(inp),
&ifp, &psref, &ia6);
if (error == 0)
in6a = &ia6;
if (ifp && scope_ambiguous &&
(error = in6_setscope(&sin6->sin6_addr, ifp, NULL)) != 0) {
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
if (in6a == NULL) {
if_put(ifp, &psref);
			curlwp_bindx(bound);
			if (error == 0)
error = EADDRNOTAVAIL;
return error;
}
}
if (ifp != NULL) {
in6p_ip6(inp).ip6_hlim = (u_int8_t)in6pcb_selecthlim(inp, ifp);
if_put(ifp, &psref);
} else
		in6p_ip6(inp).ip6_hlim = (u_int8_t)in6pcb_selecthlim_rt(inp);

	curlwp_bindx(bound);

	if (in6pcb_lookup(inp->inp_table, &sin6->sin6_addr,
sin6->sin6_port,
IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) ? in6a : &in6p_laddr(inp),
inp->inp_lport, 0, &vestige)
|| vestige.valid)
return EADDRINUSE;
	if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) ||
	    (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) &&
in6p_laddr(inp).s6_addr32[3] == 0))
{
if (inp->inp_lport == 0) {
error = in6pcb_bind(inp, NULL, l);
if (error != 0)
return error;
}
in6p_laddr(inp) = *in6a;
}
in6p_faddr(inp) = sin6->sin6_addr;
inp->inp_fport = sin6->sin6_port;
/* Late bind, if needed */
if (inp->inp_bindportonsend) {
struct sockaddr_in6 lsin = *((const struct sockaddr_in6 *)
inp->inp_socket->so_proto->pr_domain->dom_sa_any);
lsin.sin6_addr = in6p_laddr(inp);
lsin.sin6_port = 0;
if ((error = in6pcb_bind_port(inp, &lsin, l)) != 0)
return error;
}
inpcb_set_state(inp, INP_CONNECTED);
in6p_flowinfo(inp) &= ~IPV6_FLOWLABEL_MASK;
if (ip6_auto_flowlabel)
in6p_flowinfo(inp) |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
#if defined(IPSEC)
	if (ipsec_enabled && inp->inp_socket->so_type == SOCK_STREAM)
		ipsec_pcbconn(inp->inp_sp);
#endif
return 0;
}
void
in6pcb_disconnect(struct inpcb *inp)
{
memset((void *)&in6p_faddr(inp), 0, sizeof(in6p_faddr(inp)));
inp->inp_fport = 0;
inpcb_set_state(inp, INP_BOUND);
in6p_flowinfo(inp) &= ~IPV6_FLOWLABEL_MASK;
#if defined(IPSEC)
	if (ipsec_enabled)
		ipsec_pcbdisconn(inp->inp_sp);
#endif
	if (inp->inp_socket->so_state & SS_NOFDREF)
		inpcb_destroy(inp);
}
void
in6pcb_fetch_sockaddr(struct inpcb *inp, struct sockaddr_in6 *sin6)
{
if (inp->inp_af != AF_INET6)
return;
sockaddr_in6_init(sin6, &in6p_laddr(inp), inp->inp_lport, 0, 0);
(void)sa6_recoverscope(sin6); /* XXX: should catch errors */
}
void
in6pcb_fetch_peeraddr(struct inpcb *inp, struct sockaddr_in6 *sin6)
{
if (inp->inp_af != AF_INET6)
return;
sockaddr_in6_init(sin6, &in6p_faddr(inp), inp->inp_fport, 0, 0);
(void)sa6_recoverscope(sin6); /* XXX: should catch errors */
}
/*
* Pass some notification to all connections of a protocol
* associated with address dst. The local address and/or port numbers
* may be specified to limit the search. The "usual action" will be
* taken, depending on the ctlinput cmd. The caller must filter any
* cmds that are uninteresting (e.g., no error in the map).
* Call the protocol specific routine (if any) to report
* any errors for each matching socket.
*
* Must be called at splsoftnet.
*
* Note: src (4th arg) carries the flowlabel value on the original IPv6
* header, in sin6_flowinfo member.
*/
int
in6pcb_notify(struct inpcbtable *table, const struct sockaddr *dst,
u_int fport_arg, const struct sockaddr *src, u_int lport_arg, int cmd,
void *cmdarg, void (*notify)(struct inpcb *, int))
{
struct inpcb *inp;
struct sockaddr_in6 sa6_src;
const struct sockaddr_in6 *sa6_dst;
in_port_t fport = fport_arg, lport = lport_arg;
int errno;
int nmatch = 0;
u_int32_t flowinfo;
if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6)
return 0;
sa6_dst = (const struct sockaddr_in6 *)dst;
if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr))
return 0;
/*
 * note that src can be NULL when we get notified by local fragmentation.
*/
sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src;
flowinfo = sa6_src.sin6_flowinfo;
/*
* Redirects go to all references to the destination,
* and use in6pcb_rtchange to invalidate the route cache.
* Dead host indications: also use in6pcb_rtchange to invalidate
* the cache, and deliver the error to all the sockets.
* Otherwise, if we have knowledge of the local port and address,
* deliver only to that socket.
*/
if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) {
fport = 0;
lport = 0;
memset((void *)&sa6_src.sin6_addr, 0, sizeof(sa6_src.sin6_addr));
if (cmd != PRC_HOSTDEAD)
notify = in6pcb_rtchange;
}
errno = inet6ctlerrmap[cmd];
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
struct rtentry *rt = NULL;
if (inp->inp_af != AF_INET6)
continue;
/*
* Under the following condition, notify of redirects
* to the pcb, without making address matches against inpcb.
 * - a redirect notification has arrived.
* - the inpcb is unconnected.
* - the inpcb is caching !RTF_HOST routing entry.
* - the ICMPv6 notification is from the gateway cached in the
* inpcb. i.e. ICMPv6 notification is from nexthop gateway
* the inpcb used very recently.
*
* This is to improve interaction between netbsd/openbsd
* redirect handling code, and inpcb route cache code.
* without the clause, !RTF_HOST routing entry (which carries
* gateway used by inpcb right before the ICMPv6 redirect)
* will be cached forever in unconnected inpcb.
*
 * There still is a question regarding what is TRT:
* - On bsdi/freebsd, RTF_HOST (cloned) routing entry will be
* generated on packet output. inpcb will always cache
* RTF_HOST routing entry so there's no need for the clause
* (ICMPv6 redirect will update RTF_HOST routing entry,
* and inpcb is caching it already).
* However, bsdi/freebsd are vulnerable to local DoS attacks
* due to the cloned routing entries.
* - Specwise, "destination cache" is mentioned in RFC2461.
* Jinmei says that it implies bsdi/freebsd behavior, itojun
* is not really convinced.
* - Having hiwat/lowat on # of cloned host route (redirect/
* pmtud) may be a good idea. netbsd/openbsd has it. see
* icmp6_mtudisc_update().
*/
if ((PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) &&
IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) &&
(rt = rtcache_validate(&inp->inp_route)) != NULL &&
!(rt->rt_flags & RTF_HOST)) {
const struct sockaddr_in6 *dst6;
dst6 = (const struct sockaddr_in6 *)
rtcache_getdst(&inp->inp_route);
if (dst6 == NULL)
;
else if (IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr,
&sa6_dst->sin6_addr)) {
rtcache_unref(rt, &inp->inp_route);
goto do_notify;
}
}
rtcache_unref(rt, &inp->inp_route);
/*
* If the error designates a new path MTU for a destination
* and the application (associated with this socket) wanted to
* know the value, notify. Note that we notify for all
* disconnected sockets if the corresponding application
 * wanted it.  This is because some UDP applications keep their
 * sending sockets disconnected.
 * XXX: should we avoid notifying the value to TCP sockets?
*/
if (cmd == PRC_MSGSIZE && (inp->inp_flags & IN6P_MTU) != 0 &&
(IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)) ||
IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &sa6_dst->sin6_addr))) {
ip6_notify_pmtu(inp, (const struct sockaddr_in6 *)dst,
(u_int32_t *)cmdarg);
}
/*
* Detect if we should notify the error. If no source and
* destination ports are specified, but non-zero flowinfo and
* local address match, notify the error. This is the case
* when the error is delivered with an encrypted buffer
* by ESP. Otherwise, just compare addresses and ports
* as usual.
*/
if (lport == 0 && fport == 0 && flowinfo &&
inp->inp_socket != NULL &&
flowinfo == (in6p_flowinfo(inp) & IPV6_FLOWLABEL_MASK) &&
IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &sa6_src.sin6_addr))
goto do_notify;
else if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp),
&sa6_dst->sin6_addr) ||
inp->inp_socket == NULL ||
(lport && inp->inp_lport != lport) ||
(!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) &&
!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp),
&sa6_src.sin6_addr)) ||
(fport && inp->inp_fport != fport))
continue;
do_notify:
if (notify)
(*notify)(inp, errno);
nmatch++;
}
return nmatch;
}
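/*
 * Illustrative caller sketch (assumed names): a transport ctlinput
 * routine reacting to an ICMPv6 error typically funnels it through
 * in6pcb_notify(), roughly:
 *
 *	nmatch = in6pcb_notify(table, dst, fport, src, lport, cmd,
 *	    cmdarg, notifyfn);
 *
 * where dst/src are the sockaddr pair recovered from the offending
 * packet, cmd is the PRC_* code, and notifyfn reports the error to each
 * matching socket (overridden with in6pcb_rtchange for redirects, as
 * arranged above).
 */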
void
in6pcb_purgeif0(struct inpcbtable *table, struct ifnet *ifp)
{
struct inpcb *inp;
struct ip6_moptions *im6o;
struct in6_multi_mship *imm, *nimm;
KASSERT(ifp != NULL);
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
bool need_unlock = false;
if (inp->inp_af != AF_INET6)
continue;
/* The caller holds either one of inps' lock */
if (!inp_locked(inp)) {
inp_lock(inp);
need_unlock = true;
}
im6o = in6p_moptions(inp);
if (im6o) {
/*
* Unselect the outgoing interface if it is being
* detached.
*/
if (im6o->im6o_multicast_if_index == ifp->if_index)
im6o->im6o_multicast_if_index = 0;
/*
* Drop multicast group membership if we joined
* through the interface being detached.
* XXX controversial - is it really legal for kernel
* to force this?
*/
LIST_FOREACH_SAFE(imm, &im6o->im6o_memberships,
i6mm_chain, nimm) {
if (imm->i6mm_maddr->in6m_ifp == ifp) {
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
}
}
in_purgeifmcast(inp->inp_moptions, ifp);
if (need_unlock)
inp_unlock(inp);
}
}
void
in6pcb_purgeif(struct inpcbtable *table, struct ifnet *ifp)
{
struct rtentry *rt;
struct inpcb *inp;
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET6)
continue;
if ((rt = rtcache_validate(&inp->inp_route)) != NULL &&
rt->rt_ifp == ifp) {
rtcache_unref(rt, &inp->inp_route);
in6pcb_rtchange(inp, 0);
} else
rtcache_unref(rt, &inp->inp_route);
}
}
/*
* After a routing change, flush old routing. A new route can be
* allocated the next time output is attempted.
*/
void
in6pcb_rtchange(struct inpcb *inp, int errno)
{
if (inp->inp_af != AF_INET6)
return;
rtcache_free(&inp->inp_route);
/*
* A new route can be allocated the next time
* output is attempted.
*/
}
struct inpcb *
in6pcb_lookup_local(struct inpcbtable *table, struct in6_addr *laddr6,
u_int lport_arg, int lookup_wildcard, struct vestigial_inpcb *vp)
{
struct inpcbhead *head;
struct inpcb *inp, *match = NULL;
int matchwild = 3, wildcard;
in_port_t lport = lport_arg;
	if (vp)
		vp->valid = 0;
head = IN6PCBHASH_PORT(table, lport);
	LIST_FOREACH(inp, head, inp_lhash) {
		if (inp->inp_af != AF_INET6)
continue;
if (inp->inp_lport != lport)
continue;
wildcard = 0;
		if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)))
wildcard++;
		if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) {
			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
if (!IN6_IS_ADDR_V4MAPPED(laddr6))
continue;
/* duplicate of IPv4 logic */
wildcard = 0;
if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp)) && in6p_faddr(inp).s6_addr32[3])
wildcard++;
if (!in6p_laddr(inp).s6_addr32[3]) {
if (laddr6->s6_addr32[3])
wildcard++;
} else {
if (!laddr6->s6_addr32[3])
wildcard++;
else {
if (in6p_laddr(inp).s6_addr32[3] !=
laddr6->s6_addr32[3])
continue;
}
}
		} else if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) {
			if (IN6_IS_ADDR_V4MAPPED(laddr6)) {
				if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
}
if (!IN6_IS_ADDR_UNSPECIFIED(laddr6))
wildcard++;
} else {
			if (IN6_IS_ADDR_V4MAPPED(laddr6)) {
				if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
}
			if (IN6_IS_ADDR_UNSPECIFIED(laddr6))
				wildcard++;
else {
if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp),
laddr6))
continue;
}
}
if (wildcard && !lookup_wildcard)
continue;
if (wildcard < matchwild) {
match = inp;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
if (match && matchwild == 0)
return match;
if (vp && table->vestige && table->vestige->init_ports6) {
struct vestigial_inpcb better;
bool has_better = false;
void *state;
state = (*table->vestige->init_ports6)(laddr6,
lport_arg,
lookup_wildcard);
		while (table->vestige &&
		    (*table->vestige->next_port6)(state, vp)) {
			if (vp->lport != lport)
continue;
wildcard = 0;
if (!IN6_IS_ADDR_UNSPECIFIED(&vp->faddr.v6))
wildcard++;
			if (IN6_IS_ADDR_UNSPECIFIED(&vp->laddr.v6)) {
				if (!IN6_IS_ADDR_UNSPECIFIED(laddr6))
wildcard++;
} else {
				if (IN6_IS_ADDR_V4MAPPED(laddr6)) {
					if (vp->v6only)
continue;
}
				if (IN6_IS_ADDR_UNSPECIFIED(laddr6))
					wildcard++;
else {
if (!IN6_ARE_ADDR_EQUAL(&vp->laddr.v6, laddr6))
continue;
}
}
if (wildcard && !lookup_wildcard)
continue;
if (wildcard < matchwild) {
better = *vp;
has_better = true;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
if (has_better) {
*vp = better;
return 0;
}
}
return match;
}
/*
* WARNING: return value (rtentry) could be IPv4 one if inpcb is connected to
* IPv4 mapped address.
*/
struct rtentry *
in6pcb_rtentry(struct inpcb *inp)
{
struct rtentry *rt;
struct route *ro;
union {
const struct sockaddr *sa;
const struct sockaddr_in6 *sa6;
#ifdef INET
const struct sockaddr_in *sa4;
#endif
} cdst;
ro = &inp->inp_route;
if (inp->inp_af != AF_INET6)
return NULL;
	cdst.sa = rtcache_getdst(ro);
	if (cdst.sa == NULL)
;
#ifdef INET
else if (cdst.sa->sa_family == AF_INET) {
		KASSERT(IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp)));
		if (cdst.sa4->sin_addr.s_addr != in6p_faddr(inp).s6_addr32[3])
rtcache_free(ro);
}
#endif
else {
if (!IN6_ARE_ADDR_EQUAL(&cdst.sa6->sin6_addr,
&in6p_faddr(inp)))
rtcache_free(ro);
}
if ((rt = rtcache_validate(ro)) == NULL)
rt = rtcache_update(ro, 1);
#ifdef INET
if (rt == NULL && IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
union {
struct sockaddr dst;
struct sockaddr_in dst4;
} u;
struct in_addr addr;
addr.s_addr = in6p_faddr(inp).s6_addr32[3];
sockaddr_in_init(&u.dst4, &addr, 0);
if (rtcache_setdst(ro, &u.dst) != 0)
return NULL;
rt = rtcache_init(ro);
} else
#endif
if (rt == NULL && !IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) {
union {
struct sockaddr dst;
struct sockaddr_in6 dst6;
} u;
sockaddr_in6_init(&u.dst6, &in6p_faddr(inp), 0, 0, 0);
if (rtcache_setdst(ro, &u.dst) != 0)
return NULL;
rt = rtcache_init(ro);
}
return rt;
}
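/*
* Release the rtentry reference taken by in6pcb_rtentry() against the
* PCB's cached route.
*/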
void
in6pcb_rtentry_unref(struct rtentry *rt, struct inpcb *inp)
{
rtcache_unref(rt, &inp->inp_route);
}
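/*
* Look up a connected PCB by an exact match on foreign and local address
* and port. If no live PCB matches and a vestigial_inpcb was supplied,
* the table's vestigial entries are consulted and a hit is reported
* through *vp.
*/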
struct inpcb *
in6pcb_lookup(struct inpcbtable *table, const struct in6_addr *faddr6,
u_int fport_arg, const struct in6_addr *laddr6, u_int lport_arg,
int faith,
struct vestigial_inpcb *vp)
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t fport = fport_arg, lport = lport_arg;
if (vp)
vp->valid = 0;
head = IN6PCBHASH_CONNECT(table, faddr6, fport, laddr6, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET6)
continue;
/* find exact match on both source and dest */
if (inp->inp_fport != fport)
continue;
if (inp->inp_lport != lport)
continue;
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)))
continue;
if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), faddr6))
continue;
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)))
continue;
if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), laddr6))
continue;
if ((IN6_IS_ADDR_V4MAPPED(laddr6) || IN6_IS_ADDR_V4MAPPED(faddr6)) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY))
continue;
return inp;
}
if (vp && table->vestige) {
if ((*table->vestige->lookup6)(faddr6, fport_arg,
laddr6, lport_arg, vp))
return NULL;
}
return NULL;
}
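/*
* Look up a PCB bound to the given local address and port: first an exact
* address match, then (for an IPv4-mapped address) the mapped wildcard,
* and finally the unspecified address. A match is moved to the head of
* its hash chain to speed up subsequent lookups.
*/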
struct inpcb *
in6pcb_lookup_bound(struct inpcbtable *table, const struct in6_addr *laddr6,
u_int lport_arg, int faith)
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t lport = lport_arg;
#ifdef INET
struct in6_addr zero_mapped;
#endif
head = IN6PCBHASH_BIND(table, laddr6, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET6)
continue;
if (faith && (inp->inp_flags & IN6P_FAITH) == 0)
continue;
if (inp->inp_fport != 0)
continue;
if (inp->inp_lport != lport)
continue;
if (IN6_IS_ADDR_V4MAPPED(laddr6) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), laddr6))
goto out;
}
#ifdef INET
if (IN6_IS_ADDR_V4MAPPED(laddr6)) {
memset(&zero_mapped, 0, sizeof(zero_mapped));
zero_mapped.s6_addr16[5] = 0xffff;
head = IN6PCBHASH_BIND(table, &zero_mapped, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET6)
continue;
if (faith && (inp->inp_flags & IN6P_FAITH) == 0)
continue;
if (inp->inp_fport != 0)
continue;
if (inp->inp_lport != lport)
continue;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &zero_mapped))
goto out;
}
}
#endif
head = IN6PCBHASH_BIND(table, &zeroin6_addr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET6)
continue;
if (faith && (inp->inp_flags & IN6P_FAITH) == 0)
continue;
if (inp->inp_fport != 0)
continue;
if (inp->inp_lport != lport)
continue;
if (IN6_IS_ADDR_V4MAPPED(laddr6) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &zeroin6_addr))
goto out;
}
return NULL;
out:
if (inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
return inp;
}
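/*
* Rehash the PCB according to its new state: remove it from its current
* hash chain (if any) and insert it on the bind or connect chain before
* recording the new state.
*/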
void
in6pcb_set_state(struct inpcb *inp, int state)
{
if (inp->inp_af != AF_INET6)
return;
if (inp->inp_state > INP_ATTACHED)
LIST_REMOVE(inp, inp_hash);
switch (state) {
case INP_BOUND:
LIST_INSERT_HEAD(IN6PCBHASH_BIND(inp->inp_table,
&in6p_laddr(inp), inp->inp_lport), inp,
inp_hash);
break;
case INP_CONNECTED:
LIST_INSERT_HEAD(IN6PCBHASH_CONNECT(inp->inp_table,
&in6p_faddr(inp), inp->inp_fport,
&in6p_laddr(inp), inp->inp_lport), inp,
inp_hash);
break;
}
inp->inp_state = state;
}
/* $NetBSD: secmodel_extensions_vfs.c,v 1.1 2023/04/22 13:54:19 riastradh Exp $ */
/*-
* Copyright (c) 2011 Elad Efrat <elad@NetBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: secmodel_extensions_vfs.c,v 1.1 2023/04/22 13:54:19 riastradh Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/kauth.h>
#include <sys/vnode.h>
#include <secmodel/secmodel.h>
#include <secmodel/extensions/extensions.h>
#include <secmodel/extensions/extensions_impl.h>
static int dovfsusermount;
static int hardlink_check_uid;
static int hardlink_check_gid;
static kauth_listener_t l_system, l_vnode;
static int secmodel_extensions_system_cb(kauth_cred_t, kauth_action_t,
void *, void *, void *, void *, void *);
static int secmodel_extensions_vnode_cb(kauth_cred_t, kauth_action_t,
void *, void *, void *, void *, void *);
void
secmodel_extensions_vfs_start(void)
{
l_system = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
secmodel_extensions_system_cb, NULL);
l_vnode = kauth_listen_scope(KAUTH_SCOPE_VNODE,
secmodel_extensions_vnode_cb, NULL);
}
void
secmodel_extensions_vfs_stop(void)
{
kauth_unlisten_scope(l_system);
kauth_unlisten_scope(l_vnode);
}
void
secmodel_extensions_vfs_sysctl(struct sysctllog **clog,
const struct sysctlnode *rnode)
{
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "usermount",
SYSCTL_DESCR("Whether unprivileged users may mount "
"filesystems"),
sysctl_extensions_user_handler, 0, &dovfsusermount, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "hardlink_check_uid",
SYSCTL_DESCR("Whether unprivileged users can hardlink "\
"to files they don't own"),
sysctl_extensions_user_handler, 0,
&hardlink_check_uid, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "hardlink_check_gid",
SYSCTL_DESCR("Whether unprivileged users can hardlink "\
"to files that are not in their " \
"group membership"),
sysctl_extensions_user_handler, 0,
&hardlink_check_gid, 0,
CTL_CREATE, CTL_EOL);
/* Compatibility: vfs.generic.usermount */
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "generic",
SYSCTL_DESCR("Non-specific vfs related information"),
NULL, 0, NULL, 0,
CTL_VFS, VFS_GENERIC, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "usermount",
SYSCTL_DESCR("Whether unprivileged users may mount "
"filesystems"),
sysctl_extensions_user_handler, 0, &dovfsusermount, 0,
CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
}
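/*
* kauth(9) system scope listener: implements the "usermount" extension by
* allowing unprivileged mount, unmount and update requests when usermount
* is enabled and the caller owns the mount point (for new mounts) or the
* mount itself (for unmount/update), subject to the common mount policy.
*/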
static int
secmodel_extensions_system_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
vnode_t *vp;
struct vattr va;
struct mount *mp;
u_long flags;
int result;
enum kauth_system_req req;
int error;
req = (enum kauth_system_req)(uintptr_t)arg0;
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_SYSTEM_MOUNT:
if (dovfsusermount == 0)
break;
switch (req) {
case KAUTH_REQ_SYSTEM_MOUNT_NEW:
vp = (vnode_t *)arg1;
mp = vp->v_mount;
flags = (u_long)arg2;
/*
* Ensure that the user owns the directory onto which
* the mount is attempted.
*/
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &va, cred);
VOP_UNLOCK(vp);
if (error)
break;
if (va.va_uid != kauth_cred_geteuid(cred))
break;
error = usermount_common_policy(mp, flags);
if (error)
break;
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT:
mp = arg1;
/* Must own the mount. */
if (mp->mnt_stat.f_owner == kauth_cred_geteuid(cred))
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_SYSTEM_MOUNT_UPDATE:
mp = arg1;
flags = (u_long)arg2;
/* Must own the mount. */
if (mp->mnt_stat.f_owner == kauth_cred_geteuid(cred) &&
usermount_common_policy(mp, flags) == 0)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
default:
break;
}
return (result);
}
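/*
* kauth(9) vnode scope listener: enforces the hardlink_check_uid and
* hardlink_check_gid restrictions on KAUTH_VNODE_ADD_LINK; a failed check
* is denied unless the credential is root according to the suser secmodel.
*/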
static int
secmodel_extensions_vnode_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
int error;
bool isroot;
struct vattr va;
if ((action & KAUTH_VNODE_ADD_LINK) == 0)
return KAUTH_RESULT_DEFER;
error = VOP_GETATTR((vnode_t *)arg0, &va, cred);
if (error)
goto checkroot;
if (hardlink_check_uid && kauth_cred_geteuid(cred) != va.va_uid)
goto checkroot;
if (hardlink_check_gid && kauth_cred_groupmember(cred, va.va_gid) != 0)
goto checkroot;
return KAUTH_RESULT_DEFER;
checkroot:
error = secmodel_eval("org.netbsd.secmodel.suser", "is-root",
cred, &isroot);
if (error || !isroot)
return KAUTH_RESULT_DENY;
return KAUTH_RESULT_DEFER;
}
/* $NetBSD: uvm_page.h,v 1.109 2020/12/20 16:38:26 skrll Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_page.h 7.3 (Berkeley) 4/21/91
* from: Id: uvm_page.h,v 1.1.2.6 1998/02/04 02:31:42 chuck Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _UVM_UVM_PAGE_H_
#define _UVM_UVM_PAGE_H_
#ifdef _KERNEL_OPT
#include "opt_uvm_page_trkown.h"
#endif
#include <sys/rwlock.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_pglist.h>
/*
* Management of resident (logical) pages.
*
* Each resident page has a vm_page structure, indexed by page number.
* There are several lists in the structure:
*
* - A red-black tree rooted with the containing object is used to
* quickly perform object+offset lookups.
* - A list of all pages for a given object, so they can be quickly
* deactivated at deallocation time.
* - An ordered list of pages due for pageout.
*
* In addition, the structure contains the object and offset to which
* this page belongs (for pageout) and sundry status bits.
*
* Note that the page structure has no lock of its own. The page is
* generally protected by its owner's lock (UVM object or amap/anon).
* It should be noted that UVM has to serialize pmap(9) operations on
* the managed pages, e.g. for pmap_enter() calls. Hence, the lock
* order is as follows:
*
* [vmpage-owner-lock] ->
* any pmap locks (e.g. PV hash lock)
*
* Since the kernel is always self-consistent, no serialization is
* required for unmanaged mappings, e.g. for pmap_kenter_pa() calls.
*
* Field markings and the corresponding locks:
*
* f: free page queue lock, uvm_fpageqlock
* o: page owner (uvm_object::vmobjlock, vm_amap::am_lock, vm_anon::an_lock)
* i: vm_page::interlock
* => flags set and cleared only with o&i held can
* safely be tested for with only o held.
* o,i: o|i for read, o&i for write (depends on context - if could be loaned)
* => see uvm_loan.c
* w: wired page queue or uvm_pglistalloc:
* => wired page queue: o&i to change, stable from wire to unwire
* XXX What about concurrent or nested wire?
* => uvm_pglistalloc: owned by caller
* ?: locked by pmap or assumed page owner's lock
* p: locked by pagedaemon policy module (pdpolicy)
* c: cpu private
* s: stable, does not change
*
* UVM and pmap(9) may use uvm_page_owner_locked_p() to assert whether the
* page owner's lock is acquired.
*
* A page can have one of four identities:
*
* o free
* => pageq.list is entry on global free page queue
* => uanon is unused (or (void *)0xdeadbeef for DEBUG)
* => uobject is unused (or (void *)0xdeadbeef for DEBUG)
* => PG_FREE is set in flags
* o owned by a uvm_object
* => pageq.queue is entry on wired page queue, if any
* => uanon is NULL or the vm_anon to which it has been O->A loaned
* => uobject is owner
* o owned by a vm_anon
* => pageq is unused (XXX correct?)
* => uanon is owner
* => uobject is NULL
* => PG_ANON is set in flags
* o allocated by uvm_pglistalloc
* => pageq.queue is entry on resulting pglist, owned by caller
* => uanon is unused
* => uobject is unused
*
* The following transitions are allowed:
*
* - uvm_pagealloc: free -> owned by a uvm_object/vm_anon
* - uvm_pagefree: owned by a uvm_object/vm_anon -> free
* - uvm_pglistalloc: free -> allocated by uvm_pglistalloc
* - uvm_pglistfree: allocated by uvm_pglistalloc -> free
*
* On the ordering of fields:
*
* The fields most heavily used during fault processing are clustered
* together at the start of the structure to reduce cache misses.
* XXX This entire thing should be shrunk to fit in one cache line.
*/
struct vm_page {
/* _LP64: first cache line */
union {
TAILQ_ENTRY(vm_page) queue; /* w: wired page queue
* or uvm_pglistalloc output */
LIST_ENTRY(vm_page) list; /* f: global free page queue */
} pageq;
uint32_t pqflags; /* i: pagedaemon flags */
uint32_t flags; /* o: object flags */
paddr_t phys_addr; /* o: physical address of pg */
uint32_t loan_count; /* o,i: num. active loans */
uint32_t wire_count; /* o,i: wired down map refs */
struct vm_anon *uanon; /* o,i: anon */
struct uvm_object *uobject; /* o,i: object */
voff_t offset; /* o: offset into object */
/* _LP64: second cache line */
kmutex_t interlock; /* s: lock on identity */
TAILQ_ENTRY(vm_page) pdqueue; /* p: pagedaemon queue */
#ifdef __HAVE_VM_PAGE_MD
struct vm_page_md mdpage; /* ?: pmap-specific data */
#endif
#if defined(UVM_PAGE_TRKOWN)
/* debugging fields to track page ownership */
pid_t owner; /* proc that set PG_BUSY */
lwpid_t lowner; /* lwp that set PG_BUSY */
const char *owner_tag; /* why it was set busy */
#endif
};
/*
* Overview of UVM page flags, stored in pg->flags.
*
* Locking notes:
*
* PG_, struct vm_page::flags => locked by owner
* PG_AOBJ => additionally locked by vm_page::interlock
* PG_ANON => additionally locked by vm_page::interlock
* PG_FREE => additionally locked by uvm_fpageqlock
* for uvm_pglistalloc()
*
* Flag descriptions:
*
* PG_CLEAN:
* Page is known clean.
* The contents of the page are consistent with its backing store.
*
* PG_DIRTY:
* Page is known dirty.
* To avoid losing data, the contents of the page should be written
* back to the backing store before freeing the page.
*
* PG_BUSY:
* Page is long-term locked, usually because I/O (a transfer from the
* page memory to the backing store) is in progress. An LWP attempting
* to access the page shall set PQ_WANTED and wait. PG_BUSY may only
* be set with a write lock held on the object.
*
* PG_PAGEOUT:
* Indicates that the page is being paged-out in preparation for
* being freed.
*
* PG_RELEASED:
* Indicates that the page, which is currently PG_BUSY, should be freed
* after the long-term lock is released. It is the responsibility of the
* owning LWP (i.e. the one which set PG_BUSY) to do so.
*
* PG_FAKE:
* Page has been allocated, but not yet initialised. The flag is used
* to avoid overwriting valid data, e.g. to prevent a read from the
* backing store when the in-core data is newer.
*
* PG_RDONLY:
* Indicates that the page must be mapped read-only.
*
* PG_MARKER:
* Dummy marker page, generally used for list traversal.
*/
/*
* if you want to renumber PG_CLEAN and PG_DIRTY, check __CTASSERTs in
* uvm_page_status.c first.
*/
#define PG_CLEAN 0x00000001 /* page is known clean */
#define PG_DIRTY 0x00000002 /* page is known dirty */
#define PG_BUSY 0x00000004 /* page is locked */
#define PG_PAGEOUT 0x00000010 /* page to be freed for pagedaemon */
#define PG_RELEASED 0x00000020 /* page to be freed when unbusied */
#define PG_FAKE 0x00000040 /* page is not yet initialized */
#define PG_RDONLY 0x00000080 /* page must be mapped read-only */
#define PG_TABLED 0x00000200 /* page is tabled in object */
#define PG_AOBJ 0x00000400 /* page is part of an anonymous
uvm_object */
#define PG_ANON 0x00000800 /* page is part of an anon, rather
than an uvm_object */
#define PG_FILE 0x00001000 /* file backed (non-anonymous) */
#define PG_READAHEAD 0x00002000 /* read-ahead but not "hit" yet */
#define PG_FREE 0x00004000 /* page is on free list */
#define PG_MARKER 0x00008000 /* dummy marker page */
#define PG_PAGER1 0x00010000 /* pager-specific flag */
#define PG_PGLCA 0x00020000 /* allocated by uvm_pglistalloc_contig */
#define PG_STAT (PG_ANON|PG_AOBJ|PG_FILE)
#define PG_SWAPBACKED (PG_ANON|PG_AOBJ)
#define UVM_PGFLAGBITS \
"\20\1CLEAN\2DIRTY\3BUSY" \
"\5PAGEOUT\6RELEASED\7FAKE\10RDONLY" \
"\11ZERO\12TABLED\13AOBJ\14ANON" \
"\15FILE\16READAHEAD\17FREE\20MARKER" \
"\21PAGER1\22PGLCA"
/*
* Flags stored in pg->pqflags, which is protected by pg->interlock.
*
* PQ_PRIVATE:
* ... is for uvmpdpol to do whatever it wants with.
*
* PQ_INTENT_SET:
* Indicates that the intent set on the page has not yet been realized.
*
* PQ_INTENT_QUEUED:
* Indicates that the page is, or will soon be, on a per-CPU queue for
* the intent to be realized.
*
* PQ_WANTED:
* Indicates that the page, which is currently PG_BUSY, is wanted by
* some other LWP. The page owner (i.e. the LWP which set PG_BUSY) is
* responsible for clearing both flags and waking up any waiters once it has
* released the long-term lock (PG_BUSY).
*/
#define PQ_INTENT_A 0x00000000 /* intend activation */
#define PQ_INTENT_I 0x00000001 /* intend deactivation */
#define PQ_INTENT_E 0x00000002 /* intend enqueue */
#define PQ_INTENT_D 0x00000003 /* intend dequeue */
#define PQ_INTENT_MASK 0x00000003 /* mask of intended state */
#define PQ_INTENT_SET 0x00000004 /* not realized yet */
#define PQ_INTENT_QUEUED 0x00000008 /* queued for processing */
#define PQ_PRIVATE 0x00000ff0 /* private for pdpolicy */
#define PQ_WANTED 0x00001000 /* someone is waiting for page */
#define UVM_PQFLAGBITS \
"\20\1INTENT_0\2INTENT_1\3INTENT_SET\4INTENT_QUEUED" \
"\5PRIVATE1\6PRIVATE2\7PRIVATE3\10PRIVATE4" \
"\11PRIVATE5\12PRIVATE6\13PRIVATE7\14PRIVATE8" \
"\15WANTED"
/*
* physical memory layout structure
*
* MD vmparam.h must #define:
* VM_PHYSSEG_MAX = max number of physical memory segments we support
* (if this is "1" then we revert to a "contig" case)
* VM_PHYSSEG_STRAT: memory sort/search options (for VM_PHYSSEG_MAX > 1)
* - VM_PSTRAT_RANDOM: linear search (random order)
* - VM_PSTRAT_BSEARCH: binary search (sorted by address)
* - VM_PSTRAT_BIGFIRST: linear search (sorted by largest segment first)
* - others?
* XXXCDC: eventually we should purge all left-over global variables...
*/
#define VM_PSTRAT_RANDOM 1
#define VM_PSTRAT_BSEARCH 2
#define VM_PSTRAT_BIGFIRST 3
#ifdef _KERNEL
/*
* prototypes: the following prototypes define the interface to pages
*/
void uvm_page_init(vaddr_t *, vaddr_t *);
void uvm_pglistalloc_init(void);
#if defined(UVM_PAGE_TRKOWN)
void uvm_page_own(struct vm_page *, const char *);
#endif
#if !defined(PMAP_STEAL_MEMORY)
bool uvm_page_physget(paddr_t *);
#endif
void uvm_page_recolor(int);
void uvm_page_rebucket(void);
void uvm_pageactivate(struct vm_page *);
vaddr_t uvm_pageboot_alloc(vsize_t);
void uvm_pagecopy(struct vm_page *, struct vm_page *);
void uvm_pagedeactivate(struct vm_page *);
void uvm_pagedequeue(struct vm_page *);
void uvm_pageenqueue(struct vm_page *);
void uvm_pagefree(struct vm_page *);
void uvm_pagelock(struct vm_page *);
void uvm_pagelock2(struct vm_page *, struct vm_page *);
void uvm_pageunlock(struct vm_page *);
void uvm_pageunlock2(struct vm_page *, struct vm_page *);
void uvm_page_unbusy(struct vm_page **, int);
struct vm_page *uvm_pagelookup(struct uvm_object *, voff_t);
void uvm_pageunwire(struct vm_page *);
void uvm_pagewire(struct vm_page *);
void uvm_pagezero(struct vm_page *);
bool uvm_pageismanaged(paddr_t);
bool uvm_page_owner_locked_p(struct vm_page *, bool);
void uvm_pgfl_lock(void);
void uvm_pgfl_unlock(void);
unsigned int uvm_pagegetdirty(struct vm_page *);
void uvm_pagemarkdirty(struct vm_page *, unsigned int);
bool uvm_pagecheckdirty(struct vm_page *, bool);
bool uvm_pagereadonly_p(struct vm_page *);
bool uvm_page_locked_p(struct vm_page *);
void uvm_pagewakeup(struct vm_page *);
bool uvm_pagewanted_p(struct vm_page *);
void uvm_pagewait(struct vm_page *, krwlock_t *, const char *);
int uvm_page_lookup_freelist(struct vm_page *);
struct vm_page *uvm_phys_to_vm_page(paddr_t);
paddr_t uvm_vm_page_to_phys(const struct vm_page *);
#if defined(PMAP_DIRECT)
extern bool ubc_direct;
int uvm_direct_process(struct vm_page **, u_int, voff_t, vsize_t,
int (*)(void *, size_t, void *), void *);
#endif
/*
* page dirtiness status for uvm_pagegetdirty and uvm_pagemarkdirty
*
* UNKNOWN means that we need to consult pmap to know if the page is
* dirty or not.
* basically, UVM_PAGE_STATUS_CLEAN implies that the page has no writable
* mapping.
*
* if you want to renumber these, check __CTASSERTs in
* uvm_page_status.c first.
*/
#define UVM_PAGE_STATUS_UNKNOWN 0
#define UVM_PAGE_STATUS_CLEAN 1
#define UVM_PAGE_STATUS_DIRTY 2
#define UVM_PAGE_NUM_STATUS 3
/*
* macros
*/
#define VM_PAGE_TO_PHYS(entry) uvm_vm_page_to_phys(entry)
#ifdef __HAVE_VM_PAGE_MD
#define VM_PAGE_TO_MD(pg) (&(pg)->mdpage)
#define VM_MD_TO_PAGE(md) (container_of((md), struct vm_page, mdpage))
#endif
/*
* Compute the page color for a given page.
*/
#define VM_PGCOLOR(pg) \
(atop(VM_PAGE_TO_PHYS((pg))) & uvmexp.colormask)
#define PHYS_TO_VM_PAGE(pa) uvm_phys_to_vm_page(pa)
/*
* VM_PAGE_IS_FREE() can't tell if the page is on the global free list or in
* a per-CPU cache. If you need to be certain, pause caching.
*/
#define VM_PAGE_IS_FREE(entry) ((entry)->flags & PG_FREE)
/*
* Use the lower 10 bits of pg->phys_addr to cache some locators for
* the page. This implies that the smallest possible page size is 1kB, and
* that nobody should use pg->phys_addr directly (use VM_PAGE_TO_PHYS()).
*
* - 5 bits for the freelist index, because uvm_page_lookup_freelist()
* traverses an rbtree and therefore features prominently in traces
* captured during performance tests. It would probably be more useful to
* cache physseg index here because freelist can be inferred from physseg,
* but it requires changes to allocation for UVM_HOTPLUG, so for now we'll
* go with freelist.
*
* - 5 bits for "bucket", a way for us to categorise pages further as
* needed (e.g. NUMA node).
*
* None of this is set in stone; it can be adjusted as needed.
*/
#define UVM_PHYSADDR_FREELIST __BITS(0,4)
#define UVM_PHYSADDR_BUCKET __BITS(5,9)
static inline unsigned
uvm_page_get_freelist(struct vm_page *pg)
{
unsigned fl = __SHIFTOUT(pg->phys_addr, UVM_PHYSADDR_FREELIST);
KASSERT(fl == (unsigned)uvm_page_lookup_freelist(pg));
return fl;
}
static inline unsigned
uvm_page_get_bucket(struct vm_page *pg)
{
return __SHIFTOUT(pg->phys_addr, UVM_PHYSADDR_BUCKET);
}
static inline void
uvm_page_set_freelist(struct vm_page *pg, unsigned fl)
{
KASSERT(fl < 32);
pg->phys_addr &= ~UVM_PHYSADDR_FREELIST;
pg->phys_addr |= __SHIFTIN(fl, UVM_PHYSADDR_FREELIST);
}
static inline void
uvm_page_set_bucket(struct vm_page *pg, unsigned b)
{
KASSERT(b < 32);
pg->phys_addr &= ~UVM_PHYSADDR_BUCKET;
pg->phys_addr |= __SHIFTIN(b, UVM_PHYSADDR_BUCKET);
}
#endif /* _KERNEL */
#endif /* _UVM_UVM_PAGE_H_ */
/* $NetBSD: kern_sig.c,v 1.409 2024/02/10 09:24:18 andvar Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2019, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sig.c 8.14 (Berkeley) 5/14/95
*/
/*
* Signal subsystem.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_sig.c,v 1.409 2024/02/10 09:24:18 andvar Exp $");
#include "opt_execfmt.h"
#include "opt_ptrace.h"
#include "opt_dtrace.h"
#include "opt_compat_sunos.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_netbsd32.h"
#include "opt_pax.h"
#define SIGPROP /* include signal properties table */
#include <sys/param.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/systm.h>
#include <sys/wait.h>
#include <sys/ktrace.h>
#include <sys/syslog.h>
#include <sys/filedesc.h>
#include <sys/file.h>
#include <sys/pool.h>
#include <sys/ucontext.h>
#include <sys/exec.h>
#include <sys/kauth.h>
#include <sys/acct.h>
#include <sys/callout.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/module.h>
#include <sys/sdt.h>
#include <sys/exec_elf.h>
#include <sys/compat_stub.h>
#ifdef PAX_SEGVGUARD
#include <sys/pax.h>
#endif /* PAX_SEGVGUARD */
#include <uvm/uvm_extern.h>
/* Many hard-coded assumptions that there are <= 4 x 32bit signal mask bits */
__CTASSERT(NSIG <= 128);
#define SIGQUEUE_MAX 32
static pool_cache_t sigacts_cache __read_mostly;
static pool_cache_t ksiginfo_cache __read_mostly;
static callout_t proc_stop_ch __cacheline_aligned;
sigset_t contsigmask __cacheline_aligned;
sigset_t stopsigmask __cacheline_aligned;
static sigset_t vforksigmask __cacheline_aligned;
sigset_t sigcantmask __cacheline_aligned;
static void ksiginfo_exechook(struct proc *, void *);
static void proc_stop(struct proc *, int);
static void proc_stop_done(struct proc *, int);
static void proc_stop_callout(void *);
static int sigchecktrace(void);
static int sigpost(struct lwp *, sig_t, int, int);
static int sigput(sigpend_t *, struct proc *, ksiginfo_t *);
static int sigunwait(struct proc *, const ksiginfo_t *);
static void sigswitch(int, int, bool);
static void sigswitch_unlock_and_switch_away(struct lwp *);
static void sigacts_poolpage_free(struct pool *, void *);
static void *sigacts_poolpage_alloc(struct pool *, int);
/*
* DTrace SDT provider definitions
*/
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE3(proc, kernel, , signal__send,
"struct lwp *", /* target thread */
"struct proc *", /* target process */
"int"); /* signal */
SDT_PROBE_DEFINE3(proc, kernel, , signal__discard,
"struct lwp *", /* target thread */
"struct proc *", /* target process */
"int"); /* signal */
SDT_PROBE_DEFINE3(proc, kernel, , signal__handle,
"int", /* signal */
"ksiginfo_t *", /* signal info */
"void (*)(void)"); /* handler address */
static struct pool_allocator sigactspool_allocator = {
.pa_alloc = sigacts_poolpage_alloc,
.pa_free = sigacts_poolpage_free
};
#ifdef DEBUG
int kern_logsigexit = 1;
#else
int kern_logsigexit = 0;
#endif
static const char logcoredump[] =
"pid %d (%s), uid %d: exited on signal %d (core dumped)\n";
static const char lognocoredump[] =
"pid %d (%s), uid %d: exited on signal %d (core not dumped, err = %d)\n";
static kauth_listener_t signal_listener;
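/*
* kauth(9) process scope listener for KAUTH_PROCESS_SIGNAL: allow the
* signal if the credentials match the target process, or if it is SIGCONT
* being sent within the sender's own session.
*/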
static int
signal_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result, signum;
result = KAUTH_RESULT_DEFER;
p = arg0;
signum = (int)(unsigned long)arg1;
if (action != KAUTH_PROCESS_SIGNAL)
return result;
if (kauth_cred_uidmatch(cred, p->p_cred) ||
(signum == SIGCONT && (curproc->p_session == p->p_session)))
result = KAUTH_RESULT_ALLOW;
return result;
}
static int
sigacts_ctor(void *arg __unused, void *obj, int flags __unused)
{
memset(obj, 0, sizeof(struct sigacts));
return 0;
}
/*
* signal_init:
*
* Initialize global signal-related data structures.
*/
void
signal_init(void)
{
sigactspool_allocator.pa_pagesz = (PAGE_SIZE)*2;
sigacts_cache = pool_cache_init(sizeof(struct sigacts), 0, 0, 0,
"sigacts", sizeof(struct sigacts) > PAGE_SIZE ?
&sigactspool_allocator : NULL, IPL_NONE, sigacts_ctor, NULL, NULL);
ksiginfo_cache = pool_cache_init(sizeof(ksiginfo_t), 0, 0, 0,
"ksiginfo", NULL, IPL_VM, NULL, NULL, NULL);
exechook_establish(ksiginfo_exechook, NULL);
callout_init(&proc_stop_ch, CALLOUT_MPSAFE);
callout_setfunc(&proc_stop_ch, proc_stop_callout, NULL);
signal_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
signal_listener_cb, NULL);
}
/*
* sigacts_poolpage_alloc:
*
* Allocate a page for the sigacts memory pool.
*/
static void *
sigacts_poolpage_alloc(struct pool *pp, int flags)
{
return (void *)uvm_km_alloc(kernel_map,
PAGE_SIZE * 2, PAGE_SIZE * 2,
((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
| UVM_KMF_WIRED);
}
/*
* sigacts_poolpage_free:
*
* Free a page on behalf of the sigacts memory pool.
*/
static void
sigacts_poolpage_free(struct pool *pp, void *v)
{
uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * 2, UVM_KMF_WIRED);
}
/*
* sigactsinit:
*
* Create an initial sigacts structure, using the same signal state
* as the specified process. If 'share' is set, share the sigacts by
* holding a reference; otherwise just copy it from the parent.
*/
struct sigacts *
sigactsinit(struct proc *pp, int share)
{
struct sigacts *ps = pp->p_sigacts, *ps2;
if (__predict_false(share)) {
atomic_inc_uint(&ps->sa_refcnt);
return ps;
}
ps2 = pool_cache_get(sigacts_cache, PR_WAITOK);
mutex_init(&ps2->sa_mutex, MUTEX_DEFAULT, IPL_SCHED);
ps2->sa_refcnt = 1;
mutex_enter(&ps->sa_mutex);
memcpy(ps2->sa_sigdesc, ps->sa_sigdesc, sizeof(ps2->sa_sigdesc));
mutex_exit(&ps->sa_mutex);
return ps2;
}
/*
* sigactsunshare:
*
* Make this process not share its sigacts, maintaining all signal state.
*/
void
sigactsunshare(struct proc *p)
{
struct sigacts *ps, *oldps = p->p_sigacts;
if (__predict_true(oldps->sa_refcnt == 1))
return;
ps = pool_cache_get(sigacts_cache, PR_WAITOK);
mutex_init(&ps->sa_mutex, MUTEX_DEFAULT, IPL_SCHED);
memcpy(ps->sa_sigdesc, oldps->sa_sigdesc, sizeof(ps->sa_sigdesc));
ps->sa_refcnt = 1;
p->p_sigacts = ps;
sigactsfree(oldps);
}
/*
* sigactsfree:
*
* Release a sigacts structure.
*/
void
sigactsfree(struct sigacts *ps)
{
membar_release();
if (atomic_dec_uint_nv(&ps->sa_refcnt) == 0) {
membar_acquire();
mutex_destroy(&ps->sa_mutex);
pool_cache_put(sigacts_cache, ps);
}
}
/*
* siginit:
*
* Initialize signal state for process 0; set to ignore signals that
* are ignored by default and disable the signal stack. Locking not
* required as the system is still cold.
*/
void
siginit(struct proc *p)
{
struct lwp *l;
struct sigacts *ps;
int signo, prop;
ps = p->p_sigacts;
sigemptyset(&contsigmask);
sigemptyset(&stopsigmask);
sigemptyset(&vforksigmask);
sigemptyset(&sigcantmask);
for (signo = 1; signo < NSIG; signo++) {
prop = sigprop[signo];
if (prop & SA_CONT)
sigaddset(&contsigmask, signo);
if (prop & SA_STOP)
sigaddset(&stopsigmask, signo);
if (prop & SA_STOP && signo != SIGSTOP)
sigaddset(&vforksigmask, signo);
if (prop & SA_CANTMASK)
sigaddset(&sigcantmask, signo);
if (prop & SA_IGNORE && signo != SIGCONT)
sigaddset(&p->p_sigctx.ps_sigignore, signo);
sigemptyset(&SIGACTION_PS(ps, signo).sa_mask);
SIGACTION_PS(ps, signo).sa_flags = SA_RESTART;
}
sigemptyset(&p->p_sigctx.ps_sigcatch);
p->p_sflag &= ~PS_NOCLDSTOP;
ksiginfo_queue_init(&p->p_sigpend.sp_info);
sigemptyset(&p->p_sigpend.sp_set);
/*
* Reset per LWP state.
*/
l = LIST_FIRST(&p->p_lwps);
l->l_sigwaited = NULL;
l->l_sigstk = SS_INIT;
ksiginfo_queue_init(&l->l_sigpend.sp_info);
sigemptyset(&l->l_sigpend.sp_set);
/* One reference. */
ps->sa_refcnt = 1;
}
/*
* execsigs:
*
* Reset signals for an exec of the specified process.
*/
void
execsigs(struct proc *p)
{
struct sigacts *ps;
struct lwp *l;
int signo, prop;
sigset_t tset;
ksiginfoq_t kq;
KASSERT(p->p_nlwps == 1);
sigactsunshare(p);
ps = p->p_sigacts;
/*
* Reset caught signals. Held signals remain held through
* l->l_sigmask (unless they were caught, and are now ignored
* by default).
*
* No need to lock yet, the process has only one LWP and
* at this point the sigacts are private to the process.
*/
sigemptyset(&tset);
for (signo = 1; signo < NSIG; signo++) {
if (sigismember(&p->p_sigctx.ps_sigcatch, signo)) {
prop = sigprop[signo];
if (prop & SA_IGNORE) {
if ((prop & SA_CONT) == 0)
sigaddset(&p->p_sigctx.ps_sigignore,
signo);
sigaddset(&tset, signo);
}
SIGACTION_PS(ps, signo).sa_handler = SIG_DFL;
}
sigemptyset(&SIGACTION_PS(ps, signo).sa_mask);
SIGACTION_PS(ps, signo).sa_flags = SA_RESTART;
}
ksiginfo_queue_init(&kq);
mutex_enter(p->p_lock);
sigclearall(p, &tset, &kq);
sigemptyset(&p->p_sigctx.ps_sigcatch);
/*
* Reset the "no zombies if child dies" flag, as Solaris does.
*/
p->p_flag &= ~(PK_NOCLDWAIT | PK_CLDSIGIGN);
if (SIGACTION_PS(ps, SIGCHLD).sa_handler == SIG_IGN)
SIGACTION_PS(ps, SIGCHLD).sa_handler = SIG_DFL;
/*
* Reset per-LWP state.
*/
l = LIST_FIRST(&p->p_lwps);
l->l_sigwaited = NULL;
l->l_sigstk = SS_INIT;
ksiginfo_queue_init(&l->l_sigpend.sp_info);
sigemptyset(&l->l_sigpend.sp_set);
mutex_exit(p->p_lock);
ksiginfo_queue_drain(&kq);
}
/*
* ksiginfo_exechook:
*
* Free all pending ksiginfo entries from a process on exec.
* Additionally, drain any unused ksiginfo structures in the
* system back to the pool.
*
* XXX This should not be a hook, every process has signals.
*/
static void
ksiginfo_exechook(struct proc *p, void *v)
{
ksiginfoq_t kq;
ksiginfo_queue_init(&kq);
mutex_enter(p->p_lock);
sigclearall(p, NULL, &kq);
mutex_exit(p->p_lock);
ksiginfo_queue_drain(&kq);
}
/*
* ksiginfo_alloc:
*
* Allocate a new ksiginfo structure from the pool, and optionally copy
* an existing one. If the existing ksiginfo_t is from the pool, and
* has not been queued somewhere, then just return it. Additionally,
* if the existing ksiginfo_t does not contain any information beyond
* the signal number, then just return it.
*/
ksiginfo_t *
ksiginfo_alloc(struct proc *p, ksiginfo_t *ok, int flags)
{
ksiginfo_t *kp;
if (ok != NULL) {
if ((ok->ksi_flags & (KSI_QUEUED | KSI_FROMPOOL)) ==
KSI_FROMPOOL)
return ok;
if (KSI_EMPTY_P(ok))
return ok;
}
kp = pool_cache_get(ksiginfo_cache, flags);
if (kp == NULL) {
#ifdef DIAGNOSTIC
printf("Out of memory allocating ksiginfo for pid %d\n",
p->p_pid);
#endif
return NULL;
}
if (ok != NULL) {
memcpy(kp, ok, sizeof(*kp));
kp->ksi_flags &= ~KSI_QUEUED;
} else
KSI_INIT_EMPTY(kp);
kp->ksi_flags |= KSI_FROMPOOL;
return kp;
}
/*
* ksiginfo_free:
*
* If the given ksiginfo_t is from the pool and has not been queued,
* then free it.
*/
void
ksiginfo_free(ksiginfo_t *kp)
{
if ((kp->ksi_flags & (KSI_QUEUED | KSI_FROMPOOL)) != KSI_FROMPOOL)
return;
pool_cache_put(ksiginfo_cache, kp);
}
/*
* ksiginfo_queue_drain:
*
* Drain a non-empty ksiginfo_t queue.
*/
void
ksiginfo_queue_drain0(ksiginfoq_t *kq)
{
ksiginfo_t *ksi;
KASSERT(!TAILQ_EMPTY(kq));
while (!TAILQ_EMPTY(kq)) {
ksi = TAILQ_FIRST(kq);
TAILQ_REMOVE(kq, ksi, ksi_list);
pool_cache_put(ksiginfo_cache, ksi);
}
}
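/*
* siggetinfo:
*
* Dequeue and copy out the first pending ksiginfo for the given signal,
* returning the number of matching entries found. If none is found, a
* minimal siginfo is manufactured for the caller.
*/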
static int
siggetinfo(sigpend_t *sp, ksiginfo_t *out, int signo)
{
ksiginfo_t *ksi, *nksi;
if (sp == NULL)
goto out;
/* Find siginfo and copy it out. */
int count = 0;
TAILQ_FOREACH_SAFE(ksi, &sp->sp_info, ksi_list, nksi) {
if (ksi->ksi_signo != signo)
continue;
if (count++ > 0) /* Only remove the first, count all of them */
continue;
TAILQ_REMOVE(&sp->sp_info, ksi, ksi_list);
KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0);
KASSERT((ksi->ksi_flags & KSI_QUEUED) != 0);
ksi->ksi_flags &= ~KSI_QUEUED;
if (out != NULL) {
memcpy(out, ksi, sizeof(*out));
out->ksi_flags &= ~(KSI_FROMPOOL | KSI_QUEUED);
}
ksiginfo_free(ksi);
}
if (count)
return count;
out:
/* If there is no siginfo, then manufacture it. */
if (out != NULL) {
KSI_INIT(out);
out->ksi_info._signo = signo;
out->ksi_info._code = SI_NOINFO;
}
return 0;
}
/*
* sigget:
*
* Fetch the first pending signal from a set. Optionally, also fetch
* or manufacture a ksiginfo element. Returns the number of the first
* pending signal, or zero.
*/
int
sigget(sigpend_t *sp, ksiginfo_t *out, int signo, const sigset_t *mask)
{
sigset_t tset;
int count;
/* If there's no pending set, the signal is from the debugger. */
if (sp == NULL)
goto out;
/* Construct mask from signo, and 'mask'. */
if (signo == 0) {
if (mask != NULL) {
tset = *mask;
__sigandset(&sp->sp_set, &tset);
} else
tset = sp->sp_set;
/* If there are no signals pending - return. */
if ((signo = firstsig(&tset)) == 0)
goto out;
} else {
KASSERT(sigismember(&sp->sp_set, signo));
}
sigdelset(&sp->sp_set, signo);
out:
count = siggetinfo(sp, out, signo);
if (count > 1)
sigaddset(&sp->sp_set, signo);
return signo;
}
/*
* sigput:
*
* Append a new ksiginfo element to the list of pending ksiginfo's.
*/
static int
sigput(sigpend_t *sp, struct proc *p, ksiginfo_t *ksi)
{
ksiginfo_t *kp;
KASSERT(mutex_owned(p->p_lock));
KASSERT((ksi->ksi_flags & KSI_QUEUED) == 0);
sigaddset(&sp->sp_set, ksi->ksi_signo);
/*
* If there is no siginfo, we are done.
*/
if (KSI_EMPTY_P(ksi))
return 0;
KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0);
size_t count = 0;
TAILQ_FOREACH(kp, &sp->sp_info, ksi_list) {
count++;
if (ksi->ksi_signo >= SIGRTMIN && ksi->ksi_signo <= SIGRTMAX)
continue;
if (kp->ksi_signo == ksi->ksi_signo) {
KSI_COPY(ksi, kp);
kp->ksi_flags |= KSI_QUEUED;
return 0;
}
}
if (count >= SIGQUEUE_MAX) {
#ifdef DIAGNOSTIC
printf("%s(%d): Signal queue is full signal=%d\n",
p->p_comm, p->p_pid, ksi->ksi_signo);
#endif
return EAGAIN;
}
ksi->ksi_flags |= KSI_QUEUED;
TAILQ_INSERT_TAIL(&sp->sp_info, ksi, ksi_list);
return 0;
}
/*
* sigclear:
*
* Clear all pending signals in the specified set.
*/
void
sigclear(sigpend_t *sp, const sigset_t *mask, ksiginfoq_t *kq)
{
ksiginfo_t *ksi, *next;
if (mask == NULL)
sigemptyset(&sp->sp_set);
else
sigminusset(mask, &sp->sp_set);
TAILQ_FOREACH_SAFE(ksi, &sp->sp_info, ksi_list, next) {
if (mask == NULL || sigismember(mask, ksi->ksi_signo)) {
TAILQ_REMOVE(&sp->sp_info, ksi, ksi_list);
KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0);
KASSERT((ksi->ksi_flags & KSI_QUEUED) != 0);
TAILQ_INSERT_TAIL(kq, ksi, ksi_list);
}
}
}
/*
* sigclearall:
*
* Clear all pending signals in the specified set from a process and
* its LWPs.
*/
void
sigclearall(struct proc *p, const sigset_t *mask, ksiginfoq_t *kq)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
sigclear(&p->p_sigpend, mask, kq);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
sigclear(&l->l_sigpend, mask, kq);
}
}
/*
* sigispending:
*
* Return the first signal number if there are pending signals for the
* current LWP. May be called unlocked provided that LW_PENDSIG is set,
* and that the signal has been posted to the appropriate queue before
* LW_PENDSIG is set.
*
* This should only ever be called with (l == curlwp), unless the
* result does not matter (procfs, sysctl).
*/
int
sigispending(struct lwp *l, int signo)
{
struct proc *p = l->l_proc;
sigset_t tset;
membar_consumer();
tset = l->l_sigpend.sp_set;
sigplusset(&p->p_sigpend.sp_set, &tset);
sigminusset(&p->p_sigctx.ps_sigignore, &tset);
sigminusset(&l->l_sigmask, &tset);
if (signo == 0) {
return firstsig(&tset);
}
return sigismember(&tset, signo) ? signo : 0;
}
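/*
* getucontext:
*
* Fill in a ucontext_t with the LWP's current signal mask, stack
* information and machine context. Called with p_lock held; the lock
* is dropped around cpu_getmcontext().
*/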
void
getucontext(struct lwp *l, ucontext_t *ucp)
{
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
ucp->uc_flags = 0;
ucp->uc_link = l->l_ctxlink;
ucp->uc_sigmask = l->l_sigmask;
ucp->uc_flags |= _UC_SIGMASK;
/*
* The (unsupplied) definition of the `current execution stack'
* in the System V Interface Definition appears to allow returning
* the main context stack.
*/
if ((l->l_sigstk.ss_flags & SS_ONSTACK) == 0) {
ucp->uc_stack.ss_sp = (void *)l->l_proc->p_stackbase;
ucp->uc_stack.ss_size = ctob(l->l_proc->p_vmspace->vm_ssize);
ucp->uc_stack.ss_flags = 0; /* XXX, def. is Very Fishy */
} else {
/* Simply copy alternate signal execution stack. */
ucp->uc_stack = l->l_sigstk;
}
ucp->uc_flags |= _UC_STACK;
mutex_exit(p->p_lock);
cpu_getmcontext(l, &ucp->uc_mcontext, &ucp->uc_flags);
mutex_enter(p->p_lock);
}
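/*
* setucontext:
*
* Install the supplied ucontext_t: restore the signal mask and machine
* context, update the context link and the alternate signal stack state.
* Called with p_lock held; the lock is dropped around cpu_setmcontext().
*/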
int
setucontext(struct lwp *l, const ucontext_t *ucp)
{
struct proc *p = l->l_proc;
int error;
KASSERT(mutex_owned(p->p_lock));
if ((ucp->uc_flags & _UC_SIGMASK) != 0) {
error = sigprocmask1(l, SIG_SETMASK, &ucp->uc_sigmask, NULL);
if (error != 0)
return error;
}
mutex_exit(p->p_lock);
error = cpu_setmcontext(l, &ucp->uc_mcontext, ucp->uc_flags);
mutex_enter(p->p_lock);
if (error != 0)
return (error);
l->l_ctxlink = ucp->uc_link;
/*
* If there was stack information, update whether or not we are
* still running on an alternate signal stack.
*/
if ((ucp->uc_flags & _UC_STACK) != 0) {
if (ucp->uc_stack.ss_flags & SS_ONSTACK)
l->l_sigstk.ss_flags |= SS_ONSTACK;
else
l->l_sigstk.ss_flags &= ~SS_ONSTACK;
}
return 0;
}
/*
* killpg1: common code for kill process group/broadcast kill.
*/
int
killpg1(struct lwp *l, ksiginfo_t *ksi, int pgid, int all)
{
struct proc *p, *cp;
kauth_cred_t pc;
struct pgrp *pgrp;
int nfound;
int signo = ksi->ksi_signo;
cp = l->l_proc;
pc = l->l_cred;
nfound = 0;
mutex_enter(&proc_lock);
if (all) {
/*
* Broadcast.
*/
PROCLIST_FOREACH(p, &allproc) {
if (p->p_pid <= 1 || p == cp ||
(p->p_flag & PK_SYSTEM) != 0)
continue;
mutex_enter(p->p_lock);
if (kauth_authorize_process(pc,
KAUTH_PROCESS_SIGNAL, p, KAUTH_ARG(signo), NULL,
NULL) == 0) {
nfound++;
if (signo)
kpsignal2(p, ksi);
}
mutex_exit(p->p_lock);
}
} else {
if (pgid == 0)
/* Zero pgid means send to my process group. */
pgrp = cp->p_pgrp;
else {
pgrp = pgrp_find(pgid);
if (pgrp == NULL)
goto out;
}
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
if (p->p_pid <= 1 || p->p_flag & PK_SYSTEM)
continue;
mutex_enter(p->p_lock);
if (kauth_authorize_process(pc, KAUTH_PROCESS_SIGNAL,
p, KAUTH_ARG(signo), NULL, NULL) == 0) {
nfound++;
if (signo && P_ZOMBIE(p) == 0)
kpsignal2(p, ksi);
}
mutex_exit(p->p_lock);
}
}
out:
mutex_exit(&proc_lock);
return nfound ? 0 : ESRCH;
}
/*
* Send a signal to a process group. If checkctty is set, limit to members
* which have a controlling terminal.
*/
void
pgsignal(struct pgrp *pgrp, int sig, int checkctty)
{
ksiginfo_t ksi;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
KSI_INIT_EMPTY(&ksi);
ksi.ksi_signo = sig;
kpgsignal(pgrp, &ksi, NULL, checkctty);
}
void
kpgsignal(struct pgrp *pgrp, ksiginfo_t *ksi, void *data, int checkctty)
{
struct proc *p;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
KASSERT(pgrp != NULL);
LIST_FOREACH(p, &pgrp->pg_members, p_pglist)
if (checkctty == 0 || p->p_lflag & PL_CONTROLT)
kpsignal(p, ksi, data);
}
/*
* Send a signal caused by a trap to the current LWP. If it will be caught
* immediately, deliver it with correct code. Otherwise, post it normally.
*/
void
trapsignal(struct lwp *l, ksiginfo_t *ksi)
{
struct proc *p;
struct sigacts *ps;
int signo = ksi->ksi_signo;
sigset_t *mask;
sig_t action;
KASSERT(KSI_TRAP_P(ksi));
ksi->ksi_lid = l->l_lid;
p = l->l_proc;
KASSERT(!cpu_intr_p());
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
repeat:
/*
* If we are exiting, demise now.
*
* This avoids notifying tracer and deadlocking.
*/
if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
lwp_exit(l);
panic("trapsignal");
/* NOTREACHED */
}
/*
* The process is already stopping.
*/
if ((p->p_sflag & PS_STOPPING) != 0) {
mutex_exit(&proc_lock);
sigswitch_unlock_and_switch_away(l);
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
goto repeat;
}
mask = &l->l_sigmask;
ps = p->p_sigacts;
action = SIGACTION_PS(ps, signo).sa_handler;
if (ISSET(p->p_slflag, PSL_TRACED) &&
!(p->p_pptr == p->p_opptr && ISSET(p->p_lflag, PL_PPWAIT)) &&
p->p_xsig != SIGKILL &&
!sigismember(&p->p_sigpend.sp_set, SIGKILL)) {
p->p_xsig = signo;
p->p_sigctx.ps_faked = true;
p->p_sigctx.ps_lwp = ksi->ksi_lid;
p->p_sigctx.ps_info = ksi->ksi_info;
sigswitch(0, signo, true);
if (ktrpoint(KTR_PSIG)) {
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action, mask, ksi);
else
ktrpsig(signo, action, mask, ksi);
}
return;
}
const bool caught = sigismember(&p->p_sigctx.ps_sigcatch, signo);
const bool masked = sigismember(mask, signo);
if (caught && !masked) {
mutex_exit(&proc_lock);
l->l_ru.ru_nsignals++;
kpsendsig(l, ksi, mask);
mutex_exit(p->p_lock);
if (ktrpoint(KTR_PSIG)) {
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action, mask, ksi);
else
ktrpsig(signo, action, mask, ksi);
}
return;
}
/*
* If the signal is masked or ignored, then unmask it and
* reset it to the default action so that the process or
* its tracer will be notified.
*/
const bool ignored = action == SIG_IGN;
if (masked || ignored) {
mutex_enter(&ps->sa_mutex);
sigdelset(mask, signo);
sigdelset(&p->p_sigctx.ps_sigcatch, signo);
sigdelset(&p->p_sigctx.ps_sigignore, signo);
sigdelset(&SIGACTION_PS(ps, signo).sa_mask, signo);
SIGACTION_PS(ps, signo).sa_handler = SIG_DFL;
mutex_exit(&ps->sa_mutex);
}
kpsignal2(p, ksi);
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
}
/*
* Fill in signal information and signal the parent for a child status change.
*/
void
child_psignal(struct proc *p, int mask)
{
ksiginfo_t ksi;
struct proc *q;
int xsig;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
xsig = p->p_xsig;
KSI_INIT(&ksi);
ksi.ksi_signo = SIGCHLD;
ksi.ksi_code = (xsig == SIGCONT ? CLD_CONTINUED : CLD_STOPPED);
ksi.ksi_pid = p->p_pid;
ksi.ksi_uid = kauth_cred_geteuid(p->p_cred);
ksi.ksi_status = xsig;
ksi.ksi_utime = p->p_stats->p_ru.ru_utime.tv_sec;
ksi.ksi_stime = p->p_stats->p_ru.ru_stime.tv_sec;
q = p->p_pptr;
mutex_exit(p->p_lock);
mutex_enter(q->p_lock);
if ((q->p_sflag & mask) == 0)
kpsignal2(q, &ksi);
mutex_exit(q->p_lock);
mutex_enter(p->p_lock);
}
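/*
* psignal:
*
* Send a signal to a process, with empty siginfo.
*/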
void
psignal(struct proc *p, int signo)
{
ksiginfo_t ksi;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
KSI_INIT_EMPTY(&ksi);
ksi.ksi_signo = signo;
mutex_enter(p->p_lock);
kpsignal2(p, &ksi);
mutex_exit(p->p_lock);
}
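/*
* kpsignal:
*
* Send a signal to a process. If 'data' is supplied, record the file
* descriptor referencing it (e.g. a socket) in ksi_fd before delivery.
*/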
void
kpsignal(struct proc *p, ksiginfo_t *ksi, void *data)
{
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
if ((p->p_sflag & PS_WEXIT) == 0 && data) {
size_t fd;
filedesc_t *fdp = p->p_fd;
/* XXXSMP locking */
ksi->ksi_fd = -1;
dt = atomic_load_consume(&fdp->fd_dt);
for (fd = 0; fd < dt->dt_nfiles; fd++) {
if ((ff = dt->dt_ff[fd]) == NULL)
continue;
if ((fp = atomic_load_consume(&ff->ff_file)) == NULL)
continue;
if (fp->f_data == data) {
ksi->ksi_fd = fd;
break;
}
}
}
mutex_enter(p->p_lock);
kpsignal2(p, ksi);
mutex_exit(p->p_lock);
}
/*
* sigismasked:
*
* Returns true if signal is ignored or masked for the specified LWP.
*/
int
sigismasked(struct lwp *l, int sig)
{
struct proc *p = l->l_proc;
return sigismember(&p->p_sigctx.ps_sigignore, sig) ||
sigismember(&l->l_sigmask, sig);
}
/*
* sigpost:
*
* Post a pending signal to an LWP. Returns non-zero if the LWP may
* be able to take the signal.
*/
static int
sigpost(struct lwp *l, sig_t action, int prop, int sig)
{
int rv, masked;
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
/*
* If the LWP is on the way out, sigclear() will be busy draining all
* pending signals. Don't give it more.
*/
if (l->l_stat == LSZOMB)
return 0;
SDT_PROBE(proc, kernel, , signal__send, l, p, sig, 0, 0);
lwp_lock(l);
if (__predict_false((l->l_flag & LW_DBGSUSPEND) != 0)) {
if ((prop & SA_KILL) != 0)
l->l_flag &= ~LW_DBGSUSPEND;
else {
lwp_unlock(l);
return 0;
}
}
/*
* Have the LWP check for signals. This ensures that even if no LWP
* is found to take the signal immediately, it should be taken soon.
*/
signotify(l);
/*
* SIGCONT can be masked, but if LWP is stopped, it needs restart.
* Note: SIGKILL and SIGSTOP cannot be masked.
*/
masked = sigismember(&l->l_sigmask, sig);
if (masked && ((prop & SA_CONT) == 0 || l->l_stat != LSSTOP)) {
lwp_unlock(l);
return 0;
}
/*
* If killing the process, make it run fast.
*/
if (__predict_false((prop & SA_KILL) != 0) &&
action == SIG_DFL && l->l_priority < MAXPRI_USER) {
KASSERT(l->l_class == SCHED_OTHER);
lwp_changepri(l, MAXPRI_USER);
}
/*
* If the LWP is running or on a run queue, then we win. If it's
* sleeping interruptibly, wake it and make it take the signal. If
* the sleep isn't interruptible, then the chances are it will get
* to see the signal soon anyhow. If suspended, it can't take the
* signal right now. If it's LWP private or for all LWPs, save it
* for later; otherwise punt.
*/
rv = 0;
switch (l->l_stat) {
case LSRUN:
case LSONPROC:
rv = 1;
break;
case LSSLEEP:
if ((l->l_flag & LW_SINTR) != 0) {
/* setrunnable() will release the lock. */
setrunnable(l);
return 1;
}
break;
case LSSUSPENDED:
if ((prop & SA_KILL) != 0 && (l->l_flag & LW_WCORE) != 0) {
/* lwp_continue() will release the lock. */
lwp_continue(l);
return 1;
}
break;
case LSSTOP:
if ((prop & SA_STOP) != 0)
break;
/*
* If the LWP is stopped and we are sending a continue
* signal, then start it again.
*/
if ((prop & SA_CONT) != 0) {
if (l->l_wchan != NULL) {
l->l_stat = LSSLEEP;
p->p_nrlwps++;
rv = 1;
break;
}
/* setrunnable() will release the lock. */
setrunnable(l);
return 1;
} else if (l->l_wchan == NULL || (l->l_flag & LW_SINTR) != 0) {
/* setrunnable() will release the lock. */
setrunnable(l);
return 1;
}
break;
default:
break;
}
lwp_unlock(l);
return rv;
}
/*
* Notify an LWP that it has a pending signal.
*/
void
signotify(struct lwp *l)
{
KASSERT(lwp_locked(l, NULL));
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
}
/*
* Find an LWP within process p that is waiting on signal ksi, and hand
* it on.
*/
static int
sigunwait(struct proc *p, const ksiginfo_t *ksi)
{
struct lwp *l;
int signo;
KASSERT(mutex_owned(p->p_lock));
signo = ksi->ksi_signo;
if (ksi->ksi_lid != 0) {
/*
* Signal came via _lwp_kill(). Find the LWP and see if
* it's interested.
*/
if ((l = lwp_find(p, ksi->ksi_lid)) == NULL)
return 0;
if (l->l_sigwaited == NULL ||
!sigismember(&l->l_sigwaitset, signo))
return 0;
} else {
/*
* Look for any LWP that may be interested.
*/
LIST_FOREACH(l, &p->p_sigwaiters, l_sigwaiter) {
KASSERT(l->l_sigwaited != NULL);
if (sigismember(&l->l_sigwaitset, signo))
break;
}
}
if (l != NULL) {
l->l_sigwaited->ksi_info = ksi->ksi_info;
l->l_sigwaited = NULL;
LIST_REMOVE(l, l_sigwaiter);
cv_signal(&l->l_sigcv);
return 1;
}
return 0;
}
/*
* Send the signal to the process. If the signal has an action, the action
* is usually performed by the target process rather than the caller; we add
* the signal to the set of pending signals for the process.
*
* Exceptions:
* o When a stop signal is sent to a sleeping process that takes the
* default action, the process is stopped without awakening it.
* o SIGCONT restarts stopped processes (or puts them back to sleep)
* regardless of the signal action (eg, blocked or ignored).
*
* Other ignored signals are discarded immediately.
*/
int
kpsignal2(struct proc *p, ksiginfo_t *ksi)
{
int prop, signo = ksi->ksi_signo;
struct lwp *l = NULL;
ksiginfo_t *kp;
lwpid_t lid;
sig_t action;
bool toall;
bool traced;
int error = 0;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
KASSERT((ksi->ksi_flags & KSI_QUEUED) == 0);
KASSERT(signo > 0);
KASSERT(signo < NSIG);
/*
* If the process is being created by fork, is a zombie or is
* exiting, then just drop the signal here and bail out.
*/
if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
return 0;
/*
* Notify any interested parties of the signal.
*/
KNOTE(&p->p_klist, NOTE_SIGNAL | signo);
/*
* Some signals including SIGKILL must act on the entire process.
*/
kp = NULL;
prop = sigprop[signo];
toall = ((prop & SA_TOALL) != 0);
lid = toall ? 0 : ksi->ksi_lid;
traced = ISSET(p->p_slflag, PSL_TRACED) &&
!sigismember(&p->p_sigctx.ps_sigpass, signo);
/*
* If proc is traced, always give parent a chance.
*/
if (traced) {
action = SIG_DFL;
if (lid == 0) {
/*
* If the process is being traced and the signal
* is being caught, make sure to save any ksiginfo.
*/
if ((kp = ksiginfo_alloc(p, ksi, PR_NOWAIT)) == NULL)
goto discard;
if ((error = sigput(&p->p_sigpend, p, kp)) != 0)
goto out;
}
} else {
/*
* If the signal is being ignored, then drop it. Note: we
* don't set SIGCONT in ps_sigignore, and if it is set to
* SIG_IGN, action will be SIG_DFL here.
*/
if (sigismember(&p->p_sigctx.ps_sigignore, signo))
goto discard;
else if (sigismember(&p->p_sigctx.ps_sigcatch, signo))
action = SIG_CATCH;
else {
action = SIG_DFL;
/*
* If sending a tty stop signal to a member of an
* orphaned process group, discard the signal here if
* the action is default; don't stop the process below
* if sleeping, and don't clear any pending SIGCONT.
*/
if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0)
goto discard;
if (prop & SA_KILL && p->p_nice > NZERO)
p->p_nice = NZERO;
}
}
/*
* If stopping or continuing a process, discard any pending
* signals that would do the inverse.
*/
if ((prop & (SA_CONT | SA_STOP)) != 0) {
ksiginfoq_t kq;
ksiginfo_queue_init(&kq);
if ((prop & SA_CONT) != 0)
sigclear(&p->p_sigpend, &stopsigmask, &kq);
if ((prop & SA_STOP) != 0)
sigclear(&p->p_sigpend, &contsigmask, &kq);
ksiginfo_queue_drain(&kq); /* XXXSMP */
}
/*
* If the signal doesn't have SA_CANTMASK (no override for SIGKILL,
* please!), check if any LWPs are waiting on it. If yes, pass on
* the signal info. The signal won't be processed further here.
*/
if ((prop & SA_CANTMASK) == 0 && !LIST_EMPTY(&p->p_sigwaiters) &&
p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0 &&
sigunwait(p, ksi))
goto discard;
/*
* XXXSMP Should be allocated by the caller, we're holding locks
* here.
*/
if (kp == NULL && (kp = ksiginfo_alloc(p, ksi, PR_NOWAIT)) == NULL)
goto discard;
/*
* LWP private signals are easy - just find the LWP and post
* the signal to it.
*/
if (lid != 0) {
l = lwp_find(p, lid);
if (l != NULL) {
if ((error = sigput(&l->l_sigpend, p, kp)) != 0)
goto out;
membar_producer();
if (sigpost(l, action, prop, kp->ksi_signo) != 0)
signo = -1;
}
goto out;
}
/*
* Some signals go to all LWPs, even if posted with _lwp_kill()
* or for an SA process.
*/
if (p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0) {
if (traced)
goto deliver;
/*
* If SIGCONT is default (or ignored) and process is
* asleep, we are finished; the process should not
* be awakened.
*/
if ((prop & SA_CONT) != 0 && action == SIG_DFL)
goto out;
} else {
/*
* Process is stopped or stopping.
* - If traced, then no action is needed, unless killing.
* - Run the process only if sending SIGCONT or SIGKILL.
*/
if (traced && signo != SIGKILL) {
goto out;
}
if ((prop & SA_CONT) != 0 || signo == SIGKILL) {
/*
* Re-adjust p_nstopchild if the process was
* stopped but not yet collected by its parent.
*/
if (p->p_stat == SSTOP && !p->p_waited)
p->p_pptr->p_nstopchild--;
p->p_stat = SACTIVE;
p->p_sflag &= ~PS_STOPPING;
if (traced) {
KASSERT(signo == SIGKILL);
goto deliver;
}
/*
* Do not make signal pending if SIGCONT is default.
*
* If the process catches SIGCONT, let it handle the
* signal itself (if waiting on event - process runs,
* otherwise continues sleeping).
*/
if ((prop & SA_CONT) != 0) {
p->p_xsig = SIGCONT;
p->p_sflag |= PS_CONTINUED;
child_psignal(p, 0);
if (action == SIG_DFL) {
KASSERT(signo != SIGKILL);
goto deliver;
}
}
} else if ((prop & SA_STOP) != 0) {
/*
* Already stopped, don't need to stop again.
* (If we did the shell could get confused.)
*/
goto out;
}
}
/*
* Make signal pending.
*/
KASSERT(!traced);
if ((error = sigput(&p->p_sigpend, p, kp)) != 0)
goto out;
deliver:
/*
* Before we set LW_PENDSIG on any LWP, ensure that the signal is
* visible on the per process list (for sigispending()). This
* is unlikely to be needed in practice, but...
*/
membar_producer();
/*
* Try to find an LWP that can take the signal.
*/
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (sigpost(l, action, prop, kp->ksi_signo) && !toall)
break;
}
signo = -1;
out:
/*
* If the ksiginfo wasn't used, then bin it. XXXSMP freeing memory
* with locks held. The caller should take care of this.
*/
ksiginfo_free(kp);
if (signo == -1)
return error;
discard:
SDT_PROBE(proc, kernel, , signal__discard, l, p, signo, 0, 0);
return error;
}
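/*
 * Deliver a signal to an LWP via the emulation specific sendsig method.
 * Called with p_lock held.
 */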
void
kpsendsig(struct lwp *l, const ksiginfo_t *ksi, const sigset_t *mask)
{
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
(*p->p_emul->e_sendsig)(ksi, mask);
}
/*
 * Stop any LWPs sleeping interruptibly.
*/
static void
proc_stop_lwps(struct proc *p)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
KASSERT((p->p_sflag & PS_STOPPING) != 0);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
lwp_lock(l);
if (l->l_stat == LSSLEEP && (l->l_flag & LW_SINTR) != 0) {
l->l_stat = LSSTOP;
p->p_nrlwps--;
}
lwp_unlock(l);
}
}
/*
* Finish stopping of a process. Mark it stopped and notify the parent.
*
 * Note that child_psignal() drops p_lock briefly.
*/
static void
proc_stop_done(struct proc *p, int ppmask)
{
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
KASSERT((p->p_sflag & PS_STOPPING) != 0);
KASSERT(p->p_nrlwps == 0 || p->p_nrlwps == 1);
KASSERT(p->p_nrlwps == 0 || p == curproc);
p->p_sflag &= ~PS_STOPPING;
p->p_stat = SSTOP;
p->p_waited = 0;
p->p_pptr->p_nstopchild++;
/* child_psignal drops p_lock briefly. */
child_psignal(p, ppmask);
cv_broadcast(&p->p_pptr->p_waitcv);
}
/*
* Stop the current process and switch away to the debugger notifying
* an event specific to a traced process only.
*/
void
eventswitch(int code, int pe_report_event, int entity)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
struct sigacts *ps;
sigset_t *mask;
sig_t action;
ksiginfo_t ksi;
const int signo = SIGTRAP;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
KASSERT(p->p_pptr != initproc);
KASSERT(l->l_stat == LSONPROC);
KASSERT(ISSET(p->p_slflag, PSL_TRACED));
KASSERT(!ISSET(l->l_flag, LW_SYSTEM));
KASSERT(p->p_nrlwps > 0);
KASSERT((code == TRAP_CHLD) || (code == TRAP_LWP) ||
(code == TRAP_EXEC));
KASSERT((code != TRAP_CHLD) || (entity > 1)); /* prevent pid1 */
KASSERT((code != TRAP_LWP) || (entity > 0));
repeat:
/*
* If we are exiting, demise now.
*
 * This avoids notifying the tracer and deadlocking.
*/
if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
if (pe_report_event == PTRACE_LWP_EXIT) {
/* Avoid double lwp_exit() and panic. */
return;
}
lwp_exit(l);
panic("eventswitch");
/* NOTREACHED */
}
/*
* If we are no longer traced, abandon this event signal.
*
* This avoids killing a process after detaching the debugger.
*/
if (__predict_false(!ISSET(p->p_slflag, PSL_TRACED))) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
return;
}
/*
 * If there's a pending SIGKILL, process it immediately.
*/
if (p->p_xsig == SIGKILL ||
sigismember(&p->p_sigpend.sp_set, SIGKILL)) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
return;
}
/*
* The process is already stopping.
*/
if ((p->p_sflag & PS_STOPPING) != 0) {
mutex_exit(&proc_lock);
sigswitch_unlock_and_switch_away(l);
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
goto repeat;
}
KSI_INIT_TRAP(&ksi);
ksi.ksi_lid = l->l_lid;
ksi.ksi_signo = signo;
ksi.ksi_code = code;
ksi.ksi_pe_report_event = pe_report_event;
CTASSERT(sizeof(ksi.ksi_pe_other_pid) == sizeof(ksi.ksi_pe_lwp));
ksi.ksi_pe_other_pid = entity;
/* Needed for ktrace */
ps = p->p_sigacts;
action = SIGACTION_PS(ps, signo).sa_handler;
mask = &l->l_sigmask;
p->p_xsig = signo;
p->p_sigctx.ps_faked = true;
p->p_sigctx.ps_lwp = ksi.ksi_lid;
p->p_sigctx.ps_info = ksi.ksi_info;
sigswitch(0, signo, true);
if (code == TRAP_CHLD) {
mutex_enter(&proc_lock);
while (l->l_vforkwaiting)
cv_wait(&l->l_waitcv, &proc_lock);
mutex_exit(&proc_lock);
}
if (ktrpoint(KTR_PSIG)) {
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action, mask, &ksi);
else
ktrpsig(signo, action, mask, &ksi);
}
}
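/*
 * As eventswitch(), but used to report TRAP_CHLD events on behalf of a
 * traced child: the event is dropped unless the process has both
 * PSL_TRACED and PSL_TRACEDCHILD set, and the original parent
 * (p_oppid) is passed as the entity.
 */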
void
eventswitchchild(struct proc *p, int code, int pe_report_event)
{
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) !=
(PSL_TRACED|PSL_TRACEDCHILD)) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
return;
}
eventswitch(code, pe_report_event, p->p_oppid);
}
/*
* Stop the current process and switch away when being stopped or traced.
*/
static void
sigswitch(int ppmask, int signo, bool proc_lock_held)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
KASSERT(l->l_stat == LSONPROC);
KASSERT(p->p_nrlwps > 0);
if (proc_lock_held) {
KASSERT(mutex_owned(&proc_lock));
} else {
KASSERT(!mutex_owned(&proc_lock));
}
/*
* On entry we know that the process needs to stop. If it's
* the result of a 'sideways' stop signal that has been sourced
* through issignal(), then stop other LWPs in the process too.
*/
if (p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0) {
KASSERT(signo != 0);
proc_stop(p, signo);
KASSERT(p->p_nrlwps > 0);
}
/*
* If we are the last live LWP, and the stop was a result of
* a new signal, then signal the parent.
*/
if ((p->p_sflag & PS_STOPPING) != 0) {
if (!proc_lock_held && !mutex_tryenter(&proc_lock)) {
mutex_exit(p->p_lock);
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
}
if (p->p_nrlwps == 1 && (p->p_sflag & PS_STOPPING) != 0) {
/*
* Note that proc_stop_done() can drop
* p->p_lock briefly.
*/
proc_stop_done(p, ppmask);
}
mutex_exit(&proc_lock);
}
sigswitch_unlock_and_switch_away(l);
}
/*
* Unlock and switch away.
*/
static void
sigswitch_unlock_and_switch_away(struct lwp *l)
{
struct proc *p;
p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
KASSERT(!mutex_owned(&proc_lock));
KASSERT(l->l_stat == LSONPROC);
KASSERT(p->p_nrlwps > 0);
KASSERT(l->l_blcnt == 0);
if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
p->p_nrlwps--;
lwp_lock(l);
KASSERT(l->l_stat == LSONPROC || l->l_stat == LSSLEEP);
l->l_stat = LSSTOP;
lwp_unlock(l);
}
mutex_exit(p->p_lock);
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
}
/*
* Check for a signal from the debugger.
*/
static int
sigchecktrace(void)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
int signo;
KASSERT(mutex_owned(p->p_lock));
/* If there's a pending SIGKILL, process it immediately. */
if (sigismember(&p->p_sigpend.sp_set, SIGKILL))
return 0;
/*
* If we are no longer being traced, or the parent didn't
* give us a signal, or we're stopping, look for more signals.
*/
if ((p->p_slflag & PSL_TRACED) == 0 || p->p_xsig == 0 ||
(p->p_sflag & PS_STOPPING) != 0)
return 0;
/*
* If the new signal is being masked, look for other signals.
* `p->p_sigctx.ps_siglist |= mask' is done in setrunnable().
*/
signo = p->p_xsig;
p->p_xsig = 0;
if (sigismember(&l->l_sigmask, signo)) {
signo = 0;
}
return signo;
}
/*
 * If the current process has received a signal (one that should be caught
 * or should cause termination, and that should interrupt the current
 * syscall), return the signal number.
*
* Stop signals with default action are processed immediately, then cleared;
* they aren't returned. This is checked after each entry to the system for
* a syscall or trap.
*
* We will also return -1 if the process is exiting and the current LWP must
* follow suit.
*/
int
issignal(struct lwp *l)
{
struct proc *p;
int siglwp, signo, prop;
sigpend_t *sp;
sigset_t ss;
bool traced;
p = l->l_proc;
sp = NULL;
signo = 0;
KASSERT(p == curproc);
KASSERT(mutex_owned(p->p_lock));
for (;;) {
/* Discard any signals that we have decided not to take. */
if (signo != 0) {
(void)sigget(sp, NULL, signo, NULL);
}
/*
* If the process is stopped/stopping, then stop ourselves
* now that we're on the kernel/userspace boundary. When
* we awaken, check for a signal from the debugger.
*/
if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
sigswitch_unlock_and_switch_away(l);
mutex_enter(p->p_lock);
continue;
} else if (p->p_stat == SACTIVE)
signo = sigchecktrace();
else
signo = 0;
/* Signals from the debugger are "out of band". */
sp = NULL;
/*
* If the debugger didn't provide a signal, find a pending
* signal from our set. Check per-LWP signals first, and
* then per-process.
*/
if (signo == 0) {
sp = &l->l_sigpend;
ss = sp->sp_set;
siglwp = l->l_lid;
if ((p->p_lflag & PL_PPWAIT) != 0)
sigminusset(&vforksigmask, &ss);
sigminusset(&l->l_sigmask, &ss);
if ((signo = firstsig(&ss)) == 0) {
sp = &p->p_sigpend;
ss = sp->sp_set;
siglwp = 0;
if ((p->p_lflag & PL_PPWAIT) != 0)
sigminusset(&vforksigmask, &ss);
sigminusset(&l->l_sigmask, &ss);
if ((signo = firstsig(&ss)) == 0) {
/*
* No signal pending - clear the
* indicator and bail out.
*/
lwp_lock(l);
l->l_flag &= ~LW_PENDSIG;
lwp_unlock(l);
sp = NULL;
break;
}
}
}
traced = ISSET(p->p_slflag, PSL_TRACED) &&
!sigismember(&p->p_sigctx.ps_sigpass, signo);
if (sp) {
			/*
			 * Overwrite the process' signal context to correspond
			 * to the currently reported LWP.  This is necessary
			 * for PT_GET_SIGINFO to report the correct signal when
			 * multiple LWPs have pending signals.  We do this only
			 * when the signal comes from the queue; for signals
			 * created by the debugger we assume it set the correct
			 * siginfo.
			 */
ksiginfo_t *ksi = TAILQ_FIRST(&sp->sp_info);
if (ksi) {
p->p_sigctx.ps_lwp = ksi->ksi_lid;
p->p_sigctx.ps_info = ksi->ksi_info;
} else {
p->p_sigctx.ps_lwp = siglwp;
memset(&p->p_sigctx.ps_info, 0,
sizeof(p->p_sigctx.ps_info));
p->p_sigctx.ps_info._signo = signo;
p->p_sigctx.ps_info._code = SI_NOINFO;
}
}
/*
* We should see pending but ignored signals only if
* we are being traced.
*/
if (sigismember(&p->p_sigctx.ps_sigignore, signo) &&
!traced) {
/* Discard the signal. */
continue;
}
/*
* If traced, always stop, and stay stopped until released
 * by the debugger.  If our parent is our debugger waiting
* for us and we vforked, don't hang as we could deadlock.
*/
if (traced && signo != SIGKILL &&
!(ISSET(p->p_lflag, PL_PPWAIT) &&
(p->p_pptr == p->p_opptr))) {
/*
* Take the signal, but don't remove it from the
* siginfo queue, because the debugger can send
* it later.
*/
if (sp)
sigdelset(&sp->sp_set, signo);
p->p_xsig = signo;
/* Handling of signal trace */
sigswitch(0, signo, false);
mutex_enter(p->p_lock);
/* Check for a signal from the debugger. */
if ((signo = sigchecktrace()) == 0)
continue;
/* Signals from the debugger are "out of band". */
sp = NULL;
}
prop = sigprop[signo];
/*
* Decide whether the signal should be returned.
*/
switch ((long)SIGACTION(p, signo).sa_handler) {
case (long)SIG_DFL:
/*
* Don't take default actions on system processes.
*/
if (p->p_pid <= 1) {
#ifdef DIAGNOSTIC
/*
* Are you sure you want to ignore SIGSEGV
* in init? XXX
*/
printf_nolog("Process (pid %d) got sig %d\n",
p->p_pid, signo);
#endif
continue;
}
/*
* If there is a pending stop signal to process with
* default action, stop here, then clear the signal.
 * However, if the process is a member of an orphaned
* process group, ignore tty stop signals.
*/
if (prop & SA_STOP) {
/*
* XXX Don't hold proc_lock for p_lflag,
* but it's not a big deal.
*/
if ((traced &&
!(ISSET(p->p_lflag, PL_PPWAIT) &&
(p->p_pptr == p->p_opptr))) ||
((p->p_lflag & PL_ORPHANPG) != 0 &&
prop & SA_TTYSTOP)) {
/* Ignore the signal. */
continue;
}
/* Take the signal. */
(void)sigget(sp, NULL, signo, NULL);
p->p_xsig = signo;
p->p_sflag &= ~PS_CONTINUED;
signo = 0;
sigswitch(PS_NOCLDSTOP, p->p_xsig, false);
mutex_enter(p->p_lock);
} else if (prop & SA_IGNORE) {
/*
* Except for SIGCONT, shouldn't get here.
* Default action is to ignore; drop it.
*/
continue;
}
break;
case (long)SIG_IGN:
#ifdef DEBUG_ISSIGNAL
/*
 * Masking above should prevent us from ever trying
 * to take action on an ignored signal other
 * than SIGCONT, unless the process is traced.
*/
if ((prop & SA_CONT) == 0 && !traced)
printf_nolog("issignal\n");
#endif
continue;
default:
/*
* This signal has an action, let postsig() process
* it.
*/
break;
}
break;
}
l->l_sigpendset = sp;
return signo;
}
/*
* Take the action for the specified signal
* from the current set of pending signals.
*/
void
postsig(int signo)
{
struct lwp *l;
struct proc *p;
struct sigacts *ps;
sig_t action;
sigset_t *returnmask;
ksiginfo_t ksi;
l = curlwp;
p = l->l_proc;
ps = p->p_sigacts;
KASSERT(mutex_owned(p->p_lock));
KASSERT(signo > 0);
/*
* Set the new mask value and also defer further occurrences of this
* signal.
*
* Special case: user has done a sigsuspend. Here the current mask is
* not of interest, but rather the mask from before the sigsuspend is
* what we want restored after the signal processing is completed.
*/
if (l->l_sigrestore) {
returnmask = &l->l_sigoldmask;
l->l_sigrestore = 0;
} else
returnmask = &l->l_sigmask;
/*
* Commit to taking the signal before releasing the mutex.
*/
action = SIGACTION_PS(ps, signo).sa_handler;
l->l_ru.ru_nsignals++;
if (l->l_sigpendset == NULL) {
/* From the debugger */
if (p->p_sigctx.ps_faked &&
signo == p->p_sigctx.ps_info._signo) {
KSI_INIT(&ksi);
ksi.ksi_info = p->p_sigctx.ps_info;
ksi.ksi_lid = p->p_sigctx.ps_lwp;
p->p_sigctx.ps_faked = false;
} else {
if (!siggetinfo(&l->l_sigpend, &ksi, signo))
(void)siggetinfo(&p->p_sigpend, &ksi, signo);
}
} else
sigget(l->l_sigpendset, &ksi, signo, NULL);
if (ktrpoint(KTR_PSIG)) {
mutex_exit(p->p_lock);
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action,
returnmask, &ksi);
else
ktrpsig(signo, action, returnmask, &ksi);
mutex_enter(p->p_lock);
}
SDT_PROBE(proc, kernel, , signal__handle, signo, &ksi, action, 0, 0);
if (action == SIG_DFL) {
/*
* Default action, where the default is to kill
* the process. (Other cases were ignored above.)
*/
sigexit(l, signo);
return;
}
/*
* If we get here, the signal must be caught.
*/
#ifdef DIAGNOSTIC
if (action == SIG_IGN || sigismember(&l->l_sigmask, signo))
panic("postsig action");
#endif
kpsendsig(l, &ksi, returnmask);
}
/*
* sendsig:
*
* Default signal delivery method for NetBSD.
*/
void
sendsig(const struct ksiginfo *ksi, const sigset_t *mask)
{
struct sigacts *sa;
int sig;
sig = ksi->ksi_signo;
sa = curproc->p_sigacts;
switch (sa->sa_sigdesc[sig].sd_vers) {
case __SIGTRAMP_SIGCODE_VERSION:
#ifdef __HAVE_STRUCT_SIGCONTEXT
case __SIGTRAMP_SIGCONTEXT_VERSION_MIN ...
__SIGTRAMP_SIGCONTEXT_VERSION_MAX:
/* Compat for 1.6 and earlier. */
MODULE_HOOK_CALL_VOID(sendsig_sigcontext_16_hook, (ksi, mask),
break);
return;
#endif /* __HAVE_STRUCT_SIGCONTEXT */
case __SIGTRAMP_SIGINFO_VERSION_MIN ...
__SIGTRAMP_SIGINFO_VERSION_MAX:
sendsig_siginfo(ksi, mask);
return;
default:
break;
}
printf("sendsig: bad version %d\n", sa->sa_sigdesc[sig].sd_vers);
sigexit(curlwp, SIGILL);
}
/*
* sendsig_reset:
*
* Reset the signal action. Called from emulation specific sendsig()
* before unlocking to deliver the signal.
*/
void
sendsig_reset(struct lwp *l, int signo)
{
struct proc *p = l->l_proc;
struct sigacts *ps = p->p_sigacts;
KASSERT(mutex_owned(p->p_lock));
p->p_sigctx.ps_lwp = 0;
memset(&p->p_sigctx.ps_info, 0, sizeof(p->p_sigctx.ps_info));
mutex_enter(&ps->sa_mutex);
sigplusset(&SIGACTION_PS(ps, signo).sa_mask, &l->l_sigmask);
if (SIGACTION_PS(ps, signo).sa_flags & SA_RESETHAND) {
sigdelset(&p->p_sigctx.ps_sigcatch, signo);
if (signo != SIGCONT && sigprop[signo] & SA_IGNORE)
sigaddset(&p->p_sigctx.ps_sigignore, signo);
SIGACTION_PS(ps, signo).sa_handler = SIG_DFL;
}
mutex_exit(&ps->sa_mutex);
}
/*
 * Kill process p for the stated reason.
*/
void
killproc(struct proc *p, const char *why)
{
KASSERT(mutex_owned(&proc_lock));
log(LOG_ERR, "pid %d was killed: %s\n", p->p_pid, why);
uprintf_locked("sorry, pid %d was killed: %s\n", p->p_pid, why);
psignal(p, SIGKILL);
}
/*
* Force the current process to exit with the specified signal, dumping core
* if appropriate. We bypass the normal tests for masked and caught
* signals, allowing unrecoverable failures to terminate the process without
* changing signal state. Mark the accounting record with the signal
* termination. If dumping core, save the signal number for the debugger.
* Calls exit and does not return.
*/
void
sigexit(struct lwp *l, int signo)
{
int exitsig, error, docore;
struct proc *p;
struct lwp *t;
p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
KASSERT(l->l_blcnt == 0);
/*
* Don't permit coredump() multiple times in the same process.
* Call back into sigexit, where we will be suspended until
* the deed is done. Note that this is a recursive call, but
* LW_WCORE will prevent us from coming back this way.
*/
if ((p->p_sflag & PS_WCORE) != 0) {
lwp_lock(l);
l->l_flag |= (LW_WCORE | LW_WEXIT | LW_WSUSPEND);
lwp_need_userret(l);
lwp_unlock(l);
mutex_exit(p->p_lock);
lwp_userret(l);
panic("sigexit 1");
/* NOTREACHED */
}
/* If process is already on the way out, then bail now. */
if ((p->p_sflag & PS_WEXIT) != 0) {
mutex_exit(p->p_lock);
lwp_exit(l);
panic("sigexit 2");
/* NOTREACHED */
}
/*
* Prepare all other LWPs for exit. If dumping core, suspend them
* so that their registers are available long enough to be dumped.
*/
if ((docore = (sigprop[signo] & SA_CORE)) != 0) {
p->p_sflag |= PS_WCORE;
for (;;) {
LIST_FOREACH(t, &p->p_lwps, l_sibling) {
lwp_lock(t);
if (t == l) {
t->l_flag &=
~(LW_WSUSPEND | LW_DBGSUSPEND);
lwp_unlock(t);
continue;
}
t->l_flag |= (LW_WCORE | LW_WEXIT);
lwp_need_userret(t);
lwp_suspend(l, t);
}
if (p->p_nrlwps == 1)
break;
/*
* Kick any LWPs sitting in lwp_wait1(), and wait
* for everyone else to stop before proceeding.
*/
p->p_nlwpwait++;
cv_broadcast(&p->p_lwpcv);
cv_wait(&p->p_lwpcv, p->p_lock);
p->p_nlwpwait--;
}
}
exitsig = signo;
p->p_acflag |= AXSIG;
memset(&p->p_sigctx.ps_info, 0, sizeof(p->p_sigctx.ps_info));
p->p_sigctx.ps_info._signo = signo;
p->p_sigctx.ps_info._code = SI_NOINFO;
if (docore) {
mutex_exit(p->p_lock);
MODULE_HOOK_CALL(coredump_hook, (l, NULL), enosys(), error);
if (kern_logsigexit) {
int uid = l->l_cred ?
(int)kauth_cred_geteuid(l->l_cred) : -1;
if (error)
log(LOG_INFO, lognocoredump, p->p_pid,
p->p_comm, uid, signo, error);
else
log(LOG_INFO, logcoredump, p->p_pid,
p->p_comm, uid, signo);
}
#ifdef PAX_SEGVGUARD
rw_enter(&exec_lock, RW_WRITER);
pax_segvguard(l, p->p_textvp, p->p_comm, true);
rw_exit(&exec_lock);
#endif /* PAX_SEGVGUARD */
/* Acquire the sched state mutex. exit1() will release it. */
mutex_enter(p->p_lock);
if (error == 0)
p->p_sflag |= PS_COREDUMP;
}
/* No longer dumping core. */
p->p_sflag &= ~PS_WCORE;
exit1(l, 0, exitsig);
/* NOTREACHED */
}
/*
 * Since the "real" code may (or may not) be present in a loadable module,
 * we provide routines here which call the module hooks.
*/
int
coredump_netbsd(struct lwp *l, struct coredump_iostate *iocookie)
{
int retval;
MODULE_HOOK_CALL(coredump_netbsd_hook, (l, iocookie), ENOSYS, retval);
return retval;
}
int
coredump_netbsd32(struct lwp *l, struct coredump_iostate *iocookie)
{
int retval;
MODULE_HOOK_CALL(coredump_netbsd32_hook, (l, iocookie), ENOSYS, retval);
return retval;
}
int
coredump_elf32(struct lwp *l, struct coredump_iostate *iocookie)
{
int retval;
MODULE_HOOK_CALL(coredump_elf32_hook, (l, iocookie), ENOSYS, retval);
return retval;
}
int
coredump_elf64(struct lwp *l, struct coredump_iostate *iocookie)
{
int retval;
MODULE_HOOK_CALL(coredump_elf64_hook, (l, iocookie), ENOSYS, retval);
return retval;
}
/*
 * Put process 'p' into the stopped state and optionally notify the parent.
*/
void
proc_stop(struct proc *p, int signo)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
/*
* First off, set the stopping indicator and bring all sleeping
* LWPs to a halt so they are included in p->p_nrlwps. We mustn't
* unlock between here and the p->p_nrlwps check below.
*/
p->p_sflag |= PS_STOPPING;
membar_producer();
proc_stop_lwps(p);
/*
* If there are no LWPs available to take the signal, then we
* signal the parent process immediately. Otherwise, the last
* LWP to stop will take care of it.
*/
if (p->p_nrlwps == 0) {
proc_stop_done(p, PS_NOCLDSTOP);
} else {
/*
* Have the remaining LWPs come to a halt, and trigger
* proc_stop_callout() to ensure that they do.
*/
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
sigpost(l, SIG_DFL, SA_STOP, signo);
}
callout_schedule(&proc_stop_ch, 1);
}
}
/*
* When stopping a process, we do not immediately set sleeping LWPs stopped,
* but wait for them to come to a halt at the kernel-user boundary. This is
* to allow LWPs to release any locks that they may hold before stopping.
*
 * Non-interruptible sleeps can be long, and there is the potential for an
 * LWP to begin sleeping interruptibly soon after the process has been set
* stopping (PS_STOPPING). These LWPs will not notice that the process is
* stopping, and so complete halt of the process and the return of status
* information to the parent could be delayed indefinitely.
*
* To handle this race, proc_stop_callout() runs once per tick while there
* are stopping processes in the system. It sets LWPs that are sleeping
 * interruptibly into the LSSTOP state.
*
* Note that we are not concerned about keeping all LWPs stopped while the
* process is stopped: stopped LWPs can awaken briefly to handle signals.
* What we do need to ensure is that all LWPs in a stopping process have
* stopped at least once, so that notification can be sent to the parent
* process.
*/
static void
proc_stop_callout(void *cookie)
{
bool more, restart;
struct proc *p;
(void)cookie;
do {
restart = false;
more = false;
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
mutex_enter(p->p_lock);
if ((p->p_sflag & PS_STOPPING) == 0) {
mutex_exit(p->p_lock);
continue;
}
			/* Stop any LWPs sleeping interruptibly. */
proc_stop_lwps(p);
if (p->p_nrlwps == 0) {
/*
* We brought the process to a halt.
* Mark it as stopped and notify the
* parent.
*
* Note that proc_stop_done() will
* drop p->p_lock briefly.
* Arrange to restart and check
* all processes again.
*/
restart = true;
proc_stop_done(p, PS_NOCLDSTOP);
} else
more = true;
mutex_exit(p->p_lock);
if (restart)
break;
}
mutex_exit(&proc_lock);
} while (restart);
/*
* If we noted processes that are stopping but still have
* running LWPs, then arrange to check again in 1 tick.
*/
if (more)
callout_schedule(&proc_stop_ch, 1);
}
/*
* Given a process in state SSTOP, set the state back to SACTIVE and
* move LSSTOP'd LWPs to LSSLEEP or make them runnable.
*/
void
proc_unstop(struct proc *p)
{
struct lwp *l;
int sig;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
p->p_stat = SACTIVE;
p->p_sflag &= ~PS_STOPPING;
sig = p->p_xsig;
if (!p->p_waited)
p->p_pptr->p_nstopchild--;
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
lwp_lock(l);
if (l->l_stat != LSSTOP || (l->l_flag & LW_DBGSUSPEND) != 0) {
lwp_unlock(l);
continue;
}
if (l->l_wchan == NULL) {
setrunnable(l);
continue;
}
if (sig && (l->l_flag & LW_SINTR) != 0) {
setrunnable(l);
sig = 0;
} else {
l->l_stat = LSSLEEP;
p->p_nrlwps++;
lwp_unlock(l);
}
}
}
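/*
 * Stop the current LWP and report a syscall entry/exit event
 * (TRAP_SCE/TRAP_SCX) to the debugger with SIGTRAP.
 */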
void
proc_stoptrace(int trapno, int sysnum, const register_t args[],
const register_t *ret, int error)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
struct sigacts *ps;
sigset_t *mask;
sig_t action;
ksiginfo_t ksi;
size_t i, sy_narg;
const int signo = SIGTRAP;
KASSERT((trapno == TRAP_SCE) || (trapno == TRAP_SCX));
KASSERT(p->p_pptr != initproc);
KASSERT(ISSET(p->p_slflag, PSL_TRACED));
KASSERT(ISSET(p->p_slflag, PSL_SYSCALL));
sy_narg = p->p_emul->e_sysent[sysnum].sy_narg;
KSI_INIT_TRAP(&ksi);
ksi.ksi_lid = l->l_lid;
ksi.ksi_signo = signo;
ksi.ksi_code = trapno;
ksi.ksi_sysnum = sysnum;
if (trapno == TRAP_SCE) {
ksi.ksi_retval[0] = 0;
ksi.ksi_retval[1] = 0;
ksi.ksi_error = 0;
} else {
ksi.ksi_retval[0] = ret[0];
ksi.ksi_retval[1] = ret[1];
ksi.ksi_error = error;
}
memset(ksi.ksi_args, 0, sizeof(ksi.ksi_args));
for (i = 0; i < sy_narg; i++)
ksi.ksi_args[i] = args[i];
mutex_enter(p->p_lock);
repeat:
/*
* If we are exiting, demise now.
*
	 * This avoids notifying the tracer and deadlocking.
*/
if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
mutex_exit(p->p_lock);
lwp_exit(l);
panic("proc_stoptrace");
/* NOTREACHED */
}
/*
	 * If there's a pending SIGKILL, process it immediately.
*/
if (p->p_xsig == SIGKILL ||
sigismember(&p->p_sigpend.sp_set, SIGKILL)) {
mutex_exit(p->p_lock);
return;
}
/*
* If we are no longer traced, abandon this event signal.
*
* This avoids killing a process after detaching the debugger.
*/
if (__predict_false(!ISSET(p->p_slflag, PSL_TRACED))) {
mutex_exit(p->p_lock);
return;
}
/*
* The process is already stopping.
*/
if ((p->p_sflag & PS_STOPPING) != 0) {
sigswitch_unlock_and_switch_away(l);
mutex_enter(p->p_lock);
goto repeat;
}
/* Needed for ktrace */
ps = p->p_sigacts;
action = SIGACTION_PS(ps, signo).sa_handler;
mask = &l->l_sigmask;
p->p_xsig = signo;
p->p_sigctx.ps_lwp = ksi.ksi_lid;
p->p_sigctx.ps_info = ksi.ksi_info;
sigswitch(0, signo, false);
if (ktrpoint(KTR_PSIG)) {
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action, mask, &ksi);
else
ktrpsig(signo, action, mask, &ksi);
}
}
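/*
 * Attach a signal knote to the current process' klist.
 */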
static int
filt_sigattach(struct knote *kn)
{
struct proc *p = curproc;
kn->kn_obj = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
mutex_enter(p->p_lock);
klist_insert(&p->p_klist, kn);
mutex_exit(p->p_lock);
return 0;
}
static void
filt_sigdetach(struct knote *kn)
{
struct proc *p = kn->kn_obj;
mutex_enter(p->p_lock);
klist_remove(&p->p_klist, kn);
mutex_exit(p->p_lock);
}
/*
* Signal knotes are shared with proc knotes, so we apply a mask to
* the hint in order to differentiate them from process hints. This
* could be avoided by using a signal-specific knote list, but probably
* isn't worth the trouble.
*/
static int
filt_signal(struct knote *kn, long hint)
{
if (hint & NOTE_SIGNAL) {
hint &= ~NOTE_SIGNAL;
if (kn->kn_id == hint)
kn->kn_data++;
}
return (kn->kn_data != 0);
}
const struct filterops sig_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = filt_sigattach,
.f_detach = filt_sigdetach,
.f_event = filt_signal,
};
/* $NetBSD: uvm_physseg.c,v 1.20 2024/01/13 09:44:42 tnn Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_page.h 7.3 (Berkeley) 4/21/91
* from: Id: uvm_page.h,v 1.1.2.6 1998/02/04 02:31:42 chuck Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* Consolidated API from uvm_page.c and others.
* Consolidated and designed by Cherry G. Mathew <cherry@zyx.in>
* rbtree(3) backing implementation by:
* Santhosh N. Raju <santhosh.raju@gmail.com>
*/
#ifdef _KERNEL_OPT
#include "opt_uvm.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/extent.h>
#include <sys/kmem.h>
#include <uvm/uvm.h>
#include <uvm/uvm_page.h>
#include <uvm/uvm_param.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_physseg.h>
/*
* uvm_physseg: describes one segment of physical memory
*/
struct uvm_physseg {
/* used during RB tree lookup for PHYS_TO_VM_PAGE(). */
#if defined(UVM_HOTPLUG)
struct rb_node rb_node; /* tree information */
#endif
paddr_t start; /* PF# of first page in segment */
paddr_t end; /* (PF# of last page in segment) + 1 */
struct vm_page *pgs; /* vm_page structures (from start) */
/* less performance sensitive fields. */
paddr_t avail_start; /* PF# of first free page in segment */
paddr_t avail_end; /* (PF# of last free page in segment) +1 */
struct extent *ext; /* extent(9) structure to manage pgs[] */
int free_list; /* which free list they belong on */
u_long start_hint; /* start looking for free pages here */
#ifdef __HAVE_PMAP_PHYSSEG
struct pmap_physseg pmseg; /* pmap specific (MD) data */
#endif
};
/*
* These functions are reserved for uvm(9) internal use and are not
* exported in the header file uvm_physseg.h
*
* Thus they are redefined here.
*/
void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
/* returns a pgs array */
struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
#if defined(UVM_HOTPLUG) /* rbtree implementation */
#define HANDLE_TO_PHYSSEG_NODE(h) ((struct uvm_physseg *)(h))
#define PHYSSEG_NODE_TO_HANDLE(u) ((uvm_physseg_t)(u))
struct uvm_physseg_graph {
struct rb_tree rb_tree; /* Tree for entries */
int nentries; /* Number of entries */
} __aligned(COHERENCY_UNIT);
static struct uvm_physseg_graph uvm_physseg_graph __read_mostly;
/*
* Note on kmem(9) allocator usage:
* We take the conservative approach that plug/unplug are allowed to
* fail in high memory stress situations.
*
* We want to avoid re-entrant situations in which one plug/unplug
* operation is waiting on a previous one to complete, since this
* makes the design more complicated than necessary.
*
* We may review this and change its behaviour, once the use cases
* become more obvious.
*/
/*
* Special alloc()/free() functions for boot time support:
 * We assume that alloc() at boot time is only for new 'vm_physseg's.
* This allows us to use a static array for memory allocation at boot
* time. Thus we avoid using kmem(9) which is not ready at this point
* in boot.
*
* After kmem(9) is ready, we use it. We currently discard any free()s
* to this static array, since the size is small enough to be a
* trivial waste on all architectures we run on.
*/
static size_t nseg = 0;
static struct uvm_physseg uvm_physseg[VM_PHYSSEG_MAX];
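/*
 * Allocate backing store for uvm_physseg entries: from the static
 * boot-time array before uvm_page_init(), from kmem(9) afterwards.
 */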
static void *
uvm_physseg_alloc(size_t sz)
{
/*
* During boot time, we only support allocating vm_physseg
* entries from the static array.
* We need to assert for this.
*/
if (__predict_false(uvm.page_init_done == false)) {
if (sz % sizeof(struct uvm_physseg))
panic("%s: tried to alloc size other than multiple"
" of struct uvm_physseg at boot\n", __func__);
size_t n = sz / sizeof(struct uvm_physseg);
nseg += n;
KASSERT(nseg > 0);
KASSERT(nseg <= VM_PHYSSEG_MAX);
return &uvm_physseg[nseg - n];
}
return kmem_zalloc(sz, KM_NOSLEEP);
}
static void
uvm_physseg_free(void *p, size_t sz)
{
/*
	 * This is a bit tricky.  We do allow simulation of free()
	 * during boot (e.g. when MD code is "steal"ing memory and the
	 * segment has been exhausted, and thus needs to be free()-ed).
	 * free() also complicates things because we leak the
	 * free()-ed entries; calling code therefore can't assume that
	 * free()-ed memory is available for alloc() again at boot time.
	 *
	 * Thus we can't explicitly disallow free()s during
	 * boot time.  However, the same restriction as for alloc()
	 * applies to free(): we only allow uvm_physseg related free()s
	 * via this function during boot time.
*/
if (__predict_false(uvm.page_init_done == false)) {
if (sz % sizeof(struct uvm_physseg))
panic("%s: tried to free size other than struct uvm_physseg"
" at boot\n", __func__);
}
/*
* Could have been in a single if(){} block - split for
* clarity
*/
if ((struct uvm_physseg *)p >= uvm_physseg &&
(struct uvm_physseg *)p < (uvm_physseg + VM_PHYSSEG_MAX)) {
if (sz % sizeof(struct uvm_physseg))
panic("%s: tried to free() other than struct uvm_physseg"
" from static array\n", __func__);
if ((sz / sizeof(struct uvm_physseg)) >= VM_PHYSSEG_MAX)
panic("%s: tried to free() the entire static array!", __func__);
return; /* Nothing to free */
}
kmem_free(p, sz);
}
/* XXX: Multi page size */
bool
uvm_physseg_plug(paddr_t pfn, size_t pages, uvm_physseg_t *psp)
{
int preload;
size_t slabpages;
struct uvm_physseg *ps, *current_ps = NULL;
struct vm_page *slab = NULL, *pgs = NULL;
#ifdef DEBUG
paddr_t off;
uvm_physseg_t upm;
upm = uvm_physseg_find(pfn, &off);
ps = HANDLE_TO_PHYSSEG_NODE(upm);
if (ps != NULL) /* XXX; do we allow "update" plugs ? */
return false;
#endif
/*
* do we have room?
*/
ps = uvm_physseg_alloc(sizeof (struct uvm_physseg));
if (ps == NULL) {
printf("uvm_page_physload: unable to load physical memory "
"segment\n");
printf("\t%d segments allocated, ignoring 0x%"PRIxPADDR" -> 0x%"PRIxPADDR"\n",
VM_PHYSSEG_MAX, pfn, pfn + pages + 1);
printf("\tincrease VM_PHYSSEG_MAX\n");
return false;
}
/* span init */
ps->start = pfn;
ps->end = pfn + pages;
/*
* XXX: Ugly hack because uvmexp.npages accounts for only
* those pages in the segment included below as well - this
* should be legacy and removed.
*/
ps->avail_start = ps->start;
ps->avail_end = ps->end;
/*
* check to see if this is a "preload" (i.e. uvm_page_init hasn't been
* called yet, so kmem is not available).
*/
preload = 1; /* We are going to assume it is a preload */
RB_TREE_FOREACH(current_ps, &(uvm_physseg_graph.rb_tree)) {
		/* If there are non-NULL pages then we are not in a preload */
if (current_ps->pgs != NULL) {
preload = 0;
/* Try to scavenge from earlier unplug()s. */
pgs = uvm_physseg_seg_alloc_from_slab(current_ps, pages);
if (pgs != NULL) {
break;
}
}
}
/*
* if VM is already running, attempt to kmem_alloc vm_page structures
*/
if (!preload) {
if (pgs == NULL) { /* Brand new */
/* Iteratively try alloc down from uvmexp.npages */
for (slabpages = (size_t) uvmexp.npages; slabpages >= pages; slabpages--) {
slab = kmem_zalloc(sizeof *pgs * (long unsigned int)slabpages, KM_NOSLEEP);
if (slab != NULL)
break;
}
if (slab == NULL) {
uvm_physseg_free(ps, sizeof(struct uvm_physseg));
return false;
}
uvm_physseg_seg_chomp_slab(ps, slab, (size_t) slabpages);
/* We allocate enough for this plug */
pgs = uvm_physseg_seg_alloc_from_slab(ps, pages);
if (pgs == NULL) {
printf("unable to uvm_physseg_seg_alloc_from_slab() from backend\n");
return false;
}
} else {
/* Reuse scavenged extent */
ps->ext = current_ps->ext;
}
physmem += pages;
uvmpdpol_reinit();
} else { /* Boot time - see uvm_page.c:uvm_page_init() */
pgs = NULL;
ps->pgs = pgs;
}
/*
* now insert us in the proper place in uvm_physseg_graph.rb_tree
*/
current_ps = rb_tree_insert_node(&(uvm_physseg_graph.rb_tree), ps);
if (current_ps != ps) {
panic("uvm_page_physload: Duplicate address range detected!");
}
uvm_physseg_graph.nentries++;
/*
* uvm_pagefree() requires the PHYS_TO_VM_PAGE(pgs[i]) on the
* newly allocated pgs[] to return the correct value. This is
* a bit of a chicken and egg problem, since it needs
* uvm_physseg_find() to succeed. For this, the node needs to
* be inserted *before* uvm_physseg_init_seg() happens.
*
* During boot, this happens anyway, since
* uvm_physseg_init_seg() is called later on and separately
* from uvm_page.c:uvm_page_init().
* In the case of hotplug we need to ensure this.
*/
if (__predict_true(!preload))
uvm_physseg_init_seg(ps, pgs);
if (psp != NULL)
*psp = ps;
return true;
}
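/*
 * rb_tree(3) comparison callbacks: segments are ordered by start
 * address, and key lookups match the segment containing a given
 * page frame.
 */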
static int
uvm_physseg_compare_nodes(void *ctx, const void *nnode1, const void *nnode2)
{
const struct uvm_physseg *enode1 = nnode1;
const struct uvm_physseg *enode2 = nnode2;
KASSERT(enode1->start < enode2->start || enode1->start >= enode2->end);
KASSERT(enode2->start < enode1->start || enode2->start >= enode1->end);
if (enode1->start < enode2->start)
return -1;
if (enode1->start >= enode2->end)
return 1;
return 0;
}
static int
uvm_physseg_compare_key(void *ctx, const void *nnode, const void *pkey)
{
const struct uvm_physseg *enode = nnode;
const paddr_t pa = *(const paddr_t *) pkey;
	if (enode->start <= pa && pa < enode->end)
return 0;
if (enode->start < pa)
return -1;
if (enode->end > pa)
return 1;
return 0;
}
static const rb_tree_ops_t uvm_physseg_tree_ops = {
.rbto_compare_nodes = uvm_physseg_compare_nodes,
.rbto_compare_key = uvm_physseg_compare_key,
.rbto_node_offset = offsetof(struct uvm_physseg, rb_node),
.rbto_context = NULL
};
/*
* uvm_physseg_init: init the physmem
*
* => physmem unit should not be in use at this point
*/
void
uvm_physseg_init(void)
{
rb_tree_init(&(uvm_physseg_graph.rb_tree), &uvm_physseg_tree_ops);
uvm_physseg_graph.nentries = 0;
}
uvm_physseg_t
uvm_physseg_get_next(uvm_physseg_t upm)
{
/* next of invalid is invalid, not fatal */
if (uvm_physseg_valid_p(upm) == false)
return UVM_PHYSSEG_TYPE_INVALID;
return (uvm_physseg_t) rb_tree_iterate(&(uvm_physseg_graph.rb_tree), upm,
RB_DIR_RIGHT);
}
uvm_physseg_t
uvm_physseg_get_prev(uvm_physseg_t upm)
{
/* prev of invalid is invalid, not fatal */
if (uvm_physseg_valid_p(upm) == false)
return UVM_PHYSSEG_TYPE_INVALID;
return (uvm_physseg_t) rb_tree_iterate(&(uvm_physseg_graph.rb_tree), upm,
RB_DIR_LEFT);
}
uvm_physseg_t
uvm_physseg_get_last(void)
{
return (uvm_physseg_t) RB_TREE_MAX(&(uvm_physseg_graph.rb_tree));
}
uvm_physseg_t
uvm_physseg_get_first(void)
{
return (uvm_physseg_t) RB_TREE_MIN(&(uvm_physseg_graph.rb_tree));
}
paddr_t
uvm_physseg_get_highest_frame(void)
{
struct uvm_physseg *ps =
(uvm_physseg_t) RB_TREE_MAX(&(uvm_physseg_graph.rb_tree));
return ps->end - 1;
}
/*
* uvm_page_physunload: unload physical memory and return it to
* caller.
*/
bool
uvm_page_physunload(uvm_physseg_t upm, int freelist, paddr_t *paddrp)
{
struct uvm_physseg *seg;
if (__predict_true(uvm.page_init_done == true))
panic("%s: unload attempted after uvm_page_init()\n", __func__);
seg = HANDLE_TO_PHYSSEG_NODE(upm);
if (seg->free_list != freelist) {
return false;
}
/*
* During cold boot, what we're about to unplug hasn't been
* put on the uvm freelist, nor has uvmexp.npages been
* updated. (This happens in uvm_page.c:uvm_page_init())
*
* For hotplug, we assume here that the pages being unloaded
* here are completely out of sight of uvm (ie; not on any uvm
* lists), and that uvmexp.npages has been suitably
* decremented before we're called.
*
* XXX: will avail_end == start if avail_start < avail_end?
*/
/* try from front */
if (seg->avail_start == seg->start &&
seg->avail_start < seg->avail_end) {
*paddrp = ctob(seg->avail_start);
return uvm_physseg_unplug(seg->avail_start, 1);
}
/* try from rear */
if (seg->avail_end == seg->end &&
seg->avail_start < seg->avail_end) {
*paddrp = ctob(seg->avail_end - 1);
return uvm_physseg_unplug(seg->avail_end - 1, 1);
}
return false;
}
bool
uvm_page_physunload_force(uvm_physseg_t upm, int freelist, paddr_t *paddrp)
{
struct uvm_physseg *seg;
seg = HANDLE_TO_PHYSSEG_NODE(upm);
if (__predict_true(uvm.page_init_done == true))
panic("%s: unload attempted after uvm_page_init()\n", __func__);
/* any room in this bank? */
if (seg->avail_start >= seg->avail_end) {
return false; /* nope */
}
*paddrp = ctob(seg->avail_start);
/* Always unplug from front */
return uvm_physseg_unplug(seg->avail_start, 1);
}
/*
* vm_physseg_find: find vm_physseg structure that belongs to a PA
*/
uvm_physseg_t
uvm_physseg_find(paddr_t pframe, psize_t *offp)
{
struct uvm_physseg * ps = NULL;
ps = rb_tree_find_node(&(uvm_physseg_graph.rb_tree), &pframe);
	if (ps != NULL && offp != NULL)
*offp = pframe - ps->start;
return ps;
}
#else /* UVM_HOTPLUG */
/*
* physical memory config is stored in vm_physmem.
*/
#define VM_PHYSMEM_PTR(i) (&vm_physmem[i])
#if VM_PHYSSEG_MAX == 1
#define VM_PHYSMEM_PTR_SWAP(i, j) /* impossible */
#else
#define VM_PHYSMEM_PTR_SWAP(i, j) \
do { vm_physmem[(i)] = vm_physmem[(j)]; } while (0)
#endif
#define HANDLE_TO_PHYSSEG_NODE(h) (VM_PHYSMEM_PTR((int)h))
#define PHYSSEG_NODE_TO_HANDLE(u) ((int)((vsize_t) (u - vm_physmem) / sizeof(struct uvm_physseg)))
/* XXXCDC: uvm.physmem */
static struct uvm_physseg vm_physmem[VM_PHYSSEG_MAX] __read_mostly;
/* XXXCDC: uvm.nphysseg */
static int vm_nphysseg __read_mostly = 0;
#define vm_nphysmem vm_nphysseg
void
uvm_physseg_init(void)
{
/* XXX: Provisioning for rb_tree related init(s) */
return;
}
int
uvm_physseg_get_next(uvm_physseg_t lcv)
{
/* next of invalid is invalid, not fatal */
if (uvm_physseg_valid_p(lcv) == false)
return UVM_PHYSSEG_TYPE_INVALID;
return (lcv + 1);
}
int
uvm_physseg_get_prev(uvm_physseg_t lcv)
{
/* prev of invalid is invalid, not fatal */
if (uvm_physseg_valid_p(lcv) == false)
return UVM_PHYSSEG_TYPE_INVALID;
return (lcv - 1);
}
int
uvm_physseg_get_last(void)
{
return (vm_nphysseg - 1);
}
int
uvm_physseg_get_first(void)
{
return 0;
}
paddr_t
uvm_physseg_get_highest_frame(void)
{
int lcv;
paddr_t last = 0;
struct uvm_physseg *ps;
for (lcv = 0; lcv < vm_nphysseg; lcv++) {
ps = VM_PHYSMEM_PTR(lcv);
if (last < ps->end)
last = ps->end;
}
return last;
}
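/*
 * Verify that we are still in the "preload" phase, i.e. that no
 * segment has had its vm_page array populated yet; panic otherwise.
 * Always returns NULL.
 */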
static struct vm_page *
uvm_post_preload_check(void)
{
int preload, lcv;
/*
* check to see if this is a "preload" (i.e. uvm_page_init hasn't been
* called yet, so kmem is not available).
*/
for (lcv = 0 ; lcv < vm_nphysmem ; lcv++) {
if (VM_PHYSMEM_PTR(lcv)->pgs)
break;
}
preload = (lcv == vm_nphysmem);
/*
* if VM is already running, attempt to kmem_alloc vm_page structures
*/
if (!preload) {
panic("Tried to add RAM after uvm_page_init");
}
return NULL;
}
/*
* uvm_page_physunload: unload physical memory and return it to
* caller.
*/
bool
uvm_page_physunload(uvm_physseg_t psi, int freelist, paddr_t *paddrp)
{
int x;
struct uvm_physseg *seg;
uvm_post_preload_check();
seg = VM_PHYSMEM_PTR(psi);
if (seg->free_list != freelist) {
return false;
}
/* try from front */
if (seg->avail_start == seg->start &&
seg->avail_start < seg->avail_end) {
*paddrp = ctob(seg->avail_start);
seg->avail_start++;
seg->start++;
/* nothing left? nuke it */
if (seg->avail_start == seg->end) {
if (vm_nphysmem == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysmem--;
for (x = psi ; x < vm_nphysmem ; x++)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x + 1);
}
return (true);
}
/* try from rear */
if (seg->avail_end == seg->end &&
seg->avail_start < seg->avail_end) {
*paddrp = ctob(seg->avail_end - 1);
seg->avail_end--;
seg->end--;
/* nothing left? nuke it */
if (seg->avail_end == seg->start) {
if (vm_nphysmem == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysmem--;
for (x = psi ; x < vm_nphysmem ; x++)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x + 1);
}
return (true);
}
return false;
}
bool
uvm_page_physunload_force(uvm_physseg_t psi, int freelist, paddr_t *paddrp)
{
int x;
struct uvm_physseg *seg;
uvm_post_preload_check();
seg = VM_PHYSMEM_PTR(psi);
/* any room in this bank? */
if (seg->avail_start >= seg->avail_end) {
return false; /* nope */
}
*paddrp = ctob(seg->avail_start);
seg->avail_start++;
/* truncate! */
seg->start = seg->avail_start;
/* nothing left? nuke it */
if (seg->avail_start == seg->end) {
if (vm_nphysmem == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysmem--;
for (x = psi ; x < vm_nphysmem ; x++)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x + 1);
}
return (true);
}
bool
uvm_physseg_plug(paddr_t pfn, size_t pages, uvm_physseg_t *psp)
{
int lcv;
struct vm_page *pgs;
struct uvm_physseg *ps;
#ifdef DEBUG
paddr_t off;
uvm_physseg_t upm;
upm = uvm_physseg_find(pfn, &off);
if (uvm_physseg_valid_p(upm)) /* XXX; do we allow "update" plugs ? */
return false;
#endif
paddr_t start = pfn;
paddr_t end = pfn + pages;
paddr_t avail_start = start;
paddr_t avail_end = end;
if (uvmexp.pagesize == 0)
panic("uvm_page_physload: page size not set!");
/*
* do we have room?
*/
if (vm_nphysmem == VM_PHYSSEG_MAX) {
printf("uvm_page_physload: unable to load physical memory "
"segment\n");
printf("\t%d segments allocated, ignoring 0x%llx -> 0x%llx\n",
VM_PHYSSEG_MAX, (long long)start, (long long)end);
printf("\tincrease VM_PHYSSEG_MAX\n");
if (psp != NULL)
*psp = UVM_PHYSSEG_TYPE_INVALID_OVERFLOW;
return false;
}
/*
* check to see if this is a "preload" (i.e. uvm_page_init hasn't been
* called yet, so kmem is not available).
*/
pgs = uvm_post_preload_check();
/*
* now insert us in the proper place in vm_physmem[]
*/
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_RANDOM)
/* random: put it at the end (easy!) */
ps = VM_PHYSMEM_PTR(vm_nphysmem);
lcv = vm_nphysmem;
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
{
int x;
/* sort by address for binary search */
for (lcv = 0 ; lcv < vm_nphysmem ; lcv++)
if (start < VM_PHYSMEM_PTR(lcv)->start)
break;
ps = VM_PHYSMEM_PTR(lcv);
/* move back other entries, if necessary ... */
for (x = vm_nphysmem ; x > lcv ; x--)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x - 1);
}
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
{
int x;
/* sort by largest segment first */
for (lcv = 0 ; lcv < vm_nphysmem ; lcv++)
if ((end - start) >
(VM_PHYSMEM_PTR(lcv)->end - VM_PHYSMEM_PTR(lcv)->start))
break;
ps = VM_PHYSMEM_PTR(lcv);
/* move back other entries, if necessary ... */
for (x = vm_nphysmem ; x > lcv ; x--)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x - 1);
}
#else
panic("uvm_page_physload: unknown physseg strategy selected!");
#endif
ps->start = start;
ps->end = end;
ps->avail_start = avail_start;
ps->avail_end = avail_end;
ps->pgs = pgs;
vm_nphysmem++;
if (psp != NULL)
*psp = lcv;
return true;
}
/*
* when VM_PHYSSEG_MAX is 1, we can simplify these functions
*/
#if VM_PHYSSEG_MAX == 1
static inline int vm_physseg_find_contig(struct uvm_physseg *, int, paddr_t, psize_t *);
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
static inline int vm_physseg_find_bsearch(struct uvm_physseg *, int, paddr_t, psize_t *);
#else
static inline int vm_physseg_find_linear(struct uvm_physseg *, int, paddr_t, psize_t *);
#endif
/*
* vm_physseg_find: find vm_physseg structure that belongs to a PA
*/
inline int
uvm_physseg_find(paddr_t pframe, psize_t *offp)
{
#if VM_PHYSSEG_MAX == 1
return vm_physseg_find_contig(vm_physmem, vm_nphysseg, pframe, offp);
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
return vm_physseg_find_bsearch(vm_physmem, vm_nphysseg, pframe, offp);
#else
return vm_physseg_find_linear(vm_physmem, vm_nphysseg, pframe, offp);
#endif
}
#if VM_PHYSSEG_MAX == 1
static inline int
vm_physseg_find_contig(struct uvm_physseg *segs, int nsegs, paddr_t pframe, psize_t *offp)
{
/* 'contig' case */
if (pframe >= segs[0].start && pframe < segs[0].end) {
if (offp)
*offp = pframe - segs[0].start;
return(0);
}
return(-1);
}
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
static inline int
vm_physseg_find_bsearch(struct uvm_physseg *segs, int nsegs, paddr_t pframe, psize_t *offp)
{
/* binary search for it */
int start, len, guess;
/*
* if try is too large (thus target is less than try) we reduce
* the length to trunc(len/2) [i.e. everything smaller than "try"]
*
* if the try is too small (thus target is greater than try) then
* we set the new start to be (try + 1). this means we need to
* reduce the length to (round(len/2) - 1).
*
* note "adjust" below which takes advantage of the fact that
* (round(len/2) - 1) == trunc((len - 1) / 2)
* for any value of len we may have
*/
for (start = 0, len = nsegs ; len != 0 ; len = len / 2) {
guess = start + (len / 2); /* try in the middle */
/* start past our try? */
if (pframe >= segs[guess].start) {
/* was try correct? */
if (pframe < segs[guess].end) {
if (offp)
*offp = pframe - segs[guess].start;
return guess; /* got it */
}
start = guess + 1; /* next time, start here */
len--; /* "adjust" */
} else {
/*
* pframe before try, just reduce length of
* region, done in "for" loop
*/
}
}
return(-1);
}
#else
static inline int
vm_physseg_find_linear(struct uvm_physseg *segs, int nsegs, paddr_t pframe, psize_t *offp)
{
/* linear search for it */
int lcv;
	for (lcv = 0; lcv < nsegs; lcv++) {
		if (pframe >= segs[lcv].start &&
		    pframe < segs[lcv].end) {
			if (offp)
				*offp = pframe - segs[lcv].start;
			return(lcv);	/* got it */
		}
	}
return(-1);
}
#endif
#endif /* UVM_HOTPLUG */
/*
* PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages
* back from an I/O mapping (ugh!). used in some MD code as well. it can
* be prominent in flamegraphs, so optimise it and try to make it easy for
* the compiler by including next to the inline lookup routines.
*/
struct vm_page *
uvm_phys_to_vm_page(paddr_t pa)
{
#if VM_PHYSSEG_STRAT != VM_PSTRAT_BSEARCH
/* 'contig' and linear cases */
KASSERT(vm_nphysseg > 0);
struct uvm_physseg *ps = &vm_physmem[0];
struct uvm_physseg *end = &vm_physmem[vm_nphysseg];
paddr_t pframe = atop(pa);
do {
		if (pframe >= ps->start && pframe < ps->end) {
			return &ps->pgs[pframe - ps->start];
		}
} while (VM_PHYSSEG_MAX > 1 && __predict_false(++ps < end));
return NULL;
#else
/* binary search for it */
paddr_t pf = atop(pa);
paddr_t off;
uvm_physseg_t upm;
upm = uvm_physseg_find(pf, &off);
if (upm != UVM_PHYSSEG_TYPE_INVALID)
return uvm_physseg_get_pg(upm, off);
return(NULL);
#endif
}
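/*
 * uvm_physseg_valid_p: check whether a segment handle refers to a
 * valid segment.  After uvm_page_init() the segment must also have
 * its vm_page array attached.
 */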
bool
uvm_physseg_valid_p(uvm_physseg_t upm)
{
struct uvm_physseg *ps;
if (upm == UVM_PHYSSEG_TYPE_INVALID ||
upm == UVM_PHYSSEG_TYPE_INVALID_EMPTY ||
upm == UVM_PHYSSEG_TYPE_INVALID_OVERFLOW)
return false;
	/*
	 * This is the delicate init dance - validity checks have to
	 * work both before and after uvm_page_init().
	 */
if (uvm.page_init_done != true)
return true;
ps = HANDLE_TO_PHYSSEG_NODE(upm);
/* Extra checks needed only post uvm_page_init() */
if (ps->pgs == NULL)
return false;
/* XXX: etc. */
return true;
}
/*
* Boot protocol dictates that these must be able to return partially
* initialised segments.
*/
paddr_t
uvm_physseg_get_start(uvm_physseg_t upm)
{
if (uvm_physseg_valid_p(upm) == false)
return (paddr_t) -1;
return HANDLE_TO_PHYSSEG_NODE(upm)->start;
}
paddr_t
uvm_physseg_get_end(uvm_physseg_t upm)
{
if (uvm_physseg_valid_p(upm) == false)
return (paddr_t) -1;
return HANDLE_TO_PHYSSEG_NODE(upm)->end;
}
paddr_t
uvm_physseg_get_avail_start(uvm_physseg_t upm)
{
if (uvm_physseg_valid_p(upm) == false)
return (paddr_t) -1;
return HANDLE_TO_PHYSSEG_NODE(upm)->avail_start;
}
#if defined(UVM_PHYSSEG_LEGACY)
void
uvm_physseg_set_avail_start(uvm_physseg_t upm, paddr_t avail_start)
{
struct uvm_physseg *ps = HANDLE_TO_PHYSSEG_NODE(upm);
#if defined(DIAGNOSTIC)
paddr_t avail_end;
avail_end = uvm_physseg_get_avail_end(upm);
KASSERT(uvm_physseg_valid_p(upm));
KASSERT(avail_start < avail_end);
KASSERT(avail_start >= ps->start);
#endif
ps->avail_start = avail_start;
}
void
uvm_physseg_set_avail_end(uvm_physseg_t upm, paddr_t avail_end)
{
struct uvm_physseg *ps = HANDLE_TO_PHYSSEG_NODE(upm);
#if defined(DIAGNOSTIC)
paddr_t avail_start;
avail_start = uvm_physseg_get_avail_start(upm);
KASSERT(uvm_physseg_valid_p(upm));
KASSERT(avail_end > avail_start);
KASSERT(avail_end <= ps->end);
#endif
ps->avail_end = avail_end;
}
#endif /* UVM_PHYSSEG_LEGACY */
paddr_t
uvm_physseg_get_avail_end(uvm_physseg_t upm)
{
if (uvm_physseg_valid_p(upm) == false)
return (paddr_t) -1;
return HANDLE_TO_PHYSSEG_NODE(upm)->avail_end;
}
inline struct vm_page *
uvm_physseg_get_pg(uvm_physseg_t upm, paddr_t idx)
{
KASSERT(uvm_physseg_valid_p(upm));
return &HANDLE_TO_PHYSSEG_NODE(upm)->pgs[idx];
}
#ifdef __HAVE_PMAP_PHYSSEG
struct pmap_physseg *
uvm_physseg_get_pmseg(uvm_physseg_t upm)
{
KASSERT(uvm_physseg_valid_p(upm));
return &(HANDLE_TO_PHYSSEG_NODE(upm)->pmseg);
}
#endif
int
uvm_physseg_get_free_list(uvm_physseg_t upm)
{
	KASSERT(uvm_physseg_valid_p(upm));
return HANDLE_TO_PHYSSEG_NODE(upm)->free_list;
}
u_long
uvm_physseg_get_start_hint(uvm_physseg_t upm)
{
KASSERT(uvm_physseg_valid_p(upm));
return HANDLE_TO_PHYSSEG_NODE(upm)->start_hint;
}
bool
uvm_physseg_set_start_hint(uvm_physseg_t upm, u_long start_hint)
{
if (uvm_physseg_valid_p(upm) == false)
return false;
HANDLE_TO_PHYSSEG_NODE(upm)->start_hint = start_hint;
return true;
}
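/*
 * uvm_physseg_init_seg: attach the vm_page array 'pgs' to the segment
 * and initialise each page, freeing those within the available range
 * into the page pool.
 */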
void
uvm_physseg_init_seg(uvm_physseg_t upm, struct vm_page *pgs)
{
psize_t i;
psize_t n;
paddr_t paddr;
struct uvm_physseg *seg;
struct vm_page *pg;
KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
KASSERT(pgs != NULL);
seg = HANDLE_TO_PHYSSEG_NODE(upm);
KASSERT(seg != NULL);
KASSERT(seg->pgs == NULL);
n = seg->end - seg->start;
seg->pgs = pgs;
/* init and free vm_pages (we've already zeroed them) */
paddr = ctob(seg->start);
for (i = 0 ; i < n ; i++, paddr += PAGE_SIZE) {
pg = &seg->pgs[i];
pg->phys_addr = paddr;
#ifdef __HAVE_VM_PAGE_MD
VM_MDPAGE_INIT(pg);
#endif
if (atop(paddr) >= seg->avail_start &&
atop(paddr) < seg->avail_end) {
uvmexp.npages++;
/* add page to free pool */
uvm_page_set_freelist(pg,
uvm_page_lookup_freelist(pg));
/* Disable LOCKDEBUG: too many and too early. */
mutex_init(&pg->interlock, MUTEX_NODEBUG, IPL_NONE);
uvm_pagefree(pg);
}
}
}
void
uvm_physseg_seg_chomp_slab(uvm_physseg_t upm, struct vm_page *pgs, size_t n)
{
struct uvm_physseg *seg = HANDLE_TO_PHYSSEG_NODE(upm);
/* max number of pre-boot unplug()s allowed */
#define UVM_PHYSSEG_BOOT_UNPLUG_MAX VM_PHYSSEG_MAX
static char btslab_ex_storage[EXTENT_FIXED_STORAGE_SIZE(UVM_PHYSSEG_BOOT_UNPLUG_MAX)];
if (__predict_false(uvm.page_init_done == false)) {
seg->ext = extent_create("Boot time slab", (u_long) pgs, (u_long) (pgs + n),
(void *)btslab_ex_storage, sizeof(btslab_ex_storage), 0);
} else {
seg->ext = extent_create("Hotplug slab", (u_long) pgs, (u_long) (pgs + n), NULL, 0, 0);
}
KASSERT(seg->ext != NULL);
}
struct vm_page *
uvm_physseg_seg_alloc_from_slab(uvm_physseg_t upm, size_t pages)
{
int err;
struct uvm_physseg *seg;
struct vm_page *pgs = NULL;
KASSERT(pages > 0);
seg = HANDLE_TO_PHYSSEG_NODE(upm);
if (__predict_false(seg->ext == NULL)) {
/*
* This is a situation unique to boot time.
* It shouldn't happen at any point other than from
* the first uvm_page.c:uvm_page_init() call
* Since we're in a loop, we can get away with the
* below.
*/
KASSERT(uvm.page_init_done != true);
uvm_physseg_t upmp = uvm_physseg_get_prev(upm);
KASSERT(upmp != UVM_PHYSSEG_TYPE_INVALID);
seg->ext = HANDLE_TO_PHYSSEG_NODE(upmp)->ext;
KASSERT(seg->ext != NULL);
}
/* We allocate enough for this segment */
err = extent_alloc(seg->ext, sizeof(*pgs) * pages, 1, 0, EX_BOUNDZERO, (u_long *)&pgs);
if (err != 0) {
#ifdef DEBUG
printf("%s: extent_alloc failed with error: %d \n",
__func__, err);
#endif
}
return pgs;
}
/*
* uvm_page_physload: load physical memory into VM system
*
* => all args are PFs
* => all pages in start/end get vm_page structures
* => areas marked by avail_start/avail_end get added to the free page pool
* => we are limited to VM_PHYSSEG_MAX physical memory segments
*/
uvm_physseg_t
uvm_page_physload(paddr_t start, paddr_t end, paddr_t avail_start,
paddr_t avail_end, int free_list)
{
struct uvm_physseg *ps;
uvm_physseg_t upm;
if (__predict_true(uvm.page_init_done == true))
panic("%s: unload attempted after uvm_page_init()\n", __func__);
if (uvmexp.pagesize == 0)
panic("uvm_page_physload: page size not set!");
if (free_list >= VM_NFREELIST || free_list < VM_FREELIST_DEFAULT)
panic("uvm_page_physload: bad free list %d", free_list);
if (start >= end)
panic("uvm_page_physload: start[%" PRIxPADDR "] >= end[%"
PRIxPADDR "]", start, end);
if (uvm_physseg_plug(start, end - start, &upm) == false) {
panic("uvm_physseg_plug() failed at boot.");
/* NOTREACHED */
return UVM_PHYSSEG_TYPE_INVALID; /* XXX: correct type */
}
ps = HANDLE_TO_PHYSSEG_NODE(upm);
/* Legacy */
ps->avail_start = avail_start;
ps->avail_end = avail_end;
ps->free_list = free_list; /* XXX: */
return upm;
}
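/*
 * Illustrative sketch (not part of the original source): a typical boot-time
 * call from machine-dependent code, loading one RAM segment.  All arguments
 * are page frame numbers, as the comment above states; start_pfn and end_pfn
 * are hypothetical locals, with end_pfn exclusive.
 */
#if 0
	uvm_page_physload(start_pfn, end_pfn, start_pfn, end_pfn,
	    VM_FREELIST_DEFAULT);
#endif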
bool
uvm_physseg_unplug(paddr_t pfn, size_t pages)
{
uvm_physseg_t upm;
paddr_t off = 0, start __diagused, end;
struct uvm_physseg *seg;
upm = uvm_physseg_find(pfn, &off);
if (!uvm_physseg_valid_p(upm)) {
printf("%s: Tried to unplug from unknown offset\n", __func__);
return false;
}
seg = HANDLE_TO_PHYSSEG_NODE(upm);
start = uvm_physseg_get_start(upm);
end = uvm_physseg_get_end(upm);
if (end < (pfn + pages)) {
printf("%s: Tried to unplug oversized span \n", __func__);
return false;
}
KASSERT(pfn == start + off); /* sanity */
if (__predict_true(uvm.page_init_done == true)) {
/* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */
if (extent_free(seg->ext, (u_long)(seg->pgs + off), sizeof(struct vm_page) * pages, EX_MALLOCOK | EX_NOWAIT) != 0)
return false;
}
if (off == 0 && (pfn + pages) == end) {
#if defined(UVM_HOTPLUG) /* rbtree implementation */
int segcount = 0;
struct uvm_physseg *current_ps;
/* Complete segment */
if (uvm_physseg_graph.nentries == 1)
panic("%s: out of memory!", __func__);
if (__predict_true(uvm.page_init_done == true)) {
RB_TREE_FOREACH(current_ps, &(uvm_physseg_graph.rb_tree)) {
if (seg->ext == current_ps->ext)
segcount++;
}
KASSERT(segcount > 0);
if (segcount == 1) {
extent_destroy(seg->ext);
}
/*
* We assume that the unplug will succeed from
* this point onwards
*/
uvmexp.npages -= (int) pages;
}
rb_tree_remove_node(&(uvm_physseg_graph.rb_tree), upm);
memset(seg, 0, sizeof(struct uvm_physseg));
uvm_physseg_free(seg, sizeof(struct uvm_physseg));
uvm_physseg_graph.nentries--;
#else /* UVM_HOTPLUG */
int x;
if (vm_nphysmem == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysmem--;
for (x = upm ; x < vm_nphysmem ; x++)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x + 1);
#endif /* UVM_HOTPLUG */
/* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */
return true;
}
if (off > 0 &&
(pfn + pages) < end) {
#if defined(UVM_HOTPLUG) /* rbtree implementation */
/* middle chunk - need a new segment */
struct uvm_physseg *ps, *current_ps;
ps = uvm_physseg_alloc(sizeof (struct uvm_physseg));
if (ps == NULL) {
printf("%s: Unable to allocated new fragment vm_physseg \n",
__func__);
return false;
}
/* Remove middle chunk */
if (__predict_true(uvm.page_init_done == true)) {
KASSERT(seg->ext != NULL);
ps->ext = seg->ext;
/* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */
/*
* We assume that the unplug will succeed from
* this point onwards
*/
uvmexp.npages -= (int) pages;
}
ps->start = pfn + pages;
ps->avail_start = ps->start; /* XXX: Legacy */
ps->end = seg->end;
ps->avail_end = ps->end; /* XXX: Legacy */
seg->end = pfn;
seg->avail_end = seg->end; /* XXX: Legacy */
/*
* The new pgs array points to the beginning of the
* tail fragment.
*/
if (__predict_true(uvm.page_init_done == true))
ps->pgs = seg->pgs + off + pages;
current_ps = rb_tree_insert_node(&(uvm_physseg_graph.rb_tree), ps);
if (current_ps != ps) {
panic("uvm_page_physload: Duplicate address range detected!");
}
uvm_physseg_graph.nentries++;
#else /* UVM_HOTPLUG */
panic("%s: can't unplug() from the middle of a segment without"
" UVM_HOTPLUG\n", __func__);
/* NOTREACHED */
#endif /* UVM_HOTPLUG */
return true;
}
if (off == 0 && (pfn + pages) < end) {
/* Remove front chunk */
if (__predict_true(uvm.page_init_done == true)) {
/* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */
/*
* We assume that the unplug will succeed from
* this point onwards
*/
uvmexp.npages -= (int) pages;
}
/* Truncate */
seg->start = pfn + pages;
seg->avail_start = seg->start; /* XXX: Legacy */
/*
* Move the pgs array start to the beginning of the
* tail end.
*/
if (__predict_true(uvm.page_init_done == true))
seg->pgs += pages;
return true;
}
if (off > 0 && (pfn + pages) == end) {
/* back chunk */
/* Truncate! */
seg->end = pfn;
seg->avail_end = seg->end; /* XXX: Legacy */
uvmexp.npages -= (int) pages;
return true;
}
printf("%s: Tried to unplug unknown range \n", __func__);
return false;
}
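/*
 * Illustrative sketch (not part of the original source): unplugging a
 * physical range given a (hypothetical) base address "pa" and byte length
 * "sz".  The arguments are a page frame number and a page count, so the
 * caller converts with atop().
 */
#if 0
	if (uvm_physseg_unplug(atop(pa), atop(sz)) == false)
		printf("unplug of range at 0x%" PRIxPADDR " failed\n", pa);
#endif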
/* $NetBSD: ufs_inode.c,v 1.112 2020/09/05 16:30:13 riastradh Exp $ */
/*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_inode.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.112 2020/09/05 16:30:13 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#include "opt_wapbl.h"
#include "opt_uvmhist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <sys/wapbl.h>
#include <sys/kmem.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_wapbl.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dirhash.h>
#endif
#ifdef UFS_EXTATTR
#include <ufs/ufs/extattr.h>
#endif
#ifdef UVMHIST
#include <uvm/uvm.h>
#endif
#include <uvm/uvm_page.h>
#include <uvm/uvm_stat.h>
/*
* Last reference to an inode. If necessary, write or delete it.
*/
int
ufs_inactive(void *v)
{
struct vop_inactive_v2_args /* {
struct vnode *a_vp;
	bool *a_recycle;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct inode *ip = VTOI(vp);
struct mount *mp = vp->v_mount;
mode_t mode;
int allerror = 0, error;
bool wapbl_locked = false;
UFS_WAPBL_JUNLOCK_ASSERT(mp);
/*
* Ignore inodes related to stale file handles.
*/
if (ip->i_mode == 0)
goto out;
if (ip->i_nlink <= 0 && (mp->mnt_flag & MNT_RDONLY) == 0) {
#ifdef UFS_EXTATTR
ufs_extattr_vnode_inactive(vp, curlwp);
#endif
/*
* All file blocks must be freed before we can let the vnode
 * be reclaimed, so we cannot postpone the full truncation any further.
*/
ufs_truncate_all(vp);
#if defined(QUOTA) || defined(QUOTA2)
error = UFS_WAPBL_BEGIN(mp);
if (error) {
allerror = error;
} else {
wapbl_locked = true;
(void)chkiq(ip, -1, NOCRED, 0);
}
#endif
DIP_ASSIGN(ip, rdev, 0);
mode = ip->i_mode;
ip->i_mode = 0;
ip->i_omode = mode;
DIP_ASSIGN(ip, mode, 0);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* Defer final inode free and update to ufs_reclaim().
*/
}
if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) {
		if (! wapbl_locked) {
			error = UFS_WAPBL_BEGIN(mp);
if (error) {
allerror = error;
goto out;
}
wapbl_locked = true;
}
UFS_UPDATE(vp, NULL, NULL, 0);
}
out:
	if (wapbl_locked)
		UFS_WAPBL_END(mp);
/*
* If we are done with the inode, reclaim it
* so that it can be reused immediately.
*/
*ap->a_recycle = (ip->i_mode == 0);
if (ip->i_mode == 0 && (DIP(ip, size) != 0 || DIP(ip, blocks) != 0)) {
printf("%s: unlinked ino %" PRId64 " on \"%s\" has"
" non zero size %" PRIx64 " or blocks %" PRIx64
" with allerror %d\n",
__func__, ip->i_number, mp->mnt_stat.f_mntonname,
DIP(ip, size), DIP(ip, blocks), allerror);
panic("%s: dirty filesystem?", __func__);
}
return (allerror);
}
/*
* Reclaim an inode so that it can be used for other purposes.
*/
int
ufs_reclaim(struct vnode *vp)
{
struct inode *ip = VTOI(vp);
	if (!UFS_WAPBL_BEGIN(vp->v_mount)) {
		UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE);
		UFS_WAPBL_END(vp->v_mount);
}
UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE);
	if (ip->i_devvp) {
		vrele(ip->i_devvp);
ip->i_devvp = 0;
}
#if defined(QUOTA) || defined(QUOTA2)
ufsquota_free(ip);
#endif
#ifdef UFS_DIRHASH
	if (ip->i_dirhash != NULL)
		ufsdirhash_free(ip);
#endif
return (0);
}
/*
* allocate a range of blocks in a file.
* after this function returns, any page entirely contained within the range
* will map to invalid data and thus must be overwritten before it is made
* accessible to others.
*/
int
ufs_balloc_range(struct vnode *vp, off_t off, off_t len, kauth_cred_t cred,
int flags)
{
off_t neweof; /* file size after the operation */
off_t neweob; /* offset next to the last block after the operation */
off_t pagestart; /* starting offset of range covered by pgs */
off_t eob; /* offset next to allocated blocks */
struct uvm_object *uobj;
int i, delta, error, npages;
int bshift = vp->v_mount->mnt_fs_bshift;
int bsize = 1 << bshift;
int ppb = MAX(bsize >> PAGE_SHIFT, 1);
struct vm_page **pgs;
size_t pgssize;
UVMHIST_FUNC("ufs_balloc_range"); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx off 0x%jx len 0x%jx u_size 0x%jx",
(uintptr_t)vp, off, len, vp->v_size);
neweof = MAX(vp->v_size, off + len);
GOP_SIZE(vp, neweof, &neweob, 0);
error = 0;
uobj = &vp->v_uobj;
/*
* read or create pages covering the range of the allocation and
* keep them locked until the new block is allocated, so there
* will be no window where the old contents of the new block are
* visible to racing threads.
*/
pagestart = trunc_page(off) & ~(bsize - 1);
npages = MIN(ppb, (round_page(neweob) - pagestart) >> PAGE_SHIFT);
pgssize = npages * sizeof(struct vm_page *);
pgs = kmem_zalloc(pgssize, KM_SLEEP);
/*
* adjust off to be block-aligned.
*/
delta = off & (bsize - 1);
off -= delta;
len += delta;
genfs_node_wrlock(vp);
rw_enter(uobj->vmobjlock, RW_WRITER);
error = VOP_GETPAGES(vp, pagestart, pgs, &npages, 0,
VM_PROT_WRITE, 0, PGO_SYNCIO | PGO_PASTEOF | PGO_NOBLOCKALLOC |
PGO_NOTIMESTAMP | PGO_GLOCKHELD);
if (error) {
genfs_node_unlock(vp);
goto out;
}
/*
* now allocate the range.
*/
error = GOP_ALLOC(vp, off, len, flags, cred);
genfs_node_unlock(vp);
/*
* if the allocation succeeded, mark all the pages dirty
* and clear PG_RDONLY on any pages that are now fully backed
* by disk blocks. if the allocation failed, we do not invalidate
* the pages since they might have already existed and been dirty,
* in which case we need to keep them around. if we created the pages,
* they will be clean and read-only, and leaving such pages
* in the cache won't cause any problems.
*/
GOP_SIZE(vp, off + len, &eob, 0);
rw_enter(uobj->vmobjlock, RW_WRITER);
	for (i = 0; i < npages; i++) {
		KASSERT((pgs[i]->flags & PG_RELEASED) == 0);
		if (!error) {
			if (off <= pagestart + (i << PAGE_SHIFT) &&
pagestart + ((i + 1) << PAGE_SHIFT) <= eob) {
pgs[i]->flags &= ~PG_RDONLY;
}
uvm_pagemarkdirty(pgs[i], UVM_PAGE_STATUS_DIRTY);
}
uvm_pagelock(pgs[i]);
uvm_pageactivate(pgs[i]);
uvm_pageunlock(pgs[i]);
}
uvm_page_unbusy(pgs, npages);
rw_exit(uobj->vmobjlock);
out:
kmem_free(pgs, pgssize);
return error;
}
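/*
 * Illustrative sketch (not part of the original source): how a write path
 * might call ufs_balloc_range() before copying data into a region that may
 * not yet have backing blocks.  "uio", "bytes", "cred" and "ioflag" are
 * hypothetical locals of the caller, and the B_SYNC/IO_SYNC mapping is an
 * assumption about the flags the caller wants.
 */
#if 0
	error = ufs_balloc_range(vp, uio->uio_offset, bytes, cred,
	    (ioflag & IO_SYNC) ? B_SYNC : 0);
	if (error)
		return error;
	/* the blocks now exist; overwrite them before exposing the data */
#endif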
int
ufs_truncate_retry(struct vnode *vp, int ioflag, uint64_t newsize,
kauth_cred_t cred)
{
struct inode *ip = VTOI(vp);
struct mount *mp = vp->v_mount;
int error = 0;
UFS_WAPBL_JUNLOCK_ASSERT(mp);
/*
* Truncate might temporarily fail, loop until done.
*/
do {
error = UFS_WAPBL_BEGIN(mp);
if (error)
goto out;
error = UFS_TRUNCATE(vp, newsize, ioflag, cred);
		UFS_WAPBL_END(mp);
		if (error != 0 && error != EAGAIN)
goto out;
} while (ip->i_size != newsize);
out:
return error;
}
/* truncate all the data of the inode including extended attributes */
int
ufs_truncate_all(struct vnode *vp)
{
struct inode *ip = VTOI(vp);
off_t isize = ip->i_size;
	if (ip->i_ump->um_fstype == UFS2)
		isize += ip->i_ffs2_extsize;
	if (isize == 0)
return 0;
return ufs_truncate_retry(vp, IO_NORMAL | IO_EXT, 0, NOCRED);
}
/* $NetBSD: subr_debug.c,v 1.7 2008/04/30 20:20:53 ad Exp $ */
/*-
* Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Shared support code for kernels built with the DEBUG option.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_debug.c,v 1.7 2008/04/30 20:20:53 ad Exp $");
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <uvm/uvm_extern.h>
#include <machine/lock.h>
/*
* Allocation/free validation by pointer address. Introduces
* significant overhead and is not enabled by default. Patch
* `debug_freecheck' to 1 at boot time to enable.
*/
#define FREECHECK_BYTES (8*1024*1024)
typedef struct fcitem {
void *i_addr;
struct fcitem *i_next;
} fcitem_t;
fcitem_t *freecheck_free;
__cpu_simple_lock_t freecheck_lock;
u_int debug_freecheck;
void
debug_init(void)
{
size_t cnt;
fcitem_t *i;
__cpu_simple_lock_init(&freecheck_lock);
if (debug_freecheck) {
i = (fcitem_t *)uvm_km_alloc(kernel_map, FREECHECK_BYTES, 0,
UVM_KMF_WIRED);
if (i == NULL) {
printf("freecheck_init: unable to allocate memory");
return;
}
for (cnt = FREECHECK_BYTES / sizeof(*i); cnt != 0; cnt--) {
i->i_next = freecheck_free;
freecheck_free = i++;
}
}
}
void
freecheck_out(void **head, void *addr)
{
fcitem_t *i;
int s;
if (!debug_freecheck)
return;
s = splvm();
__cpu_simple_lock(&freecheck_lock);
for (i = *head; i != NULL; i = i->i_next) {
if (i->i_addr != addr)
continue;
__cpu_simple_unlock(&freecheck_lock);
splx(s);
panic("freecheck_out: %p already out", addr);
}
if ((i = freecheck_free) != NULL) {
freecheck_free = i->i_next;
i->i_addr = addr;
i->i_next = *head;
*head = i;
}
__cpu_simple_unlock(&freecheck_lock);
splx(s);
if (i == NULL) {
		if (atomic_swap_uint(&debug_freecheck, 1) == 0)
			printf("freecheck_out: no more slots\n");
}
}
void
freecheck_in(void **head, void *addr)
{
fcitem_t *i;
void *pp;
int s;
if (!debug_freecheck)
return;
s = splvm();
__cpu_simple_lock(&freecheck_lock);
	for (i = *head, pp = head; i != NULL; pp = &i->i_next, i = i->i_next) {
		if (i->i_addr == addr) {
*(fcitem_t **)pp = i->i_next;
i->i_next = freecheck_free;
freecheck_free = i;
break;
}
}
__cpu_simple_unlock(&freecheck_lock);
splx(s);
if (i != NULL)
return;
#ifdef DDB
printf("freecheck_in: %p not out\n", addr);
Debugger();
#else
panic("freecheck_in: %p not out", addr);
#endif
}
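/*
 * Illustrative sketch (not part of the original source): how an allocator
 * might use the pair above to catch double frees and stray pointers.
 * "my_inuse_list" is a hypothetical per-allocator list head; freecheck_out()
 * records an object as outstanding at allocation time and freecheck_in()
 * checks it back in, panicking on a pointer that was never handed out.
 */
#if 0
static void *my_inuse_list;

void *
my_alloc(size_t size)
{
	void *p = kmem_alloc(size, KM_SLEEP);

	freecheck_out(&my_inuse_list, p);
	return p;
}

void
my_free(void *p, size_t size)
{
	freecheck_in(&my_inuse_list, p);
	kmem_free(p, size);
}
#endif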
/* $NetBSD: socketvar.h,v 1.168 2024/02/03 19:05:14 jdolecek Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)socketvar.h 8.3 (Berkeley) 2/19/95
*/
#ifndef _SYS_SOCKETVAR_H_
#define _SYS_SOCKETVAR_H_
#include <sys/select.h>
#include <sys/selinfo.h> /* for struct selinfo */
#include <sys/queue.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#if !defined(_KERNEL)
struct uio;
struct lwp;
struct uidinfo;
#else
#include <sys/atomic.h>
#include <sys/uidinfo.h>
#endif
TAILQ_HEAD(soqhead, socket);
/*
* Variables for socket buffering.
*/
struct sockbuf {
struct selinfo sb_sel; /* process selecting read/write */
struct mowner *sb_mowner; /* who owns data for this sockbuf */
struct socket *sb_so; /* back pointer to socket */
kcondvar_t sb_cv; /* notifier */
/* When re-zeroing this struct, we zero from sb_startzero to the end */
#define sb_startzero sb_cc
u_long sb_cc; /* actual chars in buffer */
u_long sb_hiwat; /* max actual char count */
u_long sb_mbcnt; /* chars of mbufs used */
u_long sb_mbmax; /* max chars of mbufs to use */
u_long sb_lowat; /* low water mark */
struct mbuf *sb_mb; /* the mbuf chain */
struct mbuf *sb_mbtail; /* the last mbuf in the chain */
struct mbuf *sb_lastrecord; /* first mbuf of last record in
socket buffer */
int sb_flags; /* flags, see below */
int sb_timeo; /* timeout for read/write */
u_long sb_overflowed; /* # of drops due to full buffer */
};
#ifndef SB_MAX
#define SB_MAX (256*1024) /* default for max chars in sockbuf */
#endif
#define SB_LOCK 0x01 /* lock on data queue */
#define SB_NOTIFY 0x04 /* someone is waiting for data/space */
#define SB_ASYNC 0x10 /* ASYNC I/O, need signals */
#define SB_UPCALL 0x20 /* someone wants an upcall */
#define SB_NOINTR 0x40 /* operations not interruptible */
#define SB_KNOTE 0x100 /* kernel note attached */
#define SB_AUTOSIZE 0x800 /* automatically size socket buffer */
/*
* Kernel structure per socket.
* Contains send and receive buffer queues,
* handle on protocol and pointer to protocol
* private data and error information.
*/
struct so_accf {
struct accept_filter *so_accept_filter;
void *so_accept_filter_arg; /* saved filter args */
char *so_accept_filter_str; /* saved user args */
};
struct sockaddr;
struct socket {
kmutex_t * volatile so_lock; /* pointer to lock on structure */
kcondvar_t so_cv; /* notifier */
short so_type; /* generic type, see socket.h */
short so_options; /* from socket call, see socket.h */
u_short so_linger; /* time to linger while closing */
short so_state; /* internal state flags SS_*, below */
int so_unused; /* used to be so_nbio */
void *so_pcb; /* protocol control block */
const struct protosw *so_proto; /* protocol handle */
/*
* Variables for connection queueing.
* Socket where accepts occur is so_head in all subsidiary sockets.
* If so_head is 0, socket is not related to an accept.
* For head socket so_q0 queues partially completed connections,
* while so_q is a queue of connections ready to be accepted.
* If a connection is aborted and it has so_head set, then
* it has to be pulled out of either so_q0 or so_q.
* We allow connections to queue up based on current queue lengths
* and limit on number of queued connections for this socket.
*/
struct socket *so_head; /* back pointer to accept socket */
struct soqhead *so_onq; /* queue (q or q0) that we're on */
struct soqhead so_q0; /* queue of partial connections */
struct soqhead so_q; /* queue of incoming connections */
TAILQ_ENTRY(socket) so_qe; /* our queue entry (q or q0) */
short so_q0len; /* partials on so_q0 */
short so_qlen; /* number of connections on so_q */
short so_qlimit; /* max number queued connections */
short so_timeo; /* connection timeout */
u_short so_error; /* error affecting connection */
u_short so_rerror; /* error affecting receiving */
u_short so_aborting; /* references from soabort() */
pid_t so_pgid; /* pgid for signals */
u_long so_oobmark; /* chars to oob mark */
struct sockbuf so_snd; /* send buffer */
struct sockbuf so_rcv; /* receive buffer */
void *so_internal; /* Space for svr4 stream data */
void (*so_upcall) (struct socket *, void *, int, int);
void * so_upcallarg; /* Arg for above */
int (*so_send) (struct socket *, struct sockaddr *,
struct uio *, struct mbuf *,
struct mbuf *, int, struct lwp *);
int (*so_receive) (struct socket *,
struct mbuf **,
struct uio *, struct mbuf **,
struct mbuf **, int *);
struct mowner *so_mowner; /* who owns mbufs for this socket */
struct uidinfo *so_uidinfo; /* who opened the socket */
gid_t so_egid; /* creator effective gid */
pid_t so_cpid; /* creator pid */
struct so_accf *so_accf;
kauth_cred_t so_cred; /* socket credentials */
};
/*
* Socket state bits.
*/
#define SS_NOFDREF 0x001 /* no file table ref any more */
#define SS_ISCONNECTED 0x002 /* socket connected to a peer */
#define SS_ISCONNECTING 0x004 /* in process of connecting to peer */
#define SS_ISDISCONNECTING 0x008 /* in process of disconnecting */
#define SS_CANTSENDMORE 0x010 /* can't send more data to peer */
#define SS_CANTRCVMORE 0x020 /* can't receive more data from peer */
#define SS_RCVATMARK 0x040 /* at mark on input */
#define SS_ISABORTING 0x080 /* aborting fd references - close() */
#define SS_RESTARTSYS 0x100 /* restart blocked system calls */
#define SS_POLLRDBAND 0x200 /* poll should return POLLRDBAND */
#define SS_MORETOCOME 0x400 /*
* hint from sosend to lower layer;
* more data coming
*/
#define SS_ISDISCONNECTED 0x800 /* socket disconnected from peer */
#define SS_ISAPIPE 0x1000 /* socket is implementing a pipe */
#define SS_NBIO 0x2000 /* socket is in non blocking I/O */
#ifdef _KERNEL
struct accept_filter {
char accf_name[16];
void (*accf_callback)
(struct socket *, void *, int, int);
void * (*accf_create)
(struct socket *, char *);
void (*accf_destroy)
(struct socket *);
LIST_ENTRY(accept_filter) accf_next;
u_int accf_refcnt;
};
struct sockopt {
int sopt_level; /* option level */
int sopt_name; /* option name */
size_t sopt_size; /* data length */
size_t sopt_retsize; /* returned data length */
void * sopt_data; /* data pointer */
uint8_t sopt_buf[sizeof(int)]; /* internal storage */
};
#define SB_EMPTY_FIXUP(sb) \
do { \
KASSERT(solocked((sb)->sb_so)); \
if ((sb)->sb_mb == NULL) { \
(sb)->sb_mbtail = NULL; \
(sb)->sb_lastrecord = NULL; \
} \
} while (/*CONSTCOND*/0)
extern u_long sb_max;
extern int somaxkva;
extern int sock_loan_thresh;
extern kmutex_t *softnet_lock;
struct mbuf;
struct lwp;
struct msghdr;
struct stat;
struct knote;
struct sockaddr_big;
enum uio_seg;
/* 0x400 is SO_OTIMESTAMP */
#define SOOPT_TIMESTAMP(o) ((o) & (SO_TIMESTAMP | 0x400))
/*
* File operations on sockets.
*/
int soo_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
int soo_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
int soo_fcntl(file_t *, u_int cmd, void *);
int soo_ioctl(file_t *, u_long cmd, void *);
int soo_poll(file_t *, int);
int soo_kqfilter(file_t *, struct knote *);
int soo_close(file_t *);
int soo_stat(file_t *, struct stat *);
void soo_restart(file_t *);
void sbappend(struct sockbuf *, struct mbuf *);
void sbappendstream(struct sockbuf *, struct mbuf *);
int sbappendaddr(struct sockbuf *, const struct sockaddr *, struct mbuf *,
struct mbuf *);
int sbappendaddrchain(struct sockbuf *, const struct sockaddr *,
struct mbuf *, int);
int sbappendcontrol(struct sockbuf *, struct mbuf *, struct mbuf *);
void sbappendrecord(struct sockbuf *, struct mbuf *);
void sbcheck(struct sockbuf *);
void sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);
struct mbuf *
sbcreatecontrol(void *, int, int, int);
struct mbuf *
sbcreatecontrol1(void **, int, int, int, int);
struct mbuf **
sbsavetimestamp(int, struct mbuf **);
void sbdrop(struct sockbuf *, int);
void sbdroprecord(struct sockbuf *);
void sbflush(struct sockbuf *);
void sbinsertoob(struct sockbuf *, struct mbuf *);
void sbrelease(struct sockbuf *, struct socket *);
int sbreserve(struct sockbuf *, u_long, struct socket *);
int sbwait(struct sockbuf *);
int sb_max_set(u_long);
void soinit(void);
void soinit1(void);
void soinit2(void);
int soabort(struct socket *);
int soaccept(struct socket *, struct sockaddr *);
int sofamily(const struct socket *);
int sobind(struct socket *, struct sockaddr *, struct lwp *);
void socantrcvmore(struct socket *);
void socantsendmore(struct socket *);
void soroverflow(struct socket *);
int soclose(struct socket *);
int soconnect(struct socket *, struct sockaddr *, struct lwp *);
int soconnect2(struct socket *, struct socket *);
int socreate(int, struct socket **, int, int, struct lwp *,
struct socket *);
int fsocreate(int, struct socket **, int, int, int *, file_t **,
struct socket *);
int sodisconnect(struct socket *);
void sofree(struct socket *);
int sogetopt(struct socket *, struct sockopt *);
void sohasoutofband(struct socket *);
void soisconnected(struct socket *);
void soisconnecting(struct socket *);
void soisdisconnected(struct socket *);
void soisdisconnecting(struct socket *);
int solisten(struct socket *, int, struct lwp *);
struct socket *
sonewconn(struct socket *, bool);
void soqinsque(struct socket *, struct socket *, int);
bool soqremque(struct socket *, int);
int soreceive(struct socket *, struct mbuf **, struct uio *,
struct mbuf **, struct mbuf **, int *);
int soreserve(struct socket *, u_long, u_long);
void sorflush(struct socket *);
int sosend(struct socket *, struct sockaddr *, struct uio *,
struct mbuf *, struct mbuf *, int, struct lwp *);
int sosetopt(struct socket *, struct sockopt *);
int so_setsockopt(struct lwp *, struct socket *, int, int, const void *, size_t);
int soshutdown(struct socket *, int);
void sorestart(struct socket *);
void sowakeup(struct socket *, struct sockbuf *, int);
int sockargs(struct mbuf **, const void *, size_t, enum uio_seg, int);
int sopoll(struct socket *, int);
struct socket *soget(bool);
void soput(struct socket *);
bool solocked(const struct socket *);
bool solocked2(const struct socket *, const struct socket *);
int sblock(struct sockbuf *, int);
void sbunlock(struct sockbuf *);
int sowait(struct socket *, bool, int);
void solockretry(struct socket *, kmutex_t *);
void sosetlock(struct socket *);
void solockreset(struct socket *, kmutex_t *);
void sockopt_init(struct sockopt *, int, int, size_t);
void sockopt_destroy(struct sockopt *);
int sockopt_set(struct sockopt *, const void *, size_t);
int sockopt_setint(struct sockopt *, int);
int sockopt_get(const struct sockopt *, void *, size_t);
int sockopt_getint(const struct sockopt *, int *);
int sockopt_setmbuf(struct sockopt *, struct mbuf *);
struct mbuf *sockopt_getmbuf(const struct sockopt *);
int copyout_sockname(struct sockaddr *, unsigned int *, int, struct mbuf *);
int copyout_sockname_sb(struct sockaddr *, unsigned int *,
int , struct sockaddr_big *);
int copyout_msg_control(struct lwp *, struct msghdr *, struct mbuf *);
void free_control_mbuf(struct lwp *, struct mbuf *, struct mbuf *);
int do_sys_getpeername(int, struct sockaddr *);
int do_sys_getsockname(int, struct sockaddr *);
int do_sys_sendmsg(struct lwp *, int, struct msghdr *, int, register_t *);
int do_sys_sendmsg_so(struct lwp *, int, struct socket *, file_t *,
struct msghdr *, int, register_t *);
int do_sys_recvmsg(struct lwp *, int, struct msghdr *,
struct mbuf **, struct mbuf **, register_t *);
int do_sys_recvmsg_so(struct lwp *, int, struct socket *,
struct msghdr *mp, struct mbuf **, struct mbuf **, register_t *);
int do_sys_bind(struct lwp *, int, struct sockaddr *);
int do_sys_connect(struct lwp *, int, struct sockaddr *);
int do_sys_accept(struct lwp *, int, struct sockaddr *, register_t *,
const sigset_t *, int, int);
int do_sys_peeloff(struct socket *, void *);
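/*
 * Illustrative sketch (not part of the original source): setting a socket
 * option from inside the kernel with the sockopt helpers declared above.
 * SOL_SOCKET/SO_KEEPALIVE come from <sys/socket.h>; the option chosen is
 * only an example.
 */
#if 0
static int
example_set_keepalive(struct socket *so)
{
	struct sockopt sopt;
	int error;

	sockopt_init(&sopt, SOL_SOCKET, SO_KEEPALIVE, sizeof(int));
	error = sockopt_setint(&sopt, 1);
	if (error == 0)
		error = sosetopt(so, &sopt);
	sockopt_destroy(&sopt);
	return error;
}
#endif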
/*
* Inline functions for sockets and socket buffering.
*/
#include <sys/protosw.h>
#include <sys/mbuf.h>
/*
* Do we need to notify the other side when I/O is possible?
*/
static __inline int
sb_notify(struct sockbuf *sb)
{
KASSERT(solocked(sb->sb_so));
return sb->sb_flags & (SB_NOTIFY | SB_ASYNC | SB_UPCALL | SB_KNOTE);
}
/*
* How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
* Since the fields are unsigned, detect overflow and return 0.
*/
static __inline u_long
sbspace(const struct sockbuf *sb)
{
	KASSERT(solocked(sb->sb_so));
	if (sb->sb_hiwat <= sb->sb_cc || sb->sb_mbmax <= sb->sb_mbcnt)
return 0;
return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
}
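/*
 * Illustrative sketch (not part of the original source): checking for room
 * before queueing an mbuf chain on the send buffer.  "so" and "m" are
 * hypothetical, and the real sosend() path does considerably more
 * bookkeeping than this.
 */
#if 0
	KASSERT(solocked(so));
	if (sbspace(&so->so_snd) >= (u_long)m->m_pkthdr.len)
		sbappend(&so->so_snd, m);
	else
		m_freem(m);	/* no room; a real caller might block instead */
#endif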
static __inline u_long
sbspace_oob(const struct sockbuf *sb)
{
u_long hiwat = sb->sb_hiwat;
if (hiwat < ULONG_MAX - 1024)
hiwat += 1024;
	KASSERT(solocked(sb->sb_so));
	if (hiwat <= sb->sb_cc || sb->sb_mbmax <= sb->sb_mbcnt)
return 0;
return lmin(hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
}
/*
* How much socket buffer space has been used?
*/
static __inline u_long
sbused(const struct sockbuf *sb)
{
KASSERT(solocked(sb->sb_so));
return sb->sb_cc;
}
/* do we have to send all at once on a socket? */
static __inline int
sosendallatonce(const struct socket *so)
{
return so->so_proto->pr_flags & PR_ATOMIC;
}
/* can we read something from so? */
static __inline int
soreadable(const struct socket *so)
{
	KASSERT(solocked(so));
	return so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	    (so->so_state & SS_CANTRCVMORE) != 0 ||
	    so->so_qlen != 0 || so->so_error != 0 || so->so_rerror != 0;
}
/* can we write something to so? */
static __inline int
sowritable(const struct socket *so)
{
	KASSERT(solocked(so));
	return (sbspace(&so->so_snd) >= so->so_snd.sb_lowat &&
	    ((so->so_state & SS_ISCONNECTED) != 0 ||
	    (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
	    (so->so_state & SS_CANTSENDMORE) != 0 ||
so->so_error != 0;
}
/* adjust counters in sb reflecting allocation of m */
static __inline void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
KASSERT(solocked(sb->sb_so));
sb->sb_cc += m->m_len;
sb->sb_mbcnt += MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}
/* adjust counters in sb reflecting freeing of m */
static __inline void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
KASSERT(solocked(sb->sb_so));
sb->sb_cc -= m->m_len;
sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
static __inline void
sorwakeup(struct socket *so)
{
	KASSERT(solocked(so));
	if (sb_notify(&so->so_rcv))
		sowakeup(so, &so->so_rcv, POLL_IN);
}
static __inline void
sowwakeup(struct socket *so)
{
	KASSERT(solocked(so));
	if (sb_notify(&so->so_snd))
		sowakeup(so, &so->so_snd, POLL_OUT);
}
static __inline void
solock(struct socket *so)
{
kmutex_t *lock;
lock = atomic_load_consume(&so->so_lock);
mutex_enter(lock);
	if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
		solockretry(so, lock);
}
static __inline void
sounlock(struct socket *so)
{
mutex_exit(so->so_lock);
}
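/*
 * Illustrative sketch (not part of the original source): the usual pattern
 * for inspecting socket state under the socket lock with the inline helpers
 * above; "so" is a hypothetical socket pointer.
 */
#if 0
	solock(so);
	if (soreadable(so)) {
		/* data, an error, or an EOF condition is available */
	}
	sounlock(so);
#endif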
#ifdef SOCKBUF_DEBUG
/*
* SBLASTRECORDCHK: check sb->sb_lastrecord is maintained correctly.
* SBLASTMBUFCHK: check sb->sb_mbtail is maintained correctly.
*
* => panic if the socket buffer is inconsistent.
* => 'where' is used for a panic message.
*/
void sblastrecordchk(struct sockbuf *, const char *);
#define SBLASTRECORDCHK(sb, where) sblastrecordchk((sb), (where))
void sblastmbufchk(struct sockbuf *, const char *);
#define SBLASTMBUFCHK(sb, where) sblastmbufchk((sb), (where))
#define SBCHECK(sb) sbcheck(sb)
#else
#define SBLASTRECORDCHK(sb, where) /* nothing */
#define SBLASTMBUFCHK(sb, where) /* nothing */
#define SBCHECK(sb) /* nothing */
#endif /* SOCKBUF_DEBUG */
/* sosend loan */
vaddr_t sokvaalloc(vaddr_t, vsize_t, struct socket *);
void sokvafree(vaddr_t, vsize_t);
void soloanfree(struct mbuf *, void *, size_t, void *);
/*
* Values for socket-buffer-append priority argument to sbappendaddrchain().
* The following flags are reserved for future implementation:
*
* SB_PRIO_NONE: honour normal socket-buffer limits.
*
* SB_PRIO_ONESHOT_OVERFLOW: if the socket has any space,
* deliver the entire chain. Intended for large requests
* that should be delivered in their entirety, or not at all.
*
* SB_PRIO_OVERDRAFT: allow a small (2*MLEN) overflow, over and
 * above normal socket limits. Intended for messages indicating
 * buffer overflow in earlier normal/lower-priority messages.
*
* SB_PRIO_BESTEFFORT: Ignore limits entirely. Intended only for
 * kernel-generated messages to specially-marked sockets which
 * require "reliable" delivery, and where the source socket/protocol
 * message generator enforces some hard limit (but possibly well
* above kern.sbmax). It is entirely up to the in-kernel source to
* avoid complete mbuf exhaustion or DoS scenarios.
*/
#define SB_PRIO_NONE 0
#define SB_PRIO_ONESHOT_OVERFLOW 1
#define SB_PRIO_OVERDRAFT 2
#define SB_PRIO_BESTEFFORT 3
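/*
 * Illustrative sketch (not part of the original source): appending a chain
 * of records to a receive buffer with normal limits.  "asa" and "m0" are
 * hypothetical (a source address and an mbuf chain); per the comment above,
 * only SB_PRIO_NONE behaviour should be relied upon today.
 */
#if 0
	if (sbappendaddrchain(&so->so_rcv, asa, m0, SB_PRIO_NONE) == 0) {
		/* no buffer space: the chain was not queued; handle m0 here */
	}
#endif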
/*
* Accept filter functions (duh).
*/
int accept_filt_getopt(struct socket *, struct sockopt *);
int accept_filt_setopt(struct socket *, const struct sockopt *);
int accept_filt_clear(struct socket *);
int accept_filt_add(struct accept_filter *);
int accept_filt_del(struct accept_filter *);
struct accept_filter *accept_filt_get(char *);
#ifdef ACCEPT_FILTER_MOD
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_inet_accf);
#endif
void accept_filter_init(void);
#endif
#ifdef DDB
int sofindproc(struct socket *so, int all, void (*pr)(const char *, ...));
void socket_print(const char *modif, void (*pr)(const char *, ...));
#endif
#endif /* _KERNEL */
#endif /* !_SYS_SOCKETVAR_H_ */
/* $NetBSD: ffs_subr.c,v 1.54 2023/01/07 19:41:30 chs Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_subr.c 8.5 (Berkeley) 3/21/95
*/
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_subr.c,v 1.54 2023/01/07 19:41:30 chs Exp $");
#include <sys/param.h>
/* in ffs_tables.c */
extern const int inside[], around[];
extern const u_char * const fragtbl[];
#ifndef _KERNEL
#define FFS_EI /* always include byteswapped filesystems support */
#endif
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#ifndef _KERNEL
#include <ufs/ufs/dinode.h>
void panic(const char *, ...)
__attribute__((__noreturn__,__format__(__printf__,1,2)));
#else /* _KERNEL */
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/buf.h>
#include <sys/inttypes.h>
#include <sys/pool.h>
#include <sys/fstrans.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
/*
* Load up the contents of an inode and copy the appropriate pieces
* to the incore copy.
*/
void
ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino)
{
struct ufs1_dinode *dp1;
struct ufs2_dinode *dp2;
if (ip->i_ump->um_fstype == UFS1) {
dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino);
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs))
ffs_dinode1_swap(dp1, ip->i_din.ffs1_din);
else
#endif
*ip->i_din.ffs1_din = *dp1;
ip->i_mode = ip->i_ffs1_mode;
ip->i_nlink = ip->i_ffs1_nlink;
ip->i_size = ip->i_ffs1_size;
ip->i_flags = ip->i_ffs1_flags;
ip->i_gen = ip->i_ffs1_gen;
ip->i_uid = ip->i_ffs1_uid;
ip->i_gid = ip->i_ffs1_gid;
} else {
dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino);
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs))
ffs_dinode2_swap(dp2, ip->i_din.ffs2_din);
else
#endif
*ip->i_din.ffs2_din = *dp2;
ip->i_mode = ip->i_ffs2_mode;
ip->i_nlink = ip->i_ffs2_nlink;
ip->i_size = ip->i_ffs2_size;
ip->i_flags = ip->i_ffs2_flags;
ip->i_gen = ip->i_ffs2_gen;
ip->i_uid = ip->i_ffs2_uid;
ip->i_gid = ip->i_ffs2_gid;
}
}
int
ffs_getblk(struct vnode *vp, daddr_t lblkno, daddr_t blkno, int size,
bool clearbuf, buf_t **bpp)
{
int error = 0;
	KASSERT(blkno >= 0 || blkno == FFS_NOBLK);
	if ((*bpp = getblk(vp, lblkno, size, 0, 0)) == NULL)
return ENOMEM;
	if (blkno != FFS_NOBLK)
		(*bpp)->b_blkno = blkno;
	if (clearbuf)
		clrbuf(*bpp);
	if ((*bpp)->b_blkno >= 0 && (error = fscow_run(*bpp, false)) != 0) {
		brelse(*bpp, BC_INVAL);
*bpp = NULL;
}
return error;
}
#endif /* _KERNEL */
/*
* Update the frsum fields to reflect addition or deletion
* of some frags.
*/
void
ffs_fragacct(struct fs *fs, int fragmap, uint32_t fraglist[], int cnt,
int needswap)
{
int inblk;
int field, subfield;
int siz, pos;
inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
fragmap <<= 1;
	for (siz = 1; siz < fs->fs_frag; siz++) {
		if ((inblk & (1 << (siz + (fs->fs_frag & (NBBY - 1))))) == 0)
continue;
field = around[siz];
subfield = inside[siz];
		for (pos = siz; pos <= fs->fs_frag; pos++) {
			if ((fragmap & field) == subfield) {
fraglist[siz] = ufs_rw32(
ufs_rw32(fraglist[siz], needswap) + cnt,
needswap);
pos += siz;
field <<= siz;
subfield <<= siz;
}
field <<= 1;
subfield <<= 1;
}
}
}
/*
* block operations
*
* check if a block is available
* returns true if all the corresponding bits in the free map are 1
* returns false if any corresponding bit in the free map is 0
*/
int
ffs_isblock(struct fs *fs, u_char *cp, int32_t h)
{
u_char mask;
switch ((int)fs->fs_fragshift) {
case 3:
return (cp[h] == 0xff);
case 2:
mask = 0x0f << ((h & 0x1) << 2);
return ((cp[h >> 1] & mask) == mask);
case 1:
mask = 0x03 << ((h & 0x3) << 1);
return ((cp[h >> 2] & mask) == mask);
case 0:
mask = 0x01 << (h & 0x7);
return ((cp[h >> 3] & mask) == mask);
default:
panic("%s: unknown fs_fragshift %d", __func__,
(int)fs->fs_fragshift);
}
}
/*
* check if a block is completely allocated
* returns true if all the corresponding bits in the free map are 0
* returns false if any corresponding bit in the free map is 1
*/
int
ffs_isfreeblock(struct fs *fs, u_char *cp, int32_t h)
{
switch ((int)fs->fs_fragshift) {
case 3:
return (cp[h] == 0);
case 2:
return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
case 1:
return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
case 0:
return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
default:
panic("%s: unknown fs_fragshift %d", __func__,
(int)fs->fs_fragshift);
}
}
/*
* take a block out of the map
*/
void
ffs_clrblock(struct fs *fs, u_char *cp, int32_t h)
{
switch ((int)fs->fs_fragshift) {
case 3:
cp[h] = 0;
return;
case 2:
cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
return;
case 1:
cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
return;
case 0:
cp[h >> 3] &= ~(0x01 << (h & 0x7));
return;
default:
panic("%s: unknown fs_fragshift %d", __func__,
(int)fs->fs_fragshift);
}
}
/*
* put a block into the map
*/
void
ffs_setblock(struct fs *fs, u_char *cp, int32_t h)
{
switch ((int)fs->fs_fragshift) {
case 3:
cp[h] = 0xff;
return;
case 2:
cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
return;
case 1:
cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
return;
case 0:
cp[h >> 3] |= (0x01 << (h & 0x7));
return;
default:
panic("%s: unknown fs_fragshift %d", __func__,
(int)fs->fs_fragshift);
}
}
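/*
 * Illustrative sketch (not part of the original source): with fs_fragshift
 * == 1 there are two fragments per block, so block h occupies two bits of
 * the free map starting at bit (h & 0x3) << 1 of byte h >> 2.  A cylinder
 * group scan might use the helpers above like this; cg_blksfree() is
 * assumed to be the fs.h accessor for the fragment free map, and "cgp",
 * "needswap" and "h" are hypothetical locals.
 */
#if 0
	u_char *blksfree = cg_blksfree(cgp, needswap);

	if (ffs_isblock(fs, blksfree, h))
		ffs_clrblock(fs, blksfree, h);	/* take the whole block */
#endif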
/*
* Update the cluster map because of an allocation or free.
*
* Cnt == 1 means free; cnt == -1 means allocating.
*/
void
ffs_clusteracct(struct fs *fs, struct cg *cgp, int32_t blkno, int cnt)
{
int32_t *sump;
int32_t *lp;
u_char *freemapp, *mapp;
int i, start, end, forw, back, map;
unsigned int bit;
const int needswap = UFS_FSNEEDSWAP(fs);
/* KASSERT(mutex_owned(&ump->um_lock)); */
if (fs->fs_contigsumsize <= 0)
return;
freemapp = cg_clustersfree(cgp, needswap);
sump = cg_clustersum(cgp, needswap);
/*
* Allocate or clear the actual block.
*/
if (cnt > 0)
setbit(freemapp, blkno);
else
clrbit(freemapp, blkno);
/*
* Find the size of the cluster going forward.
*/
start = blkno + 1;
end = start + fs->fs_contigsumsize;
	if ((uint32_t)end >= ufs_rw32(cgp->cg_nclusterblks, needswap))
		end = ufs_rw32(cgp->cg_nclusterblks, needswap);
mapp = &freemapp[start / NBBY];
map = *mapp++;
bit = 1U << ((unsigned int)start % NBBY);
for (i = start; i < end; i++) {
if ((map & bit) == 0)
break;
if ((i & (NBBY - 1)) != (NBBY - 1)) {
bit <<= 1;
} else {
map = *mapp++;
bit = 1;
}
}
forw = i - start;
/*
* Find the size of the cluster going backward.
*/
start = blkno - 1;
end = start - fs->fs_contigsumsize;
if (end < 0)
end = -1;
mapp = &freemapp[start / NBBY];
map = *mapp--;
bit = 1U << ((unsigned int)start % NBBY);
for (i = start; i > end; i--) {
if ((map & bit) == 0)
break;
if ((i & (NBBY - 1)) != 0) {
bit >>= 1;
} else {
map = *mapp--;
bit = 1U << (NBBY - 1);
}
}
back = start - i;
/*
* Account for old cluster and the possibly new forward and
* back clusters.
*/
i = back + forw + 1;
if (i > fs->fs_contigsumsize)
i = fs->fs_contigsumsize;
ufs_add32(sump[i], cnt, needswap);
if (back > 0)
ufs_add32(sump[back], -cnt, needswap);
	if (forw > 0)
		ufs_add32(sump[forw], -cnt, needswap);
/*
* Update cluster summary information.
*/
lp = &sump[fs->fs_contigsumsize];
	for (i = fs->fs_contigsumsize; i > 0; i--)
		if (ufs_rw32(*lp--, needswap) > 0)
break;
#if defined(_KERNEL)
fs->fs_maxcluster[ufs_rw32(cgp->cg_cgx, needswap)] = i;
#endif
}
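/*
 * Illustrative sketch (not part of the original source): keeping the
 * cluster summary in step with the block map.  "blkno" is the block number
 * within the cylinder group's cluster map (a hypothetical local).
 */
#if 0
	ffs_clusteracct(fs, cgp, blkno, 1);	/* a full block was freed */
	ffs_clusteracct(fs, cgp, blkno, -1);	/* a full block was allocated */
#endif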
/* $NetBSD: scsipi_ioctl.c,v 1.73 2019/12/27 09:41:51 msaitoh Exp $ */
/*-
* Copyright (c) 1998, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Contributed by HD Associates (hd@world.std.com).
* Copyright (c) 1992, 1993 HD Associates
*
* Berkeley style copyright.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scsipi_ioctl.c,v 1.73 2019/12/27 09:41:51 msaitoh Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_freebsd.h"
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/device.h>
#include <sys/fcntl.h>
#include <dev/scsipi/scsipi_all.h>
#include <dev/scsipi/scsipiconf.h>
#include <dev/scsipi/scsipi_base.h>
#include <dev/scsipi/scsiconf.h>
#include <sys/scsiio.h>
#include "scsibus.h"
#include "atapibus.h"
struct scsi_ioctl {
LIST_ENTRY(scsi_ioctl) si_list;
struct buf si_bp;
struct uio si_uio;
struct iovec si_iov;
scsireq_t si_screq;
struct scsipi_periph *si_periph;
};
static LIST_HEAD(, scsi_ioctl) si_head;
static kmutex_t si_lock;
void
scsipi_ioctl_init(void)
{
mutex_init(&si_lock, MUTEX_DEFAULT, IPL_BIO);
}
static struct scsi_ioctl *
si_get(void)
{
struct scsi_ioctl *si;
si = malloc(sizeof(struct scsi_ioctl), M_TEMP, M_WAITOK|M_ZERO);
buf_init(&si->si_bp);
mutex_enter(&si_lock);
LIST_INSERT_HEAD(&si_head, si, si_list);
mutex_exit(&si_lock);
return (si);
}
static void
si_free(struct scsi_ioctl *si)
{
mutex_enter(&si_lock);
LIST_REMOVE(si, si_list);
mutex_exit(&si_lock);
buf_destroy(&si->si_bp);
free(si, M_TEMP);
}
static struct scsi_ioctl *
si_find(struct buf *bp)
{
struct scsi_ioctl *si;
mutex_enter(&si_lock);
for (si = si_head.lh_first; si != 0; si = si->si_list.le_next)
if (bp == &si->si_bp)
break;
mutex_exit(&si_lock);
return (si);
}
/*
 * We let the user interpret their own sense data in the generic SCSI world.
* This routine is called at interrupt time if the XS_CTL_USERCMD bit was set
* in the flags passed to scsi_scsipi_cmd(). No other completion processing
* takes place, even if we are running over another device driver.
* The lower level routines that call us here, will free the xs and restart
* the device's queue if such exists.
*/
void
scsipi_user_done(struct scsipi_xfer *xs)
{
struct buf *bp;
struct scsi_ioctl *si;
scsireq_t *screq;
struct scsipi_periph *periph = xs->xs_periph;
bp = xs->bp;
#ifdef DIAGNOSTIC
if (bp == NULL) {
scsipi_printaddr(periph);
printf("user command with no buf\n");
panic("scsipi_user_done");
}
#endif
si = si_find(bp);
#ifdef DIAGNOSTIC
if (si == NULL) {
scsipi_printaddr(periph);
printf("user command with no ioctl\n");
panic("scsipi_user_done");
}
#endif
screq = &si->si_screq;
SC_DEBUG(xs->xs_periph, SCSIPI_DB2, ("user-done\n"));
screq->retsts = 0;
screq->status = xs->status;
switch (xs->error) {
case XS_NOERROR:
SC_DEBUG(periph, SCSIPI_DB3, ("no error\n"));
screq->datalen_used =
xs->datalen - xs->resid; /* probably rubbish */
screq->retsts = SCCMD_OK;
break;
case XS_SENSE:
SC_DEBUG(periph, SCSIPI_DB3, ("have sense\n"));
screq->senselen_used = uimin(sizeof(xs->sense.scsi_sense),
SENSEBUFLEN);
memcpy(screq->sense, &xs->sense.scsi_sense,
screq->senselen_used);
screq->retsts = SCCMD_SENSE;
break;
case XS_SHORTSENSE:
SC_DEBUG(periph, SCSIPI_DB3, ("have short sense\n"));
screq->senselen_used = uimin(sizeof(xs->sense.atapi_sense),
SENSEBUFLEN);
memcpy(screq->sense, &xs->sense.atapi_sense,
screq->senselen_used);
screq->retsts = SCCMD_UNKNOWN; /* XXX need a shortsense here */
break;
case XS_DRIVER_STUFFUP:
scsipi_printaddr(periph);
printf("passthrough: adapter inconsistency\n");
screq->retsts = SCCMD_UNKNOWN;
break;
case XS_SELTIMEOUT:
SC_DEBUG(periph, SCSIPI_DB3, ("seltimeout\n"));
screq->retsts = SCCMD_TIMEOUT;
break;
case XS_TIMEOUT:
SC_DEBUG(periph, SCSIPI_DB3, ("timeout\n"));
screq->retsts = SCCMD_TIMEOUT;
break;
case XS_BUSY:
SC_DEBUG(periph, SCSIPI_DB3, ("busy\n"));
screq->retsts = SCCMD_BUSY;
break;
default:
scsipi_printaddr(periph);
printf("unknown error category %d from adapter\n",
xs->error);
screq->retsts = SCCMD_UNKNOWN;
break;
}
if (xs->xs_control & XS_CTL_ASYNC) {
mutex_enter(chan_mtx(periph->periph_channel));
scsipi_put_xs(xs);
mutex_exit(chan_mtx(periph->periph_channel));
}
}
/* Pseudo strategy function
* Called by scsipi_do_ioctl() via physio/physstrat if there is to
* be data transferred, and directly if there is no data transfer.
*
* Should I reorganize this so it returns to physio instead
* of sleeping in scsiio_scsipi_cmd? Is there any advantage, other
* than avoiding the probable duplicate wakeup in iodone? [PD]
*
* No, seems ok to me... [JRE]
* (I don't see any duplicate wakeups)
*
* Can't be used with block devices or raw_read/raw_write directly
* from the cdevsw/bdevsw tables because they couldn't have added
* the screq structure. [JRE]
*/
static void
scsistrategy(struct buf *bp)
{
struct scsi_ioctl *si;
scsireq_t *screq;
struct scsipi_periph *periph;
int error;
int flags = 0;
si = si_find(bp);
if (si == NULL) {
printf("scsistrategy: "
"No matching ioctl request found in queue\n");
error = EINVAL;
goto done;
}
screq = &si->si_screq;
periph = si->si_periph;
SC_DEBUG(periph, SCSIPI_DB2, ("user_strategy\n"));
/*
* We're in trouble if physio tried to break up the transfer.
*/
if (bp->b_bcount != screq->datalen) {
scsipi_printaddr(periph);
printf("physio split the request.. cannot proceed\n");
error = EIO;
goto done;
}
if (screq->timeout == 0) {
error = EINVAL;
goto done;
}
if (screq->cmdlen > sizeof(struct scsipi_generic)) {
scsipi_printaddr(periph);
printf("cmdlen too big\n");
error = EFAULT;
goto done;
}
if ((screq->flags & SCCMD_READ) && screq->datalen > 0)
flags |= XS_CTL_DATA_IN;
if ((screq->flags & SCCMD_WRITE) && screq->datalen > 0)
flags |= XS_CTL_DATA_OUT;
if (screq->flags & SCCMD_TARGET)
flags |= XS_CTL_TARGET;
if (screq->flags & SCCMD_ESCAPE)
flags |= XS_CTL_ESCAPE;
error = scsipi_command(periph, (void *)screq->cmd, screq->cmdlen,
(void *)bp->b_data, screq->datalen,
0, /* user must do the retries *//* ignored */
screq->timeout, bp, flags | XS_CTL_USERCMD);
done:
if (error)
bp->b_resid = bp->b_bcount;
bp->b_error = error;
biodone(bp);
return;
}
/*
* Something (e.g. another driver) has called us
* with a periph and a scsi-specific ioctl to perform,
* better try. If user-level type command, we must
* still be running in the context of the calling process
*/
int
scsipi_do_ioctl(struct scsipi_periph *periph, dev_t dev, u_long cmd,
void *addr, int flag, struct lwp *l)
{
int error;
SC_DEBUG(periph, SCSIPI_DB2, ("scsipi_do_ioctl(0x%lx)\n", cmd));
if (addr == NULL)
return EINVAL;
/* Check for the safe-ness of this request. */
switch (cmd) {
case OSCIOCIDENTIFY:
case SCIOCIDENTIFY:
break;
case SCIOCCOMMAND:
if ((((scsireq_t *)addr)->flags & SCCMD_READ) == 0 &&
(flag & FWRITE) == 0)
return (EBADF);
break;
default:
if ((flag & FWRITE) == 0)
return (EBADF);
}
switch (cmd) {
case SCIOCCOMMAND: {
scsireq_t *screq = (scsireq_t *)addr;
struct scsi_ioctl *si;
int len;
len = screq->datalen;
/*
* If there is data, there must be a data buffer and a direction specified
*/
if (len > 0 && (screq->databuf == NULL ||
(screq->flags & (SCCMD_READ|SCCMD_WRITE)) == 0))
return (EINVAL);
si = si_get();
si->si_screq = *screq;
si->si_periph = periph;
if (len) {
si->si_iov.iov_base = screq->databuf;
si->si_iov.iov_len = len;
si->si_uio.uio_iov = &si->si_iov;
si->si_uio.uio_iovcnt = 1;
si->si_uio.uio_resid = len;
si->si_uio.uio_offset = 0;
si->si_uio.uio_rw =
(screq->flags & SCCMD_READ) ? UIO_READ : UIO_WRITE;
if ((flag & FKIOCTL) == 0) {
si->si_uio.uio_vmspace = l->l_proc->p_vmspace;
} else {
UIO_SETUP_SYSSPACE(&si->si_uio);
}
error = physio(scsistrategy, &si->si_bp, dev,
(screq->flags & SCCMD_READ) ? B_READ : B_WRITE,
periph->periph_channel->chan_adapter->adapt_minphys,
&si->si_uio);
} else {
/* if no data, no need to translate it.. */
si->si_bp.b_flags = 0;
si->si_bp.b_data = 0;
si->si_bp.b_bcount = 0;
si->si_bp.b_dev = dev;
si->si_bp.b_proc = l->l_proc;
scsistrategy(&si->si_bp);
error = si->si_bp.b_error;
}
*screq = si->si_screq;
si_free(si);
return (error);
}
case SCIOCDEBUG: {
int level = *((int *)addr);
SC_DEBUG(periph, SCSIPI_DB3, ("debug set to %d\n", level));
periph->periph_dbflags = 0;
if (level & 1)
periph->periph_dbflags |= SCSIPI_DB1;
if (level & 2)
periph->periph_dbflags |= SCSIPI_DB2;
if (level & 4)
periph->periph_dbflags |= SCSIPI_DB3;
if (level & 8)
periph->periph_dbflags |= SCSIPI_DB4;
return (0);
}
case SCIOCRECONFIG:
case SCIOCDECONFIG:
return (EINVAL);
case SCIOCIDENTIFY: {
struct scsi_addr *sca = (struct scsi_addr *)addr;
switch (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(periph))) {
case SCSIPI_BUSTYPE_SCSI:
sca->type = TYPE_SCSI;
sca->addr.scsi.scbus =
device_unit(device_parent(periph->periph_dev));
sca->addr.scsi.target = periph->periph_target;
sca->addr.scsi.lun = periph->periph_lun;
return (0);
case SCSIPI_BUSTYPE_ATAPI:
sca->type = TYPE_ATAPI;
sca->addr.atapi.atbus =
device_unit(device_parent(periph->periph_dev));
sca->addr.atapi.drive = periph->periph_target;
return (0);
}
return (ENXIO);
}
#if defined(COMPAT_12) || defined(COMPAT_FREEBSD)
/* SCIOCIDENTIFY from before the ATAPI stuff was merged */
case OSCIOCIDENTIFY: {
struct oscsi_addr *sca = (struct oscsi_addr *)addr;
switch (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(periph))) {
case SCSIPI_BUSTYPE_SCSI:
sca->scbus =
device_unit(device_parent(periph->periph_dev));
sca->target = periph->periph_target;
sca->lun = periph->periph_lun;
return (0);
}
return (ENODEV);
}
#endif
default:
return (ENOTTY);
}
#ifdef DIAGNOSTIC
panic("scsipi_do_ioctl: impossible");
#endif
}
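/*
* A minimal userland sketch (kept under #if 0) of driving the
* SCIOCCOMMAND path above: issue a SCSI INQUIRY through the
* passthrough interface. An already-open device file descriptor, the
* <sys/scsiio.h> header name and the INQUIRY CDB layout are
* assumptions of this example; only the scsireq_t fields used by the
* code above are touched.
*/
#if 0
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/scsiio.h>
#include <string.h>

static int
do_inquiry(int fd, void *buf, size_t len)
{
	scsireq_t req;

	memset(&req, 0, sizeof(req));
	req.cmd[0] = 0x12;		/* INQUIRY, 6-byte CDB */
	req.cmd[4] = (u_char)len;	/* allocation length, <= 255 */
	req.cmdlen = 6;
	req.databuf = buf;
	req.datalen = len;
	req.flags = SCCMD_READ;		/* data moves device -> host */
	req.timeout = 10000;		/* milliseconds */

	if (ioctl(fd, SCIOCCOMMAND, &req) == -1)
		return -1;
	/* req.retsts reports the command status (e.g. SCCMD_TIMEOUT). */
	return 0;
}
#endif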
/* $NetBSD: lfs_accessors.h,v 1.51 2022/04/24 20:32:44 rillig Exp $ */
/* from NetBSD: lfs.h,v 1.165 2015/07/24 06:59:32 dholland Exp */
/* from NetBSD: dinode.h,v 1.25 2016/01/22 23:06:10 dholland Exp */
/* from NetBSD: dir.h,v 1.25 2015/09/01 06:16:03 dholland Exp */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Konrad E. Schroder <perseant@hhhh.org>.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)lfs.h 8.9 (Berkeley) 5/8/95
*/
/*
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Marshall
* Kirk McKusick and Network Associates Laboratories, the Security
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
* research program
*
* Copyright (c) 1982, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)dinode.h 8.9 (Berkeley) 3/29/95
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)dir.h 8.5 (Berkeley) 4/27/95
*/
#ifndef _UFS_LFS_LFS_ACCESSORS_H_
#define _UFS_LFS_LFS_ACCESSORS_H_
#if defined(_KERNEL_OPT)
#include "opt_lfs.h"
#endif
#include <sys/bswap.h>
#include <ufs/lfs/lfs.h>
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <assert.h>
#include <string.h>
#define KASSERT assert
#else
#include <sys/systm.h>
#endif
/*
* STRUCT_LFS is used by the libsa code to get accessors that work
* with struct salfs instead of struct lfs, and by the cleaner to
* get accessors that work with struct clfs.
*/
#ifndef STRUCT_LFS
#define STRUCT_LFS struct lfs
#endif
/*
* byte order
*/
/*
* For now at least, the bootblocks shall not be endian-independent.
* We can see later if it fits in the size budget. Also disable the
* byteswapping if LFS_EI is off.
*
* Caution: these functions "know" that bswap16/32/64 are unsigned,
* and will likely break silently if that changes.
*/
#if defined(_STANDALONE) || (defined(_KERNEL) && !defined(LFS_EI))
#define LFS_SWAP_int16_t(fs, val) (val)
#define LFS_SWAP_int32_t(fs, val) (val)
#define LFS_SWAP_int64_t(fs, val) (val)
#define LFS_SWAP_uint16_t(fs, val) (val)
#define LFS_SWAP_uint32_t(fs, val) (val)
#define LFS_SWAP_uint64_t(fs, val) (val)
#else
#define LFS_SWAP_int16_t(fs, val) \
((fs)->lfs_dobyteswap ? (int16_t)bswap16(val) : (val))
#define LFS_SWAP_int32_t(fs, val) \
((fs)->lfs_dobyteswap ? (int32_t)bswap32(val) : (val))
#define LFS_SWAP_int64_t(fs, val) \
((fs)->lfs_dobyteswap ? (int64_t)bswap64(val) : (val))
#define LFS_SWAP_uint16_t(fs, val) \
((fs)->lfs_dobyteswap ? bswap16(val) : (val))
#define LFS_SWAP_uint32_t(fs, val) \
((fs)->lfs_dobyteswap ? bswap32(val) : (val))
#define LFS_SWAP_uint64_t(fs, val) \
((fs)->lfs_dobyteswap ? bswap64(val) : (val))
#endif
/*
* For handling directories we will need to know if the volume is
* little-endian.
*/
#if BYTE_ORDER == LITTLE_ENDIAN
#define LFS_LITTLE_ENDIAN_ONDISK(fs) (!(fs)->lfs_dobyteswap)
#else
#define LFS_LITTLE_ENDIAN_ONDISK(fs) ((fs)->lfs_dobyteswap)
#endif
/*
* Suppress spurious warnings -- we use
*
* type *foo = &obj->member;
*
* in macros to verify that obj->member has the right type. When the
* object is a packed structure with misaligned members, this causes
* some compilers to squeal that taking the address might lead to
* undefined behaviour later on -- which is helpful in general, but not
* relevant in this case, because we don't do anything with foo
* afterward; we only declare it to get a type check and then we
* discard it.
*/
#ifdef __GNUC__
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Waddress-of-packed-member"
#elif __GNUC_PREREQ__(9,0)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
#endif
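/*
* As an illustration of that pattern, the setter generated further
* below by LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, uid) contains
* roughly:
*
*	uint32_t *p = &dip->u_32.di_uid;	(type check only)
*	(void)p;
*	dip->u_32.di_uid = LFS_SWAP_uint32_t(fs, val);
*
* With a packed dinode structure, the first line is what provokes
* -Waddress-of-packed-member, hence the pragmas above.
*/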
/*
* directories
*/
#define LFS_DIRHEADERSIZE(fs) \
((fs)->lfs_is64 ? sizeof(struct lfs_dirheader64) : sizeof(struct lfs_dirheader32))
/*
* The LFS_DIRSIZ macro gives the minimum record length which will hold
* the directory entry. This requires the amount of space in struct lfs_direct
* without the d_name field, plus enough space for the name with a terminating
* null byte (dp->d_namlen+1), rounded up to a 4 byte boundary.
*/
#define LFS_DIRECTSIZ(fs, namlen) \
(LFS_DIRHEADERSIZE(fs) + (((namlen)+1 + 3) &~ 3))
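/*
* For illustration: on a 32-bit (lfs32) volume, an entry whose name is
* 11 bytes long needs sizeof(struct lfs_dirheader32) plus
* roundup(11 + 1, 4) = 12 bytes. The entry's actual record length
* (dh_reclen) may be larger, since an entry also absorbs any free
* space that follows it within its directory block.
*/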
/*
* The size of the largest possible directory entry. This is
* used by ulfs_dirhash to figure the size of an array, so we
* need a single constant value true for both lfs32 and lfs64.
*/
#define LFS_MAXDIRENTRYSIZE \
(sizeof(struct lfs_dirheader64) + (((LFS_MAXNAMLEN+1)+1 + 3) & ~3))
#if (BYTE_ORDER == LITTLE_ENDIAN)
#define LFS_OLDDIRSIZ(oldfmt, dp, needswap) \
(((oldfmt) && !(needswap)) ? \
LFS_DIRECTSIZ((dp)->d_type) : LFS_DIRECTSIZ((dp)->d_namlen))
#else
#define LFS_OLDDIRSIZ(oldfmt, dp, needswap) \
(((oldfmt) && (needswap)) ? \
LFS_DIRECTSIZ((dp)->d_type) : LFS_DIRECTSIZ((dp)->d_namlen))
#endif
#define LFS_DIRSIZ(fs, dp) LFS_DIRECTSIZ(fs, lfs_dir_getnamlen(fs, dp))
/* Constants for the first argument of LFS_OLDDIRSIZ */
#define LFS_OLDDIRFMT 1
#define LFS_NEWDIRFMT 0
#define LFS_NEXTDIR(fs, dp) \
((LFS_DIRHEADER *)((char *)(dp) + lfs_dir_getreclen(fs, dp)))
static __inline char *
lfs_dir_nameptr(const STRUCT_LFS *fs, LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
return (char *)(&dh->u_64 + 1);
} else {
return (char *)(&dh->u_32 + 1);
}
}
static __inline uint64_t
lfs_dir_getino(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
return LFS_SWAP_uint64_t(fs, dh->u_64.dh_ino);
} else {
return LFS_SWAP_uint32_t(fs, dh->u_32.dh_ino);
}
}
static __inline uint16_t
lfs_dir_getreclen(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
return LFS_SWAP_uint16_t(fs, dh->u_64.dh_reclen);
} else {
return LFS_SWAP_uint16_t(fs, dh->u_32.dh_reclen);
}
}
static __inline uint8_t
lfs_dir_gettype(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
KASSERT(fs->lfs_hasolddirfmt == 0);
return dh->u_64.dh_type;
} else if (fs->lfs_hasolddirfmt) {
return LFS_DT_UNKNOWN;
} else {
return dh->u_32.dh_type;
}
}
static __inline uint8_t
lfs_dir_getnamlen(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
KASSERT(fs->lfs_hasolddirfmt == 0);
return dh->u_64.dh_namlen;
} else if (fs->lfs_hasolddirfmt && LFS_LITTLE_ENDIAN_ONDISK(fs)) {
/* low-order byte of old 16-bit namlen field */
return dh->u_32.dh_type;
} else {
return dh->u_32.dh_namlen;
}
}
static __inline void
lfs_dir_setino(STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint64_t ino)
{
if (fs->lfs_is64) {
dh->u_64.dh_ino = LFS_SWAP_uint64_t(fs, ino);
} else {
dh->u_32.dh_ino = LFS_SWAP_uint32_t(fs, ino);
}
}
static __inline void
lfs_dir_setreclen(STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint16_t reclen)
{
if (fs->lfs_is64) {
dh->u_64.dh_reclen = LFS_SWAP_uint16_t(fs, reclen);
} else {
dh->u_32.dh_reclen = LFS_SWAP_uint16_t(fs, reclen);
}
}
static __inline void
lfs_dir_settype(const STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint8_t type)
{
if (fs->lfs_is64) {
KASSERT(fs->lfs_hasolddirfmt == 0);
dh->u_64.dh_type = type;
} else if (fs->lfs_hasolddirfmt) {
/* do nothing */
return;
} else {
dh->u_32.dh_type = type;
}
}
static __inline void
lfs_dir_setnamlen(const STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint8_t namlen)
{
if (fs->lfs_is64) {
KASSERT(fs->lfs_hasolddirfmt == 0);
dh->u_64.dh_namlen = namlen;
} else if (fs->lfs_hasolddirfmt && LFS_LITTLE_ENDIAN_ONDISK(fs)) {
/* low-order byte of old 16-bit namlen field */
dh->u_32.dh_type = namlen;
} else {
dh->u_32.dh_namlen = namlen;
}
}
static __inline void
lfs_copydirname(STRUCT_LFS *fs, char *dest, const char *src,
unsigned namlen, unsigned reclen)
{
unsigned spacelen;
KASSERT(reclen > LFS_DIRHEADERSIZE(fs));
spacelen = reclen - LFS_DIRHEADERSIZE(fs);
/* there must always be at least one spare byte for the NUL terminator */
KASSERT(spacelen > namlen);
memcpy(dest, src, namlen);
memset(dest + namlen, '\0', spacelen - namlen);
}
static __inline LFS_DIRHEADER *
lfs_dirtemplate_dotdot(STRUCT_LFS *fs, union lfs_dirtemplate *dt)
{
/* XXX blah, be nice to have a way to do this w/o casts */
if (fs->lfs_is64) {
return (LFS_DIRHEADER *)&dt->u_64.dotdot_header;
} else {
return (LFS_DIRHEADER *)&dt->u_32.dotdot_header;
}
}
static __inline char *
lfs_dirtemplate_dotdotname(STRUCT_LFS *fs, union lfs_dirtemplate *dt)
{
if (fs->lfs_is64) {
return dt->u_64.dotdot_name;
} else {
return dt->u_32.dotdot_name;
}
}
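/*
* A minimal sketch (kept under #if 0) of walking the entries in one
* directory block with the accessors above. "dirbuf" and "dirblksiz"
* are assumed to be supplied by the caller; names of live entries are
* NUL-terminated because lfs_copydirname() always pads with zeros.
*/
#if 0
static void
lfs_dir_walk_example(STRUCT_LFS *fs, void *dirbuf, unsigned dirblksiz)
{
	LFS_DIRHEADER *dh = dirbuf;
	char *end = (char *)dirbuf + dirblksiz;

	while ((char *)dh < end) {
		if (lfs_dir_getino(fs, dh) != 0)
			printf("ino %ju name %s\n",
			    (uintmax_t)lfs_dir_getino(fs, dh),
			    lfs_dir_nameptr(fs, dh));
		if (lfs_dir_getreclen(fs, dh) == 0)
			break;		/* corrupt block; don't loop forever */
		dh = LFS_NEXTDIR(fs, dh);
	}
}
#endif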
/*
* dinodes
*/
/*
* Maximum length of a symlink that can be stored within the inode.
*/
#define LFS32_MAXSYMLINKLEN ((ULFS_NDADDR + ULFS_NIADDR) * sizeof(int32_t))
#define LFS64_MAXSYMLINKLEN ((ULFS_NDADDR + ULFS_NIADDR) * sizeof(int64_t))
#define LFS_MAXSYMLINKLEN(fs) \
((fs)->lfs_is64 ? LFS64_MAXSYMLINKLEN : LFS32_MAXSYMLINKLEN)
#define DINOSIZE(fs) ((fs)->lfs_is64 ? sizeof(struct lfs64_dinode) : sizeof(struct lfs32_dinode))
#define DINO_IN_BLOCK(fs, base, ix) \
((union lfs_dinode *)((char *)(base) + DINOSIZE(fs) * (ix)))
static __inline void
lfs_copy_dinode(STRUCT_LFS *fs,
union lfs_dinode *dst, const union lfs_dinode *src)
{
/*
* We can do structure assignment of the structs, but not of
* the whole union, as the union is the size of the (larger)
* 64-bit struct and on a 32-bit fs the upper half of it might
* be off the end of a buffer or otherwise invalid.
*/
if (fs->lfs_is64) {
dst->u_64 = src->u_64;
} else {
dst->u_32 = src->u_32;
}
}
#define LFS_DEF_DINO_ACCESSOR(type, type32, field) \
static __inline type \
lfs_dino_get##field(STRUCT_LFS *fs, union lfs_dinode *dip) \
{ \
if (fs->lfs_is64) { \
return LFS_SWAP_##type(fs, dip->u_64.di_##field); \
} else { \
return LFS_SWAP_##type32(fs, dip->u_32.di_##field); \
} \
} \
static __inline void \
lfs_dino_set##field(STRUCT_LFS *fs, union lfs_dinode *dip, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &dip->u_64.di_##field; \
(void)p; \
dip->u_64.di_##field = LFS_SWAP_##type(fs, val); \
} else { \
type32 *p = &dip->u_32.di_##field; \
(void)p; \
dip->u_32.di_##field = LFS_SWAP_##type32(fs, val); \
} \
}
LFS_DEF_DINO_ACCESSOR(uint16_t, uint16_t, mode)
LFS_DEF_DINO_ACCESSOR(int16_t, int16_t, nlink)
LFS_DEF_DINO_ACCESSOR(uint64_t, uint32_t, inumber)
LFS_DEF_DINO_ACCESSOR(uint64_t, uint64_t, size)
LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, atime)
LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, atimensec)
LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, mtime)
LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, mtimensec)
LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, ctime)
LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, ctimensec)
LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, flags)
LFS_DEF_DINO_ACCESSOR(uint64_t, uint32_t, blocks)
LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, gen)
LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, uid)
LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, gid)
/* XXX this should be done differently (it's a fake field) */
LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, rdev)
static __inline daddr_t
lfs_dino_getdb(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix)
{
KASSERT(ix < ULFS_NDADDR);
if (fs->lfs_is64) {
return LFS_SWAP_int64_t(fs, dip->u_64.di_db[ix]);
} else {
/* note: this must sign-extend or UNWRITTEN gets trashed */
return (int32_t)LFS_SWAP_int32_t(fs, dip->u_32.di_db[ix]);
}
}
static __inline daddr_t
lfs_dino_getib(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix)
{
KASSERT(ix < ULFS_NIADDR);
if (fs->lfs_is64) {
return LFS_SWAP_int64_t(fs, dip->u_64.di_ib[ix]);
} else {
/* note: this must sign-extend or UNWRITTEN gets trashed */
return (int32_t)LFS_SWAP_int32_t(fs, dip->u_32.di_ib[ix]);
}
}
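/*
* The sign extension in the 32-bit cases above matters because block
* addresses such as UNWRITTEN are negative sentinel values: read back
* as unsigned 32-bit quantities they would turn into huge positive
* daddr_t values instead of the same negative numbers.
*/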
static __inline void
lfs_dino_setdb(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix, daddr_t val)
{
KASSERT(ix < ULFS_NDADDR);
if (fs->lfs_is64) {
dip->u_64.di_db[ix] = LFS_SWAP_int64_t(fs, val);
} else {
dip->u_32.di_db[ix] = LFS_SWAP_uint32_t(fs, val);
}
}
static __inline void
lfs_dino_setib(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix, daddr_t val)
{
KASSERT(ix < ULFS_NIADDR);
if (fs->lfs_is64) {
dip->u_64.di_ib[ix] = LFS_SWAP_int64_t(fs, val);
} else {
dip->u_32.di_ib[ix] = LFS_SWAP_uint32_t(fs, val);
}
}
/* birthtime is present only in the 64-bit inode */
static __inline void
lfs_dino_setbirthtime(STRUCT_LFS *fs, union lfs_dinode *dip,
const struct timespec *ts)
{
if (fs->lfs_is64) {
dip->u_64.di_birthtime = ts->tv_sec;
dip->u_64.di_birthnsec = ts->tv_nsec;
} else {
/* drop it on the floor */
}
}
/*
* indirect blocks
*/
static __inline daddr_t
lfs_iblock_get(STRUCT_LFS *fs, void *block, unsigned ix)
{
if (fs->lfs_is64) {
// XXX re-enable these asserts after reorging this file
//KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int64_t));
return (daddr_t)(((int64_t *)block)[ix]);
} else {
//KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int32_t));
/* must sign-extend or UNWRITTEN gets trashed */
return (daddr_t)(int64_t)(((int32_t *)block)[ix]);
}
}
static __inline void
lfs_iblock_set(STRUCT_LFS *fs, void *block, unsigned ix, daddr_t val)
{
if (fs->lfs_is64) {
//KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int64_t));
((int64_t *)block)[ix] = val;
} else {
//KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int32_t));
((int32_t *)block)[ix] = val;
}
}
/*
* "struct buf" associated definitions
*/
# define LFS_LOCK_BUF(bp) do { \
if (((bp)->b_flags & B_LOCKED) == 0 && bp->b_iodone == NULL) { \
mutex_enter(&lfs_lock); \
++locked_queue_count; \
locked_queue_bytes += bp->b_bufsize; \
mutex_exit(&lfs_lock); \
} \
(bp)->b_flags |= B_LOCKED; \
} while (0)
# define LFS_UNLOCK_BUF(bp) do { \
if (((bp)->b_flags & B_LOCKED) != 0 && bp->b_iodone == NULL) { \
mutex_enter(&lfs_lock); \
--locked_queue_count; \
locked_queue_bytes -= bp->b_bufsize; \
if (locked_queue_count < LFS_WAIT_BUFS && \
locked_queue_bytes < LFS_WAIT_BYTES) \
cv_broadcast(&locked_queue_cv); \
mutex_exit(&lfs_lock); \
} \
(bp)->b_flags &= ~B_LOCKED; \
} while (0)
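/*
* In rough terms these are used in pairs over the life of a dirty LFS
* buffer: LFS_LOCK_BUF when the buffer is queued for the log (it is
* pinned and charged to locked_queue_{count,bytes}), and
* LFS_UNLOCK_BUF once it has been written, at which point the
* accounting is dropped and writers waiting for the locked queue to
* drain may be woken.
*/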
/*
* "struct inode" associated definitions
*/
#define LFS_SET_UINO(ip, states) do { \
if (((states) & IN_ACCESSED) && !((ip)->i_state & IN_ACCESSED)) \
lfs_sb_adduinodes((ip)->i_lfs, 1); \
if (((states) & IN_CLEANING) && !((ip)->i_state & IN_CLEANING)) \
lfs_sb_adduinodes((ip)->i_lfs, 1); \
if (((states) & IN_MODIFIED) && !((ip)->i_state & IN_MODIFIED)) \
lfs_sb_adduinodes((ip)->i_lfs, 1); \
(ip)->i_state |= (states); \
} while (0)
#define LFS_CLR_UINO(ip, states) do { \
if (((states) & IN_ACCESSED) && ((ip)->i_state & IN_ACCESSED)) \
lfs_sb_subuinodes((ip)->i_lfs, 1); \
if (((states) & IN_CLEANING) && ((ip)->i_state & IN_CLEANING)) \
lfs_sb_subuinodes((ip)->i_lfs, 1); \
if (((states) & IN_MODIFIED) && ((ip)->i_state & IN_MODIFIED)) \
lfs_sb_subuinodes((ip)->i_lfs, 1); \
(ip)->i_state &= ~(states); \
if (lfs_sb_getuinodes((ip)->i_lfs) < 0) { \
panic("lfs_uinodes < 0"); \
} \
} while (0)
#define LFS_ITIMES(ip, acc, mod, cre) \
while ((ip)->i_state & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY)) \
lfs_itimes(ip, acc, mod, cre)
/*
* On-disk and in-memory checkpoint segment usage structure.
*/
#define SEGUPB(fs) (lfs_sb_getsepb(fs))
#define SEGTABSIZE_SU(fs) \
((lfs_sb_getnseg(fs) + SEGUPB(fs) - 1) / lfs_sb_getsepb(fs))
#ifdef _KERNEL
# define SHARE_IFLOCK(F) \
do { \
rw_enter(&(F)->lfs_iflock, RW_READER); \
} while(0)
# define UNSHARE_IFLOCK(F) \
do { \
rw_exit(&(F)->lfs_iflock); \
} while(0)
#else /* ! _KERNEL */
# define SHARE_IFLOCK(F)
# define UNSHARE_IFLOCK(F)
#endif /* ! _KERNEL */
/* Read in the block with a specific segment usage entry from the ifile. */
#define LFS_SEGENTRY(SP, F, IN, BP) do { \
int _e; \
SHARE_IFLOCK(F); \
VTOI((F)->lfs_ivnode)->i_state |= IN_ACCESS; \
if ((_e = bread((F)->lfs_ivnode, \
((IN) / lfs_sb_getsepb(F)) + lfs_sb_getcleansz(F), \
lfs_sb_getbsize(F), 0, &(BP))) != 0) \
panic("lfs: ifile read: segentry %llu: error %d\n", \
(unsigned long long)(IN), _e); \
if (lfs_sb_getversion(F) == 1) \
(SP) = (SEGUSE *)((SEGUSE_V1 *)(BP)->b_data + \
((IN) & (lfs_sb_getsepb(F) - 1))); \
else \
(SP) = (SEGUSE *)(BP)->b_data + ((IN) % lfs_sb_getsepb(F)); \
UNSHARE_IFLOCK(F); \
} while (0)
#define LFS_WRITESEGENTRY(SP, F, IN, BP) do { \
if ((SP)->su_nbytes == 0) \
(SP)->su_flags |= SEGUSE_EMPTY; \
else \
(SP)->su_flags &= ~SEGUSE_EMPTY; \
(F)->lfs_suflags[(F)->lfs_activesb][(IN)] = (SP)->su_flags; \
LFS_BWRITE_LOG(BP); \
} while (0)
/*
* FINFO (file info) entries.
*/
/* Size of an on-disk block pointer, e.g. in an indirect block. */
/* XXX: move to a more suitable location in this file */
#define LFS_BLKPTRSIZE(fs) ((fs)->lfs_is64 ? sizeof(int64_t) : sizeof(int32_t))
/* Size of an on-disk inode number. */
/* XXX: move to a more suitable location in this file */
#define LFS_INUMSIZE(fs) ((fs)->lfs_is64 ? sizeof(int64_t) : sizeof(int32_t))
/* size of a FINFO, without the block pointers */
#define FINFOSIZE(fs) ((fs)->lfs_is64 ? sizeof(FINFO64) : sizeof(FINFO32))
/* Full size of the provided FINFO record, including its block pointers. */
#define FINFO_FULLSIZE(fs, fip) \
(FINFOSIZE(fs) + lfs_fi_getnblocks(fs, fip) * LFS_BLKPTRSIZE(fs))
#define NEXT_FINFO(fs, fip) \
((FINFO *)((char *)(fip) + FINFO_FULLSIZE(fs, fip)))
#define LFS_DEF_FI_ACCESSOR(type, type32, field) \
static __inline type \
lfs_fi_get##field(STRUCT_LFS *fs, FINFO *fip) \
{ \
if (fs->lfs_is64) { \
return fip->u_64.fi_##field; \
} else { \
return fip->u_32.fi_##field; \
} \
} \
static __inline void \
lfs_fi_set##field(STRUCT_LFS *fs, FINFO *fip, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &fip->u_64.fi_##field; \
(void)p; \
fip->u_64.fi_##field = val; \
} else { \
type32 *p = &fip->u_32.fi_##field; \
(void)p; \
fip->u_32.fi_##field = val; \
} \
}
LFS_DEF_FI_ACCESSOR(uint32_t, uint32_t, nblocks)
LFS_DEF_FI_ACCESSOR(uint32_t, uint32_t, version)
LFS_DEF_FI_ACCESSOR(uint64_t, uint32_t, ino)
LFS_DEF_FI_ACCESSOR(uint32_t, uint32_t, lastlength)
static __inline daddr_t
lfs_fi_getblock(STRUCT_LFS *fs, FINFO *fip, unsigned idx)
{
void *firstblock;
firstblock = (char *)fip + FINFOSIZE(fs);
KASSERT(idx < lfs_fi_getnblocks(fs, fip));
if (fs->lfs_is64) {
return ((int64_t *)firstblock)[idx];
} else {
return ((int32_t *)firstblock)[idx];
}
}
static __inline void
lfs_fi_setblock(STRUCT_LFS *fs, FINFO *fip, unsigned idx, daddr_t blk)
{
void *firstblock;
firstblock = (char *)fip + FINFOSIZE(fs);
KASSERT(idx < lfs_fi_getnblocks(fs, fip));
if (fs->lfs_is64) {
((int64_t *)firstblock)[idx] = blk;
} else {
((int32_t *)firstblock)[idx] = blk;
}
}
/*
* inode info entries (in the segment summary)
*/
#define IINFOSIZE(fs) ((fs)->lfs_is64 ? sizeof(IINFO64) : sizeof(IINFO32))
/* iinfos scroll backward from the end of the segment summary block */
#define SEGSUM_IINFOSTART(fs, buf) \
((IINFO *)((char *)buf + lfs_sb_getsumsize(fs) - IINFOSIZE(fs)))
#define NEXTLOWER_IINFO(fs, iip) \
((IINFO *)((char *)(iip) - IINFOSIZE(fs)))
#define NTH_IINFO(fs, buf, n) \
((IINFO *)((char *)SEGSUM_IINFOSTART(fs, buf) - (n)*IINFOSIZE(fs)))
static __inline uint64_t
lfs_ii_getblock(STRUCT_LFS *fs, IINFO *iip)
{
if (fs->lfs_is64) {
return iip->u_64.ii_block;
} else {
return iip->u_32.ii_block;
}
}
static __inline void
lfs_ii_setblock(STRUCT_LFS *fs, IINFO *iip, uint64_t block)
{
if (fs->lfs_is64) {
iip->u_64.ii_block = block;
} else {
iip->u_32.ii_block = block;
}
}
/*
* Index file inode entries.
*/
#define IFILE_ENTRYSIZE(fs) \
((fs)->lfs_is64 ? sizeof(IFILE64) : sizeof(IFILE32))
/*
* LFSv1 compatibility code is not allowed to touch if_atime, since it
* may not be mapped!
*/
/* Read in the block with a specific inode from the ifile. */
#define LFS_IENTRY(IP, F, IN, BP) do { \
int _e; \
SHARE_IFLOCK(F); \
VTOI((F)->lfs_ivnode)->i_state |= IN_ACCESS; \
if ((_e = bread((F)->lfs_ivnode, \
(IN) / lfs_sb_getifpb(F) + lfs_sb_getcleansz(F) + lfs_sb_getsegtabsz(F), \
lfs_sb_getbsize(F), 0, &(BP))) != 0) \
panic("lfs: ifile ino %d read %d", (int)(IN), _e); \
if ((F)->lfs_is64) { \
(IP) = (IFILE *)((IFILE64 *)(BP)->b_data + \
(IN) % lfs_sb_getifpb(F)); \
} else if (lfs_sb_getversion(F) > 1) { \
(IP) = (IFILE *)((IFILE32 *)(BP)->b_data + \
(IN) % lfs_sb_getifpb(F)); \
} else { \
(IP) = (IFILE *)((IFILE_V1 *)(BP)->b_data + \
(IN) % lfs_sb_getifpb(F)); \
} \
UNSHARE_IFLOCK(F); \
} while (0)
#define LFS_IENTRY_NEXT(IP, F) do { \
if ((F)->lfs_is64) { \
(IP) = (IFILE *)((IFILE64 *)(IP) + 1); \
} else if (lfs_sb_getversion(F) > 1) { \
(IP) = (IFILE *)((IFILE32 *)(IP) + 1); \
} else { \
(IP) = (IFILE *)((IFILE_V1 *)(IP) + 1); \
} \
} while (0)
#define LFS_DEF_IF_ACCESSOR(type, type32, field) \
static __inline type \
lfs_if_get##field(STRUCT_LFS *fs, IFILE *ifp) \
{ \
if (fs->lfs_is64) { \
return ifp->u_64.if_##field; \
} else { \
return ifp->u_32.if_##field; \
} \
} \
static __inline void \
lfs_if_set##field(STRUCT_LFS *fs, IFILE *ifp, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &ifp->u_64.if_##field; \
(void)p; \
ifp->u_64.if_##field = val; \
} else { \
type32 *p = &ifp->u_32.if_##field; \
(void)p; \
ifp->u_32.if_##field = val; \
} \
}
LFS_DEF_IF_ACCESSOR(uint32_t, uint32_t, version)
LFS_DEF_IF_ACCESSOR(int64_t, int32_t, daddr)
LFS_DEF_IF_ACCESSOR(uint64_t, uint32_t, nextfree)
LFS_DEF_IF_ACCESSOR(uint64_t, uint32_t, atime_sec)
LFS_DEF_IF_ACCESSOR(uint32_t, uint32_t, atime_nsec)
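/*
* A minimal sketch (kept under #if 0) of the usual ifile lookup
* pattern: map an inode number to its current on-disk address with
* LFS_IENTRY and the accessors above. "fs" and "ino" are assumed to
* be supplied by the caller; error handling is omitted.
*/
#if 0
static daddr_t
lfs_ifile_lookup_example(STRUCT_LFS *fs, uint64_t ino)
{
	IFILE *ifp;
	struct buf *bp;
	daddr_t daddr;

	LFS_IENTRY(ifp, fs, ino, bp);
	daddr = lfs_if_getdaddr(fs, ifp);
	brelse(bp, 0);
	return daddr;
}
#endif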
/*
* Cleaner information structure. This resides in the ifile and is used
* to pass information from the kernel to the cleaner.
*/
#define CLEANSIZE_SU(fs) \
((((fs)->lfs_is64 ? sizeof(CLEANERINFO64) : sizeof(CLEANERINFO32)) + \
lfs_sb_getbsize(fs) - 1) >> lfs_sb_getbshift(fs))
#define LFS_DEF_CI_ACCESSOR(type, type32, field) \
static __inline type \
lfs_ci_get##field(STRUCT_LFS *fs, CLEANERINFO *cip) \
{ \
if (fs->lfs_is64) { \
return cip->u_64.field; \
} else { \
return cip->u_32.field; \
} \
} \
static __inline void \
lfs_ci_set##field(STRUCT_LFS *fs, CLEANERINFO *cip, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &cip->u_64.field; \
(void)p; \
cip->u_64.field = val; \
} else { \
type32 *p = &cip->u_32.field; \
(void)p; \
cip->u_32.field = val; \
} \
}
LFS_DEF_CI_ACCESSOR(uint32_t, uint32_t, clean)
LFS_DEF_CI_ACCESSOR(uint32_t, uint32_t, dirty)
LFS_DEF_CI_ACCESSOR(int64_t, int32_t, bfree)
LFS_DEF_CI_ACCESSOR(int64_t, int32_t, avail)
LFS_DEF_CI_ACCESSOR(uint64_t, uint32_t, free_head)
LFS_DEF_CI_ACCESSOR(uint64_t, uint32_t, free_tail)
LFS_DEF_CI_ACCESSOR(uint32_t, uint32_t, flags)
static __inline void
lfs_ci_shiftcleantodirty(STRUCT_LFS *fs, CLEANERINFO *cip, unsigned num)
{
lfs_ci_setclean(fs, cip, lfs_ci_getclean(fs, cip) - num);
lfs_ci_setdirty(fs, cip, lfs_ci_getdirty(fs, cip) + num);
}
static __inline void
lfs_ci_shiftdirtytoclean(STRUCT_LFS *fs, CLEANERINFO *cip, unsigned num)
{
lfs_ci_setdirty(fs, cip, lfs_ci_getdirty(fs, cip) - num);
lfs_ci_setclean(fs, cip, lfs_ci_getclean(fs, cip) + num);
}
/* Read in the block with the cleaner info from the ifile. */
#define LFS_CLEANERINFO(CP, F, BP) do { \
int _e; \
SHARE_IFLOCK(F); \
VTOI((F)->lfs_ivnode)->i_state |= IN_ACCESS; \
_e = bread((F)->lfs_ivnode, \
(daddr_t)0, lfs_sb_getbsize(F), 0, &(BP)); \
if (_e) \
panic("lfs: ifile read: cleanerinfo: error %d\n", _e); \
(CP) = (CLEANERINFO *)(BP)->b_data; \
UNSHARE_IFLOCK(F); \
} while (0)
/*
* Synchronize the Ifile cleaner info with current avail and bfree.
*/
#define LFS_SYNC_CLEANERINFO(cip, fs, bp, w) do { \
mutex_enter(&lfs_lock); \
if ((w) || lfs_ci_getbfree(fs, cip) != lfs_sb_getbfree(fs) || \
lfs_ci_getavail(fs, cip) != lfs_sb_getavail(fs) - fs->lfs_ravail - \
fs->lfs_favail) { \
lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs)); \
lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs) - fs->lfs_ravail - \
fs->lfs_favail); \
if (((bp)->b_flags & B_GATHERED) == 0) { \
fs->lfs_flags |= LFS_IFDIRTY; \
} \
mutex_exit(&lfs_lock); \
(void) LFS_BWRITE_LOG(bp); /* Ifile */ \
} else { \
mutex_exit(&lfs_lock); \
brelse(bp, 0); \
} \
} while (0)
/*
* Get the head of the inode free list.
* Always called with the segment lock held.
*/
#define LFS_GET_HEADFREE(FS, CIP, BP, FREEP) do { \
if (lfs_sb_getversion(FS) > 1) { \
LFS_CLEANERINFO((CIP), (FS), (BP)); \
lfs_sb_setfreehd(FS, lfs_ci_getfree_head(FS, CIP)); \
brelse(BP, 0); \
} \
*(FREEP) = lfs_sb_getfreehd(FS); \
} while (0)
#define LFS_PUT_HEADFREE(FS, CIP, BP, VAL) do { \
lfs_sb_setfreehd(FS, VAL); \
if (lfs_sb_getversion(FS) > 1) { \
LFS_CLEANERINFO((CIP), (FS), (BP)); \
lfs_ci_setfree_head(FS, CIP, VAL); \
LFS_BWRITE_LOG(BP); \
mutex_enter(&lfs_lock); \
(FS)->lfs_flags |= LFS_IFDIRTY; \
mutex_exit(&lfs_lock); \
} \
} while (0)
#define LFS_GET_TAILFREE(FS, CIP, BP, FREEP) do { \
LFS_CLEANERINFO((CIP), (FS), (BP)); \
*(FREEP) = lfs_ci_getfree_tail(FS, CIP); \
brelse(BP, 0); \
} while (0)
#define LFS_PUT_TAILFREE(FS, CIP, BP, VAL) do { \
LFS_CLEANERINFO((CIP), (FS), (BP)); \
lfs_ci_setfree_tail(FS, CIP, VAL); \
LFS_BWRITE_LOG(BP); \
mutex_enter(&lfs_lock); \
(FS)->lfs_flags |= LFS_IFDIRTY; \
mutex_exit(&lfs_lock); \
} while (0)
/*
* On-disk segment summary information
*/
#define SEGSUM_SIZE(fs) \
(fs->lfs_is64 ? sizeof(SEGSUM64) : \
lfs_sb_getversion(fs) > 1 ? sizeof(SEGSUM32) : sizeof(SEGSUM_V1))
/*
* The SEGSUM structure is followed by FINFO structures. Get the pointer
* to the first FINFO.
*
* XXX this can't be an inline function yet: SEGSUM_SIZE uses
* lfs_sb_getversion, which isn't defined until later in this file,
* so the file needs to be resorted first.
*/
#if 0
static __inline FINFO *
segsum_finfobase(STRUCT_LFS *fs, SEGSUM *ssp)
{
return (FINFO *)((char *)ssp + SEGSUM_SIZE(fs));
}
#else
#define SEGSUM_FINFOBASE(fs, ssp) \
((FINFO *)((char *)(ssp) + SEGSUM_SIZE(fs)));
#endif
#define LFS_DEF_SS_ACCESSOR(type, type32, field) \
static __inline type \
lfs_ss_get##field(STRUCT_LFS *fs, SEGSUM *ssp) \
{ \
if (fs->lfs_is64) { \
return ssp->u_64.ss_##field; \
} else { \
return ssp->u_32.ss_##field; \
} \
} \
static __inline void \
lfs_ss_set##field(STRUCT_LFS *fs, SEGSUM *ssp, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &ssp->u_64.ss_##field; \
(void)p; \
ssp->u_64.ss_##field = val; \
} else { \
type32 *p = &ssp->u_32.ss_##field; \
(void)p; \
ssp->u_32.ss_##field = val; \
} \
}
LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, sumsum)
LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, datasum)
LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, magic)
LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, ident)
LFS_DEF_SS_ACCESSOR(int64_t, int32_t, next)
LFS_DEF_SS_ACCESSOR(uint16_t, uint16_t, nfinfo)
LFS_DEF_SS_ACCESSOR(uint16_t, uint16_t, ninos)
LFS_DEF_SS_ACCESSOR(uint16_t, uint16_t, flags)
LFS_DEF_SS_ACCESSOR(uint64_t, uint32_t, reclino)
LFS_DEF_SS_ACCESSOR(uint64_t, uint64_t, serial)
LFS_DEF_SS_ACCESSOR(uint64_t, uint64_t, create)
static __inline size_t
lfs_ss_getsumstart(STRUCT_LFS *fs)
{
/* These are actually all the same. */
if (fs->lfs_is64) {
return offsetof(SEGSUM64, ss_datasum);
} else /* if (lfs_sb_getversion(fs) > 1) */ {
return offsetof(SEGSUM32, ss_datasum);
} /* else {
return offsetof(SEGSUM_V1, ss_datasum);
} */
/*
* XXX ^^^ until this file is resorted lfs_sb_getversion isn't
* defined yet.
*/
}
static __inline uint32_t
lfs_ss_getocreate(STRUCT_LFS *fs, SEGSUM *ssp)
{
KASSERT(fs->lfs_is64 == 0);
/* XXX need to resort this file before we can do this */
//KASSERT(lfs_sb_getversion(fs) == 1);
return ssp->u_v1.ss_create;
}
static __inline void
lfs_ss_setocreate(STRUCT_LFS *fs, SEGSUM *ssp, uint32_t val)
{
KASSERT(fs->lfs_is64 == 0);
/* XXX need to resort this file before we can do this */
//KASSERT(lfs_sb_getversion(fs) == 1);
ssp->u_v1.ss_create = val;
}
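/*
* A minimal sketch (kept under #if 0) of walking the FINFO records
* that follow a segment summary, using SEGSUM_FINFOBASE/NEXT_FINFO and
* the accessors above. "ssp" is assumed to point at a valid SEGSUM.
*/
#if 0
static void
lfs_finfo_walk_example(STRUCT_LFS *fs, SEGSUM *ssp)
{
	FINFO *fip;
	unsigned i, j;

	fip = SEGSUM_FINFOBASE(fs, ssp);
	for (i = 0; i < lfs_ss_getnfinfo(fs, ssp); i++) {
		for (j = 0; j < lfs_fi_getnblocks(fs, fip); j++)
			printf("ino %ju lbn %jd\n",
			    (uintmax_t)lfs_fi_getino(fs, fip),
			    (intmax_t)lfs_fi_getblock(fs, fip, j));
		fip = NEXT_FINFO(fs, fip);
	}
}
#endif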
/*
* Super block.
*/
/*
* Generate accessors for the on-disk superblock fields with cpp.
*/
#define LFS_DEF_SB_ACCESSOR_FULL(type, type32, field) \
static __inline type \
lfs_sb_get##field(STRUCT_LFS *fs) \
{ \
if (fs->lfs_is64) { \
return fs->lfs_dlfs_u.u_64.dlfs_##field; \
} else { \
return fs->lfs_dlfs_u.u_32.dlfs_##field; \
} \
} \
static __inline void \
lfs_sb_set##field(STRUCT_LFS *fs, type val) \
{ \
if (fs->lfs_is64) { \
fs->lfs_dlfs_u.u_64.dlfs_##field = val; \
} else { \
fs->lfs_dlfs_u.u_32.dlfs_##field = val; \
} \
} \
static __inline void \
lfs_sb_add##field(STRUCT_LFS *fs, type val) \
{ \
if (fs->lfs_is64) { \
type *p64 = &fs->lfs_dlfs_u.u_64.dlfs_##field; \
*p64 += val; \
} else { \
type32 *p32 = &fs->lfs_dlfs_u.u_32.dlfs_##field; \
*p32 += val; \
} \
} \
static __inline void \
lfs_sb_sub##field(STRUCT_LFS *fs, type val) \
{ \
if (fs->lfs_is64) { \
type *p64 = &fs->lfs_dlfs_u.u_64.dlfs_##field; \
*p64 -= val; \
} else { \
type32 *p32 = &fs->lfs_dlfs_u.u_32.dlfs_##field; \
*p32 -= val; \
} \
}
#define LFS_DEF_SB_ACCESSOR(t, f) LFS_DEF_SB_ACCESSOR_FULL(t, t, f)
#define LFS_DEF_SB_ACCESSOR_32ONLY(type, field, val64) \
static __inline type \
lfs_sb_get##field(STRUCT_LFS *fs) \
{ \
if (fs->lfs_is64) { \
return val64; \
} else { \
return fs->lfs_dlfs_u.u_32.dlfs_##field; \
} \
}
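/*
* For example, LFS_DEF_SB_ACCESSOR(uint32_t, version) below generates
* lfs_sb_getversion(), lfs_sb_setversion(), lfs_sb_addversion() and
* lfs_sb_subversion(), each operating on dlfs_version in either the
* 64-bit or the 32-bit on-disk superblock image, as selected by
* fs->lfs_is64.
*/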
LFS_DEF_SB_ACCESSOR(uint32_t, version)
LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, size)
LFS_DEF_SB_ACCESSOR(uint32_t, ssize)
LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, dsize)
LFS_DEF_SB_ACCESSOR(uint32_t, bsize)
LFS_DEF_SB_ACCESSOR(uint32_t, fsize)
LFS_DEF_SB_ACCESSOR(uint32_t, frag)
LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, freehd)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, bfree)
LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, nfiles)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, avail)
LFS_DEF_SB_ACCESSOR(int32_t, uinodes)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, idaddr)
LFS_DEF_SB_ACCESSOR_32ONLY(uint32_t, ifile, LFS_IFILE_INUM)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, lastseg)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, nextseg)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, curseg)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, offset)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, lastpseg)
LFS_DEF_SB_ACCESSOR(uint32_t, inopf)
LFS_DEF_SB_ACCESSOR(uint32_t, minfree)
LFS_DEF_SB_ACCESSOR(uint64_t, maxfilesize)
LFS_DEF_SB_ACCESSOR(uint32_t, fsbpseg)
LFS_DEF_SB_ACCESSOR(uint32_t, inopb)
LFS_DEF_SB_ACCESSOR(uint32_t, ifpb)
LFS_DEF_SB_ACCESSOR(uint32_t, sepb)
LFS_DEF_SB_ACCESSOR(uint32_t, nindir)
LFS_DEF_SB_ACCESSOR(uint32_t, nseg)
LFS_DEF_SB_ACCESSOR(uint32_t, nspf)
LFS_DEF_SB_ACCESSOR(uint32_t, cleansz)
LFS_DEF_SB_ACCESSOR(uint32_t, segtabsz)
LFS_DEF_SB_ACCESSOR_32ONLY(uint32_t, segmask, 0)
LFS_DEF_SB_ACCESSOR_32ONLY(uint32_t, segshift, 0)
LFS_DEF_SB_ACCESSOR(uint64_t, bmask)
LFS_DEF_SB_ACCESSOR(uint32_t, bshift)
LFS_DEF_SB_ACCESSOR(uint64_t, ffmask)
LFS_DEF_SB_ACCESSOR(uint32_t, ffshift)
LFS_DEF_SB_ACCESSOR(uint64_t, fbmask)
LFS_DEF_SB_ACCESSOR(uint32_t, fbshift)
LFS_DEF_SB_ACCESSOR(uint32_t, blktodb)
LFS_DEF_SB_ACCESSOR(uint32_t, fsbtodb)
LFS_DEF_SB_ACCESSOR(uint32_t, sushift)
LFS_DEF_SB_ACCESSOR(int32_t, maxsymlinklen)
LFS_DEF_SB_ACCESSOR(uint32_t, cksum)
LFS_DEF_SB_ACCESSOR(uint16_t, pflags)
LFS_DEF_SB_ACCESSOR(uint32_t, nclean)
LFS_DEF_SB_ACCESSOR(int32_t, dmeta)
LFS_DEF_SB_ACCESSOR(uint32_t, minfreeseg)
LFS_DEF_SB_ACCESSOR(uint32_t, sumsize)
LFS_DEF_SB_ACCESSOR(uint64_t, serial)
LFS_DEF_SB_ACCESSOR(uint32_t, ibsize)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, s0addr)
LFS_DEF_SB_ACCESSOR(uint64_t, tstamp)
LFS_DEF_SB_ACCESSOR(uint32_t, inodefmt)
LFS_DEF_SB_ACCESSOR(uint32_t, interleave)
LFS_DEF_SB_ACCESSOR(uint32_t, ident)
LFS_DEF_SB_ACCESSOR(uint32_t, resvseg)
/* special-case accessors */
/*
* the v1 otstamp field lives in what's now dlfs_inopf
*/
#define lfs_sb_getotstamp(fs) lfs_sb_getinopf(fs)
#define lfs_sb_setotstamp(fs, val) lfs_sb_setinopf(fs, val)
/*
* lfs_sboffs is an array
*/
static __inline int32_t
lfs_sb_getsboff(STRUCT_LFS *fs, unsigned n)
{
#ifdef KASSERT /* ugh */
KASSERT(n < LFS_MAXNUMSB);
#endif
if (fs->lfs_is64) {
return fs->lfs_dlfs_u.u_64.dlfs_sboffs[n];
} else {
return fs->lfs_dlfs_u.u_32.dlfs_sboffs[n];
}
}
static __inline void
lfs_sb_setsboff(STRUCT_LFS *fs, unsigned n, int32_t val)
{
#ifdef KASSERT /* ugh */
KASSERT(n < LFS_MAXNUMSB);
#endif
if (fs->lfs_is64) {
fs->lfs_dlfs_u.u_64.dlfs_sboffs[n] = val;
} else {
fs->lfs_dlfs_u.u_32.dlfs_sboffs[n] = val;
}
}
/*
* lfs_fsmnt is a string
*/
static __inline const char *
lfs_sb_getfsmnt(STRUCT_LFS *fs)
{
if (fs->lfs_is64) {
return (const char *)fs->lfs_dlfs_u.u_64.dlfs_fsmnt;
} else {
return (const char *)fs->lfs_dlfs_u.u_32.dlfs_fsmnt;
}
}
static __inline void
lfs_sb_setfsmnt(STRUCT_LFS *fs, const char *str)
{
if (fs->lfs_is64) {
(void)strncpy((char *)fs->lfs_dlfs_u.u_64.dlfs_fsmnt, str,
sizeof(fs->lfs_dlfs_u.u_64.dlfs_fsmnt));
} else {
(void)strncpy((char *)fs->lfs_dlfs_u.u_32.dlfs_fsmnt, str,
sizeof(fs->lfs_dlfs_u.u_32.dlfs_fsmnt));
}
}
/* Highest addressable fsb */
#define LFS_MAX_DADDR(fs) \
((fs)->lfs_is64 ? 0x7fffffffffffffff : 0x7fffffff)
/* LFS_NINDIR is the number of indirects in a file system block. */
#define LFS_NINDIR(fs) (lfs_sb_getnindir(fs))
/* LFS_INOPB is the number of inodes in a secondary storage block. */
#define LFS_INOPB(fs) (lfs_sb_getinopb(fs))
/* LFS_INOPF is the number of inodes in a fragment. */
#define LFS_INOPF(fs) (lfs_sb_getinopf(fs))
#define lfs_blkoff(fs, loc) ((int)((loc) & lfs_sb_getbmask(fs)))
#define lfs_fragoff(fs, loc) /* calculates (loc % fs->lfs_fsize) */ \
((int)((loc) & lfs_sb_getffmask(fs)))
/* XXX: lowercase these as they're no longer macros */
/* Frags to diskblocks */
static __inline uint64_t
LFS_FSBTODB(STRUCT_LFS *fs, uint64_t b)
{
#if defined(_KERNEL)
return b << (lfs_sb_getffshift(fs) - DEV_BSHIFT);
#else
return b << lfs_sb_getfsbtodb(fs);
#endif
}
/* Diskblocks to frags */
static __inline uint64_t
LFS_DBTOFSB(STRUCT_LFS *fs, uint64_t b)
{
#if defined(_KERNEL)
return b >> (lfs_sb_getffshift(fs) - DEV_BSHIFT);
#else
return b >> lfs_sb_getfsbtodb(fs);
#endif
}
#define lfs_lblkno(fs, loc) ((loc) >> lfs_sb_getbshift(fs))
#define lfs_lblktosize(fs, blk) ((blk) << lfs_sb_getbshift(fs))
/* Frags to bytes */
static __inline uint64_t
lfs_fsbtob(STRUCT_LFS *fs, uint64_t b)
{
return b << lfs_sb_getffshift(fs);
}
/* Bytes to frags */
static __inline uint64_t
lfs_btofsb(STRUCT_LFS *fs, uint64_t b)
{
return b >> lfs_sb_getffshift(fs);
}
#define lfs_numfrags(fs, loc) /* calculates (loc / fs->lfs_fsize) */ \
((loc) >> lfs_sb_getffshift(fs))
#define lfs_blkroundup(fs, size)/* calculates roundup(size, lfs_sb_getbsize(fs)) */ \
((off_t)(((size) + lfs_sb_getbmask(fs)) & (~lfs_sb_getbmask(fs))))
#define lfs_fragroundup(fs, size)/* calculates roundup(size, fs->lfs_fsize) */ \
((off_t)(((size) + lfs_sb_getffmask(fs)) & (~lfs_sb_getffmask(fs))))
#define lfs_fragstoblks(fs, frags)/* calculates (frags / fs->fs_frag) */ \
((frags) >> lfs_sb_getfbshift(fs))
#define lfs_blkstofrags(fs, blks)/* calculates (blks * fs->fs_frag) */ \
((blks) << lfs_sb_getfbshift(fs))
#define lfs_fragnum(fs, fsb) /* calculates (fsb % fs->lfs_frag) */ \
((fsb) & ((fs)->lfs_frag - 1))
#define lfs_blknum(fs, fsb) /* calculates rounddown(fsb, fs->lfs_frag) */ \
((fsb) &~ ((fs)->lfs_frag - 1))
#define lfs_dblksize(fs, dp, lbn) \
(((lbn) >= ULFS_NDADDR || lfs_dino_getsize(fs, dp) >= ((lbn) + 1) << lfs_sb_getbshift(fs)) \
? lfs_sb_getbsize(fs) \
: (lfs_fragroundup(fs, lfs_blkoff(fs, lfs_dino_getsize(fs, dp)))))
#define lfs_segsize(fs) (lfs_sb_getversion(fs) == 1 ? \
lfs_lblktosize((fs), lfs_sb_getssize(fs)) : \
lfs_sb_getssize(fs))
/* XXX segtod produces a result in frags despite the 'd' */
#define lfs_segtod(fs, seg) (lfs_btofsb(fs, lfs_segsize(fs)) * (seg))
#define lfs_dtosn(fs, daddr) /* block address to segment number */ \
((uint32_t)(((daddr) - lfs_sb_gets0addr(fs)) / lfs_segtod((fs), 1)))
#define lfs_sntod(fs, sn) /* segment number to disk address */ \
((daddr_t)(lfs_segtod((fs), (sn)) + lfs_sb_gets0addr(fs)))
/* XXX, blah. make this appear only if struct inode is defined */
#ifdef _UFS_LFS_LFS_INODE_H_
static __inline uint32_t
lfs_blksize(STRUCT_LFS *fs, struct inode *ip, uint64_t lbn)
{
if (lbn >= ULFS_NDADDR || lfs_dino_getsize(fs, ip->i_din) >= (lbn + 1) << lfs_sb_getbshift(fs)) {
return lfs_sb_getbsize(fs);
} else {
return lfs_fragroundup(fs, lfs_blkoff(fs, lfs_dino_getsize(fs, ip->i_din)));
}
}
#endif
/*
* union lfs_blocks
*/
static __inline void
lfs_blocks_fromvoid(STRUCT_LFS *fs, union lfs_blocks *bp, void *p)
{
if (fs->lfs_is64) {
bp->b64 = p;
} else {
bp->b32 = p;
}
}
static __inline void
lfs_blocks_fromfinfo(STRUCT_LFS *fs, union lfs_blocks *bp, FINFO *fip)
{
void *firstblock;
firstblock = (char *)fip + FINFOSIZE(fs);
if (fs->lfs_is64) {
bp->b64 = (int64_t *)firstblock;
} else {
bp->b32 = (int32_t *)firstblock;
}
}
static __inline daddr_t
lfs_blocks_get(STRUCT_LFS *fs, union lfs_blocks *bp, unsigned idx)
{
if (fs->lfs_is64) {
return bp->b64[idx];
} else {
return bp->b32[idx];
}
}
static __inline void
lfs_blocks_set(STRUCT_LFS *fs, union lfs_blocks *bp, unsigned idx, daddr_t val)
{
if (fs->lfs_is64) {
bp->b64[idx] = val;
} else {
bp->b32[idx] = val;
}
}
static __inline void
lfs_blocks_inc(STRUCT_LFS *fs, union lfs_blocks *bp)
{
if (fs->lfs_is64) {
bp->b64++;
} else {
bp->b32++;
}
}
static __inline int
lfs_blocks_eq(STRUCT_LFS *fs, union lfs_blocks *bp1, union lfs_blocks *bp2)
{
if (fs->lfs_is64) {
return bp1->b64 == bp2->b64;
} else {
return bp1->b32 == bp2->b32;
}
}
static __inline int
lfs_blocks_sub(STRUCT_LFS *fs, union lfs_blocks *bp1, union lfs_blocks *bp2)
{
/* (remember that the pointers are typed) */
if (fs->lfs_is64) {
return bp1->b64 - bp2->b64;
} else {
return bp1->b32 - bp2->b32;
}
}
/*
* struct segment
*/
/*
* Macros for determining free space on the disk, with the variable metadata
* of segment summaries and inode blocks taken into account.
*/
/*
* Estimate number of clean blocks not available for writing because
* they will contain metadata or overhead. This is calculated as
*
* E = ((C * M / D) * D + (0) * (T - D)) / T
* or more simply
* E = (C * M) / T
*
* where
* C is the clean space,
* D is the dirty space,
* M is the dirty metadata, and
* T = C + D is the total space on disk.
*
* This approximates the old formula of E = C * M / D when D is close to T,
* but avoids falsely reporting "disk full" when the sample size (D) is small.
*/
#define LFS_EST_CMETA(F) (( \
(lfs_sb_getdmeta(F) * (int64_t)lfs_sb_getnclean(F)) / \
(lfs_sb_getnseg(F))))
/* Estimate total size of the disk not including metadata */
#define LFS_EST_NONMETA(F) (lfs_sb_getdsize(F) - lfs_sb_getdmeta(F) - LFS_EST_CMETA(F))
/* Estimate number of blocks actually available for writing */
#define LFS_EST_BFREE(F) (lfs_sb_getbfree(F) > LFS_EST_CMETA(F) ? \
lfs_sb_getbfree(F) - LFS_EST_CMETA(F) : 0)
/* Amount of non-meta space not available to mortal man */
#define LFS_EST_RSVD(F) ((LFS_EST_NONMETA(F) * \
(uint64_t)lfs_sb_getminfree(F)) / \
100)
/* Can credential C write BB blocks? XXX: kauth_cred_geteuid is abusive */
#define ISSPACE(F, BB, C) \
((((C) == NOCRED || kauth_cred_geteuid(C) == 0) && \
LFS_EST_BFREE(F) >= (BB)) || \
(kauth_cred_geteuid(C) != 0 && IS_FREESPACE(F, BB)))
/* Can an ordinary user write BB blocks */
#define IS_FREESPACE(F, BB) \
(LFS_EST_BFREE(F) >= (BB) + LFS_EST_RSVD(F))
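/*
* Worked example with made-up numbers: on a volume with nseg = 100
* segments, of which nclean = 40 are clean, and dmeta = 10 blocks of
* dirty metadata, LFS_EST_CMETA comes to (10 * 40) / 100 = 4 blocks
* expected to go to metadata overhead, so LFS_EST_BFREE counts only
* bfree - 4 blocks as actually available for writing.
*/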
/*
* The minimum number of blocks to create a new inode. This is:
* directory direct block (1) + ULFS_NIADDR indirect blocks + inode block (1) +
* ifile direct block (1) + ULFS_NIADDR indirect blocks = 3 + 2 * ULFS_NIADDR blocks.
*/
#define LFS_NRESERVE(F) (lfs_btofsb((F), (2 * ULFS_NIADDR + 3) << lfs_sb_getbshift(F)))
/*
* Suppress spurious warnings: pop the diagnostic pragmas pushed above.
*/
#ifdef __GNUC__
#if defined(__clang__)
#pragma clang diagnostic pop
#elif __GNUC_PREREQ__(9,0)
#pragma GCC diagnostic pop
#endif
#endif
#endif /* _UFS_LFS_LFS_ACCESSORS_H_ */
/* $NetBSD: ffs_snapshot.c,v 1.155 2023/05/11 23:11:25 chs Exp $ */
/*
* Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
*
* Further information about snapshots can be obtained from:
*
* Marshall Kirk McKusick http://www.mckusick.com/softdep/
* 1614 Oxford Street mckusick@mckusick.com
* Berkeley, CA 94709-1608 +1-510-843-9542
* USA
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00
*
* from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.155 2023/05/11 23:11:25 chs Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/wapbl.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include <uvm/uvm.h>
TAILQ_HEAD(inodelst, inode); /* List of active snapshots */
struct snap_info {
kmutex_t si_lock; /* Lock this snapinfo */
kmutex_t si_snaplock; /* Snapshot vnode common lock */
lwp_t *si_owner; /* Snaplock owner */
struct inodelst si_snapshots; /* List of active snapshots */
daddr_t *si_snapblklist; /* Snapshot block hints list */
uint32_t si_gen; /* Incremented on change */
};
#if !defined(FFS_NO_SNAPSHOT)
typedef int (*acctfunc_t)
(struct vnode *, void *, int, int, struct fs *, daddr_t, int);
static int snapshot_setup(struct mount *, struct vnode *);
static int snapshot_copyfs(struct mount *, struct vnode *, void **);
static int snapshot_expunge(struct mount *, struct vnode *,
struct fs *, daddr_t *, daddr_t **);
static int snapshot_expunge_snap(struct mount *, struct vnode *,
struct fs *, daddr_t);
static int snapshot_writefs(struct mount *, struct vnode *, void *);
static int cgaccount(struct vnode *, int, int *);
static int cgaccount1(int, struct vnode *, void *, int);
static int expunge(struct vnode *, struct inode *, struct fs *,
acctfunc_t, int);
static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
static int fullacct(struct vnode *, void *, int, int, struct fs *,
daddr_t, int);
static int snapacct(struct vnode *, void *, int, int, struct fs *,
daddr_t, int);
static int mapacct(struct vnode *, void *, int, int, struct fs *,
daddr_t, int);
#endif /* !defined(FFS_NO_SNAPSHOT) */
static int ffs_copyonwrite(void *, struct buf *, bool);
static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
static int rwfsblk(struct vnode *, int, void *, daddr_t);
static int syncsnap(struct vnode *);
static int wrsnapblk(struct vnode *, void *, daddr_t);
#if !defined(FFS_NO_SNAPSHOT)
static int blocks_in_journal(struct fs *);
#endif
static inline bool is_active_snapshot(struct snap_info *, struct inode *);
static inline daddr_t db_get(struct inode *, int);
static inline void db_assign(struct inode *, int, daddr_t);
static inline daddr_t ib_get(struct inode *, int);
static inline daddr_t idb_get(struct inode *, void *, int);
static inline void idb_assign(struct inode *, void *, int, daddr_t);
#ifdef DEBUG
static int snapdebug = 0;
#endif
int
ffs_snapshot_init(struct ufsmount *ump)
{
struct snap_info *si;
si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
TAILQ_INIT(&si->si_snapshots);
mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
si->si_owner = NULL;
si->si_gen = 0;
si->si_snapblklist = NULL;
return 0;
}
void
ffs_snapshot_fini(struct ufsmount *ump)
{
struct snap_info *si;
si = ump->um_snapinfo;
ump->um_snapinfo = NULL;
KASSERT(TAILQ_EMPTY(&si->si_snapshots));
mutex_destroy(&si->si_lock);
mutex_destroy(&si->si_snaplock);
KASSERT(si->si_snapblklist == NULL);
kmem_free(si, sizeof(*si));
}
/*
* Create a snapshot file and initialize it for the filesystem.
* Vnode is locked on entry and return.
*/
int
ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
{
#if defined(FFS_NO_SNAPSHOT)
return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
bool suspended = false;
int error, redo = 0, snaploc;
void *sbbuf = NULL;
daddr_t *snaplist = NULL, snaplistsize = 0;
struct buf *bp, *nbp;
struct fs *copy_fs = NULL;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct inode *ip = VTOI(vp);
struct lwp *l = curlwp;
struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
struct timespec ts;
struct timeval starttime;
#ifdef DEBUG
struct timeval endtime;
#endif
struct vnode *devvp = ip->i_devvp;
/*
* If the vnode already is a snapshot, return.
*/
if ((ip->i_flags & SF_SNAPSHOT)) {
if ((ip->i_flags & SF_SNAPINVAL))
return EINVAL;
if (ctime) {
ctime->tv_sec = DIP(ip, mtime);
ctime->tv_nsec = DIP(ip, mtimensec);
}
return 0;
}
/*
* Check for free snapshot slot in the superblock.
*/
for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
if (fs->fs_snapinum[snaploc] == 0)
break;
if (snaploc == FSMAXSNAP)
return (ENOSPC);
/*
* Prepare the vnode to become a snapshot.
*/
error = snapshot_setup(mp, vp);
if (error)
goto out;
/*
* Copy all the cylinder group maps. Although the
* filesystem is still active, we hope that only a few
* cylinder groups will change between now and when we
* suspend operations. Thus, we will be able to quickly
* touch up the few cylinder groups that changed during
* the suspension period.
*/
error = cgaccount(vp, 1, NULL);
if (error)
goto out;
/*
* snapshot is now valid
*/
ip->i_flags &= ~SF_SNAPINVAL;
DIP_ASSIGN(ip, flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* Ensure that the snapshot is completely on disk.
* Since we have marked it as a snapshot it is safe to
* unlock it as no process will be allowed to write to it.
*/
error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
if (error)
goto out;
VOP_UNLOCK(vp);
/*
* All allocations are done, so we can now suspend the filesystem.
*/
error = vfs_suspend(vp->v_mount, 0);
if (error == 0) {
suspended = true;
vrele_flush(vp->v_mount);
error = VFS_SYNC(vp->v_mount, MNT_WAIT, curlwp->l_cred);
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (error)
goto out;
getmicrotime(&starttime);
/*
* First, copy all the cylinder group maps that have changed.
*/
error = cgaccount(vp, 2, &redo);
if (error)
goto out;
/*
* Create a copy of the superblock and its summary information.
*/
error = snapshot_copyfs(mp, vp, &sbbuf);
if (error)
goto out;
copy_fs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
/*
* Expunge unlinked files from our view.
*/
error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
if (error)
goto out;
/*
* Record snapshot inode. Since this is the newest snapshot,
* it must be placed at the end of the list.
*/
if (ip->i_nlink > 0)
fs->fs_snapinum[snaploc] = ip->i_number;
mutex_enter(&si->si_lock);
if (is_active_snapshot(si, ip))
panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
if (TAILQ_FIRST(&si->si_snapshots) == ip) {
/*
* If this is the first snapshot on this filesystem, put the
* preliminary list in place and establish the cow handler.
*/
si->si_snapblklist = snaplist;
fscow_establish(mp, ffs_copyonwrite, devvp);
}
si->si_gen++;
mutex_exit(&si->si_lock);
vp->v_vflag |= VV_SYSTEM;
/*
* Set the mtime to the time the snapshot has been taken.
*/
TIMEVAL_TO_TIMESPEC(&starttime, &ts);
if (ctime)
*ctime = ts;
DIP_ASSIGN(ip, mtime, ts.tv_sec);
DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* Copy allocation information from all snapshots and then
* expunge them from our view.
*/
error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
if (error)
goto out;
/*
* Write the superblock and its summary information to the snapshot.
*/
error = snapshot_writefs(mp, vp, sbbuf);
if (error)
goto out;
/*
* We're nearly done, ensure that the snapshot is completely on disk.
*/
error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
if (error)
goto out;
/*
* Invalidate and free all pages on the snapshot vnode.
* We will read and write through the buffercache.
*/
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, 0, 0,
PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
if (error)
goto out;
/*
* Invalidate short ( < fs_bsize ) buffers. We will always read
* full size buffers later.
*/
mutex_enter(&bufcache_lock);
KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_bcount == fs->fs_bsize)
continue;
error = bbusy(bp, false, 0, NULL);
if (error != 0) {
if (error == EPASSTHROUGH) {
nbp = LIST_FIRST(&vp->v_cleanblkhd);
continue;
}
break;
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
mutex_exit(&bufcache_lock);
out:
if (sbbuf != NULL) {
free(copy_fs->fs_csp, M_UFSMNT);
free(sbbuf, M_UFSMNT);
}
if (fs->fs_active != NULL) {
free(fs->fs_active, M_DEVBUF);
fs->fs_active = NULL;
}
mutex_enter(&si->si_lock);
if (snaplist != NULL) {
if (si->si_snapblklist == snaplist)
si->si_snapblklist = NULL;
free(snaplist, M_UFSMNT);
}
if (error) {
fs->fs_snapinum[snaploc] = 0;
} else {
/*
* As this is the newest list, it is the most inclusive, so
* should replace the previous list.
*/
si->si_snapblklist = ip->i_snapblklist;
}
si->si_gen++;
mutex_exit(&si->si_lock);
if (suspended) {
VOP_UNLOCK(vp);
vfs_resume(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef DEBUG
getmicrotime(&endtime);
timersub(&endtime, &starttime, &endtime);
printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
endtime.tv_usec / 1000, redo, fs->fs_ncg);
#endif
}
if (error) {
if (UFS_WAPBL_BEGIN(mp) == 0) {
/*
* We depend on ffs_truncate() to call ffs_snapremove()
* before it may return an error. On failed
* ffs_truncate() we have normal file with leaked
* (meta-) data, but no snapshot to use.
*/
(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
UFS_WAPBL_END(mp);
}
} else if (ip->i_nlink > 0)
vref(vp);
return (error);
}
/*
* Prepare vnode to become a snapshot.
*/
static int
snapshot_setup(struct mount *mp, struct vnode *vp)
{
int error, n, len, loc, cg;
daddr_t blkno, numblks;
struct buf *ibp, *nbp;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct lwp *l = curlwp;
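/*
* Descriptive note (derived from the loops below): the preallocation loops
* close and reopen the WAPBL transaction every wbreak allocations so that a
* single transaction stays at roughly 1/8 of the journal or less.
*/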
const int wbreak = blocks_in_journal(fs)/8;
struct inode *ip = VTOI(vp);
/*
* Check mount, readonly reference and owner.
*/
if (vp->v_mount != mp)
return EXDEV;
if (vp->v_writecount != 0)
return EBUSY;
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT,
0, mp, vp, NULL);
if (error)
return EACCES;
/*
* Must completely truncate the file here. Allocated
* blocks on a snapshot mean that block has been copied
* on write, see ffs_copyonwrite() testing "blkno != 0"
*/
error = ufs_truncate_all(vp);
if (error)
return error;
/* Change inode to snapshot type file. */
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
#if defined(QUOTA) || defined(QUOTA2)
/* snapshot inodes are not accounted in quotas */
chkiq(ip, -1, l->l_cred, 0);
#endif
ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
DIP_ASSIGN(ip, flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
ffs_update(vp, NULL, NULL, UPDATE_WAIT);
UFS_WAPBL_END(mp);
KASSERT(ip->i_flags & SF_SNAPSHOT);
/*
* Write an empty list of preallocated blocks to the end of
* the snapshot to set size to at least that of the filesystem.
*/
numblks = howmany(fs->fs_size, fs->fs_frag);
blkno = 1;
blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
error = vn_rdwr(UIO_WRITE, vp,
(void *)&blkno, sizeof(blkno), ffs_lblktosize(fs, (off_t)numblks),
UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
if (error)
return error;
/*
* Preallocate critical data structures so that we can copy
* them in without further allocation after we suspend all
* operations on the filesystem. We would like to just release
* the allocated buffers without writing them since they will
* be filled in below once we are ready to go, but this upsets
* the soft update code, so we go ahead and write the new buffers.
*
* Allocate all indirect blocks and mark all of them as not
* needing to be copied.
*/
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
for (blkno = UFS_NDADDR, n = 0; blkno < numblks; blkno += FFS_NINDIR(fs)) {
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
if (error)
goto out;
brelse(ibp, 0);
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
/*
* Allocate copies for the superblock and its summary information.
*/
error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
0, &nbp);
if (error)
goto out;
bawrite(nbp);
blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
len = howmany(fs->fs_cssize, fs->fs_bsize);
for (loc = 0; loc < len; loc++) {
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)),
fs->fs_bsize, l->l_cred, 0, &nbp);
if (error)
goto out;
bawrite(nbp);
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
/*
* Allocate all cylinder group blocks.
*/
for (cg = 0; cg < fs->fs_ncg; cg++) {
error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
fs->fs_bsize, l->l_cred, 0, &nbp);
if (error)
goto out;
bawrite(nbp);
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
out:
UFS_WAPBL_END(mp);
return error;
}
/*
* Create a copy of the superblock and its summary information.
* It is up to the caller to free copyfs and copy_fs->fs_csp.
*/
static int
snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
{
int error, i, len, loc, size;
void *space;
int32_t *lp;
struct buf *bp;
struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
struct vnode *devvp = VTOI(vp)->i_devvp;
/*
* Grab a copy of the superblock and its summary information.
* We delay writing it until the suspension is released below.
*/
*sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
loc = ffs_blkoff(fs, fs->fs_sblockloc);
if (loc > 0)
memset(*sbbuf, 0, loc);
copyfs = (struct fs *)((char *)(*sbbuf) + loc);
memcpy(copyfs, fs, fs->fs_sbsize);
size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
if (fs->fs_sbsize < size)
memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
size - fs->fs_sbsize);
size = ffs_blkroundup(fs, fs->fs_cssize);
if (fs->fs_contigsumsize > 0)
size += fs->fs_ncg * sizeof(int32_t);
space = malloc(size, M_UFSMNT, M_WAITOK);
copyfs->fs_csp = space;
memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
space = (char *)space + fs->fs_cssize;
loc = howmany(fs->fs_cssize, fs->fs_fsize);
i = fs->fs_frag - loc % fs->fs_frag;
len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
if (len > 0) {
if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc),
len, 0, &bp)) != 0) {
free(copyfs->fs_csp, M_UFSMNT);
free(*sbbuf, M_UFSMNT);
*sbbuf = NULL;
return error;
}
memcpy(space, bp->b_data, (u_int)len);
space = (char *)space + len;
brelse(bp, BC_INVAL | BC_NOCACHE);
}
if (fs->fs_contigsumsize > 0) {
copyfs->fs_maxcluster = lp = space;
for (i = 0; i < fs->fs_ncg; i++)
*lp++ = fs->fs_contigsumsize;
}
if (mp->mnt_wapbl)
copyfs->fs_flags &= ~FS_DOWAPBL;
return 0;
}
struct snapshot_expunge_ctx {
struct vnode *logvp;
struct vnode *vp;
struct fs *copy_fs;
};
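/*
* Decide whether snapshot_expunge() must process a vnode: the in-filesystem
* journal vnode and any unlinked inode whose blocks are not already marked
* free in the superblock copy qualify; snapshot inodes are skipped here.
*/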
static bool
snapshot_expunge_selector(void *cl, struct vnode *xvp)
{
struct snapshot_expunge_ctx *c = cl;
struct inode *xp;
KASSERT(mutex_owned(xvp->v_interlock));
xp = VTOI(xvp);
if (xvp->v_type == VNON || VTOI(xvp) == NULL ||
(xp->i_flags & SF_SNAPSHOT))
return false;
#ifdef DEBUG
if (snapdebug)
vprint("ffs_snapshot: busy vnode", xvp);
#endif
if (xvp == c->logvp)
return true;
if (xp->i_nlink > 0)
return false;
if (ffs_checkfreefile(c->copy_fs, c->vp, xp->i_number))
return false;
return true;
}
/*
* We must check for active files that have been unlinked (i.e., with a zero
* link count). We have to expunge all trace of these files from the snapshot
* so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
* Note that we skip unlinked snapshot files as they will be handled separately.
* Calculate the snapshot list size and create a preliminary list.
*/
static int
snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
daddr_t *snaplistsize, daddr_t **snaplist)
{
int cg, error = 0, len, loc;
daddr_t blkno, *blkp;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct inode *xp;
struct vnode *logvp = NULL, *xvp;
struct vnode_iterator *marker;
struct snapshot_expunge_ctx ctx;
*snaplist = NULL;
/*
* Get the log inode if any.
*/
if ((fs->fs_flags & FS_DOWAPBL) &&
fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
error = VFS_VGET(mp, fs->fs_journallocs[UFS_WAPBL_INFS_INO],
LK_EXCLUSIVE, &logvp);
if (error)
goto out;
}
/*
* We also calculate the needed size for the snapshot list.
*/
*snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
vfs_vnode_iterator_init(mp, &marker);
ctx.logvp = logvp;
ctx.vp = vp;
ctx.copy_fs = copy_fs;
while ((xvp = vfs_vnode_iterator_next(marker, snapshot_expunge_selector,
&ctx)))
{
/*
* If there is a fragment, clear it here.
*/
xp = VTOI(xvp);
blkno = 0;
loc = howmany(xp->i_size, fs->fs_bsize) - 1;
if (loc < UFS_NDADDR) {
len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size));
if (len > 0 && len < fs->fs_bsize) {
error = UFS_WAPBL_BEGIN(mp);
if (error) {
vrele(xvp);
vfs_vnode_iterator_destroy(marker);
goto out;
}
ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
len, xp->i_number);
blkno = db_get(xp, loc);
db_assign(xp, loc, 0);
UFS_WAPBL_END(mp);
}
}
*snaplistsize += 1;
error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
if (blkno)
db_assign(xp, loc, blkno);
if (!error) {
error = UFS_WAPBL_BEGIN(mp);
if (!error) {
error = ffs_freefile_snap(copy_fs, vp,
xp->i_number, xp->i_mode);
UFS_WAPBL_END(mp);
}
}
vrele(xvp);
if (error) {
vfs_vnode_iterator_destroy(marker);
goto out;
}
}
vfs_vnode_iterator_destroy(marker);
/*
* Create a preliminary list of preallocated snapshot blocks.
*/
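/*
* Layout note: slot 0 is reserved for the entry count (stored once the list
* is complete); the data entries start at slot 1 with the superblock and
* continue with the cylinder group maps and cylinder summary blocks, all as
* ascending logical block numbers.
*/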
*snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
blkp = &(*snaplist)[1];
*blkp++ = ffs_lblkno(fs, fs->fs_sblockloc);
blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
for (cg = 0; cg < fs->fs_ncg; cg++) {
if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno)
break;
*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
}
len = howmany(fs->fs_cssize, fs->fs_bsize);
for (loc = 0; loc < len; loc++)
*blkp++ = blkno + loc;
for (; cg < fs->fs_ncg; cg++)
*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
(*snaplist)[0] = blkp - &(*snaplist)[0];
out:
if (logvp != NULL)
vput(logvp);
if (error && *snaplist != NULL) {
free(*snaplist, M_UFSMNT);
*snaplist = NULL;
}
return error;
}
/*
* Copy allocation information from all the other snapshots on this
* filesystem into this snapshot and then expunge them from its view.
* Also, collect the list of allocated
* blocks in i_snapblklist.
*/
static int
snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
struct fs *copy_fs, daddr_t snaplistsize)
{
int error = 0, i;
daddr_t numblks, *snaplist = NULL;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct inode *ip = VTOI(vp), *xp;
struct lwp *l = curlwp;
struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
if (xp != ip) {
error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
if (error)
break;
}
if (xp->i_nlink != 0)
continue;
error = UFS_WAPBL_BEGIN(mp);
if (error)
break;
error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
UFS_WAPBL_END(mp);
if (error)
break;
}
if (error)
goto out;
/*
* Allocate space for the full list of preallocated snapshot blocks.
*/
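/*
* Slot 0 is reserved for the final entry count; mapacct() will advance
* i_snapblklist from &snaplist[1] as it records each allocated block.
*/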
snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
ip->i_snapblklist = &snaplist[1];
/*
* Expunge the blocks used by the snapshots from the set of
* blocks marked as used in the snapshot bitmaps. Also, collect
* the list of allocated blocks in i_snapblklist.
*/
error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
if (error)
goto out;
if (snaplistsize < ip->i_snapblklist - snaplist)
panic("ffs_snapshot: list too small");
snaplistsize = ip->i_snapblklist - snaplist;
snaplist[0] = snaplistsize;
ip->i_snapblklist = &snaplist[0];
/*
* Write out the list of allocated blocks to the end of the snapshot.
*/
numblks = howmany(fs->fs_size, fs->fs_frag);
for (i = 0; i < snaplistsize; i++)
snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks),
UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
for (i = 0; i < snaplistsize; i++)
snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
out:
if (error && snaplist != NULL) {
free(snaplist, M_UFSMNT);
ip->i_snapblklist = NULL;
}
return error;
}
/*
* Write the superblock and its summary information to the snapshot.
* Make sure the first UFS_NDADDR blocks get copied to the snapshot.
*/
static int
snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
{
int error, len, loc;
void *space;
daddr_t blkno;
struct buf *bp;
struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
struct inode *ip = VTOI(vp);
struct lwp *l = curlwp;
copyfs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
/*
* Write the superblock and its summary information
* to the snapshot.
*/
blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
len = howmany(fs->fs_cssize, fs->fs_bsize);
space = copyfs->fs_csp;
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs)) {
ffs_sb_swap(copyfs, copyfs);
ffs_csum_swap(space, space, fs->fs_cssize);
}
#endif
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
for (loc = 0; loc < len; loc++) {
error = bread(vp, blkno + loc, fs->fs_bsize,
B_MODIFY, &bp);
if (error) {
break;
}
memcpy(bp->b_data, space, fs->fs_bsize);
space = (char *)space + fs->fs_bsize;
bawrite(bp);
}
if (error)
goto out;
error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc),
fs->fs_bsize, B_MODIFY, &bp);
if (error) {
goto out;
} else {
memcpy(bp->b_data, sbbuf, fs->fs_bsize);
bawrite(bp);
}
/*
* Copy the first UFS_NDADDR blocks to the snapshot so
* ffs_copyonwrite() and ffs_snapblkfree() will always work on
* indirect blocks.
*/
for (loc = 0; loc < UFS_NDADDR; loc++) {
if (db_get(ip, loc) != 0)
continue;
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc),
fs->fs_bsize, l->l_cred, 0, &bp);
if (error)
break;
error = rwfsblk(vp, B_READ, bp->b_data, loc);
if (error) {
brelse(bp, 0);
break;
}
bawrite(bp);
}
out:
UFS_WAPBL_END(mp);
return error;
}
/*
* Copy all cylinder group maps.
*/
static int
cgaccount(struct vnode *vp, int passno, int *redo)
{
int cg, error = 0;
struct buf *nbp;
struct fs *fs = VTOI(vp)->i_fs;
if (redo != NULL)
*redo = 0;
if (passno == 1)
fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
M_DEVBUF, M_WAITOK | M_ZERO);
for (cg = 0; cg < fs->fs_ncg; cg++) {
if (passno == 2 && ACTIVECG_ISSET(fs, cg))
continue;
if (redo != NULL)
*redo += 1;
error = UFS_WAPBL_BEGIN(vp->v_mount);
if (error)
return error;
error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
fs->fs_bsize, curlwp->l_cred, 0, &nbp);
if (error) {
UFS_WAPBL_END(vp->v_mount);
break;
}
error = cgaccount1(cg, vp, nbp->b_data, passno);
bawrite(nbp);
UFS_WAPBL_END(vp->v_mount);
if (error)
break;
}
return error;
}
/*
* Copy a cylinder group map. All the unallocated blocks are marked
* BLK_NOCOPY so that the snapshot knows that it need not copy them
* if they are later written. If passno is one, then this is a first
* pass, so only setting needs to be done. If passno is 2, then this
* is a revision to a previous pass which must be undone as the
* replacement pass is done.
*/
static int
cgaccount1(int cg, struct vnode *vp, void *data, int passno)
{
struct buf *bp, *ibp;
struct inode *ip;
struct cg *cgp;
struct fs *fs;
struct lwp *l = curlwp;
daddr_t base, numblks;
int error, len, loc, ns __unused, indiroff;
ip = VTOI(vp);
fs = ip->i_fs;
ns = UFS_FSNEEDSWAP(fs);
error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, 0, &bp);
if (error) {
return (error);
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, ns)) {
brelse(bp, 0);
return (EIO);
}
ACTIVECG_SET(fs, cg);
memcpy(data, bp->b_data, fs->fs_cgsize);
brelse(bp, 0);
if (fs->fs_cgsize < fs->fs_bsize)
memset((char *)data + fs->fs_cgsize, 0,
fs->fs_bsize - fs->fs_cgsize);
numblks = howmany(fs->fs_size, fs->fs_frag);
len = howmany(fs->fs_fpg, fs->fs_frag);
base = cgbase(fs, cg) / fs->fs_frag;
if (base + len >= numblks)
len = numblks - base - 1;
loc = 0;
if (base < UFS_NDADDR) {
for ( ; loc < UFS_NDADDR; loc++) {
if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
db_assign(ip, loc, BLK_NOCOPY);
else if (db_get(ip, loc) == BLK_NOCOPY) {
if (passno == 2)
db_assign(ip, loc, 0);
else if (passno == 1)
panic("ffs_snapshot: lost direct block");
}
}
}
if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
return (error);
indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs);
for ( ; loc < len; loc++, indiroff++) {
if (indiroff >= FFS_NINDIR(fs)) {
bawrite(ibp);
if ((error = ffs_balloc(vp,
ffs_lblktosize(fs, (off_t)(base + loc)),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
return (error);
indiroff = 0;
}
if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
if (passno == 2)
idb_assign(ip, ibp->b_data, indiroff, 0);
else if (passno == 1)
panic("ffs_snapshot: lost indirect block");
}
}
bdwrite(ibp);
return (0);
}
/*
* Before expunging a snapshot inode, note all the
* blocks that it claims with BLK_SNAP so that fsck will
* be able to account for those blocks properly and so
* that this snapshot knows that it need not copy them
* if the other snapshot holding them is freed.
*/
static int
expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
acctfunc_t acctfunc, int expungetype)
{
int i, error, ns __unused;
daddr_t lbn, rlbn;
daddr_t len, blkno, numblks, blksperindir;
struct ufs1_dinode *dip1;
struct ufs2_dinode *dip2;
struct lwp *l = curlwp;
void *bap;
struct buf *bp;
struct mount *mp;
ns = UFS_FSNEEDSWAP(fs);
mp = snapvp->v_mount;
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
/*
* Prepare to expunge the inode. If its inode block has not
* yet been copied, then allocate and fill the copy.
*/
lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
error = snapblkaddr(snapvp, lbn, &blkno);
if (error)
return error;
if (blkno != 0) {
error = bread(snapvp, lbn, fs->fs_bsize,
B_MODIFY, &bp);
} else {
error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn),
fs->fs_bsize, l->l_cred, 0, &bp);
if (! error)
error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
}
if (error) {
UFS_WAPBL_END(mp);
return error;
}
/*
* Set a snapshot inode to be a zero-length file; set regular files
* or unlinked snapshots to be completely unallocated.
*/
if (fs->fs_magic == FS_UFS1_MAGIC) {
dip1 = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
if (cancelip->i_flags & SF_SNAPSHOT) {
dip1->di_flags =
ufs_rw32(ufs_rw32(dip1->di_flags, ns) |
SF_SNAPINVAL, ns);
}
if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
dip1->di_mode = 0;
dip1->di_size = 0;
dip1->di_blocks = 0;
memset(&dip1->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t));
} else {
dip2 = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
if (cancelip->i_flags & SF_SNAPSHOT) {
dip2->di_flags =
ufs_rw32(ufs_rw32(dip2->di_flags, ns) |
SF_SNAPINVAL, ns);
}
if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
dip2->di_mode = 0;
dip2->di_size = 0;
dip2->di_blocks = 0;
memset(&dip2->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t));
}
bdwrite(bp);
UFS_WAPBL_END(mp);
/*
* Now go through and expunge all the blocks in the file
* using the function requested.
*/
numblks = howmany(cancelip->i_size, fs->fs_bsize);
if (fs->fs_magic == FS_UFS1_MAGIC)
bap = &cancelip->i_ffs1_db[0];
else
bap = &cancelip->i_ffs2_db[0];
error = (*acctfunc)(snapvp, bap, 0, UFS_NDADDR, fs, 0, expungetype);
if (error)
return (error);
if (fs->fs_magic == FS_UFS1_MAGIC)
bap = &cancelip->i_ffs1_ib[0];
else
bap = &cancelip->i_ffs2_ib[0];
error = (*acctfunc)(snapvp, bap, 0, UFS_NIADDR, fs, -1, expungetype);
if (error)
return (error);
blksperindir = 1;
lbn = -UFS_NDADDR;
len = numblks - UFS_NDADDR;
rlbn = UFS_NDADDR;
for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
error = indiracct(snapvp, ITOV(cancelip), i,
ib_get(cancelip, i), lbn, rlbn, len,
blksperindir, fs, acctfunc, expungetype);
if (error)
return (error);
blksperindir *= FFS_NINDIR(fs);
lbn -= blksperindir + 1;
len -= blksperindir;
rlbn += blksperindir;
}
return (0);
}
/*
* Descend an indirect block chain for vnode cancelvp accounting for all
* its indirect blocks in snapvp.
*/
static int
indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
{
int error, num, i;
daddr_t subblksperindir;
struct indir indirs[UFS_NIADDR + 2];
daddr_t last;
void *bap;
struct buf *bp;
if (blkno == 0) {
if (expungetype == BLK_NOCOPY)
return (0);
panic("indiracct: missing indir");
}
if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
return (error);
if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
panic("indiracct: botched params");
/*
* We have to expand bread here since it will deadlock looking
* up the block number for any blocks that are not in the cache.
*/
error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize,
false, &bp);
if (error)
return error;
if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) {
brelse(bp, 0);
return (error);
}
/*
* Account for the block pointers in this indirect block.
*/
last = howmany(remblks, blksperindir);
if (last > FFS_NINDIR(fs))
last = FFS_NINDIR(fs);
bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
memcpy((void *)bap, bp->b_data, fs->fs_bsize);
brelse(bp, 0);
error = (*acctfunc)(snapvp, bap, 0, last,
fs, level == 0 ? rlbn : -1, expungetype);
if (error || level == 0)
goto out;
/*
* Account for the block pointers in each of the indirect blocks
* in the levels below us.
*/
subblksperindir = blksperindir / FFS_NINDIR(fs);
for (lbn++, level--, i = 0; i < last; i++) {
error = indiracct(snapvp, cancelvp, level,
idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
subblksperindir, fs, acctfunc, expungetype);
if (error)
goto out;
rlbn += blksperindir;
lbn -= blksperindir;
remblks -= blksperindir;
}
out:
free(bap, M_DEVBUF);
return (error);
}
/*
* Do both snap accounting and map accounting.
*/
static int
fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
struct fs *fs, daddr_t lblkno,
int exptype /* BLK_SNAP or BLK_NOCOPY */)
{
int error;
if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
return (error);
return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
}
/*
* Identify a set of blocks allocated in a snapshot inode.
*/
static int
snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
struct fs *fs, daddr_t lblkno,
int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
struct inode *ip = VTOI(vp);
struct lwp *l = curlwp;
struct mount *mp = vp->v_mount;
daddr_t blkno;
daddr_t lbn;
struct buf *ibp;
int error, n;
const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
for ( n = 0; oldblkp < lastblkp; oldblkp++) {
blkno = idb_get(ip, bap, oldblkp);
if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
continue;
lbn = ffs_fragstoblks(fs, blkno);
if (lbn < UFS_NDADDR) {
blkno = db_get(ip, lbn);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
} else {
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
if (error)
break;
blkno = idb_get(ip, ibp->b_data,
(lbn - UFS_NDADDR) % FFS_NINDIR(fs));
}
/*
* If we are expunging a snapshot vnode and we
* find a block marked BLK_NOCOPY, then it is
* one that has been allocated to this snapshot after
* we took our current snapshot and can be ignored.
*/
if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
if (lbn >= UFS_NDADDR)
brelse(ibp, 0);
} else {
if (blkno != 0)
panic("snapacct: bad block");
if (lbn < UFS_NDADDR)
db_assign(ip, lbn, expungetype);
else {
idb_assign(ip, ibp->b_data,
(lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype);
bdwrite(ibp);
}
}
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
UFS_WAPBL_END(mp);
return error;
}
/*
* Account for a set of blocks allocated in a snapshot inode.
*/
static int
mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
struct fs *fs, daddr_t lblkno, int expungetype)
{
daddr_t blkno;
struct inode *ip;
struct mount *mp = vp->v_mount;
ino_t inum;
int acctit, error, n;
const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
ip = VTOI(vp);
inum = ip->i_number;
if (lblkno == -1)
acctit = 0;
else
acctit = 1;
for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
blkno = idb_get(ip, bap, oldblkp);
if (blkno == 0 || blkno == BLK_NOCOPY)
continue;
if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
*ip->i_snapblklist++ = lblkno;
if (blkno == BLK_SNAP)
blkno = ffs_blkstofrags(fs, lblkno);
ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
UFS_WAPBL_END(mp);
return (0);
}
/*
* Number of blocks that fit into the journal or zero if not logging.
*/
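/*
* Worked example with hypothetical numbers: an in-filesystem journal of
* 1024 blocks of 16 KiB gives bpj = 16 MiB, and with an fs_bsize of 16 KiB
* this function returns 1024.
*/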
static int
blocks_in_journal(struct fs *fs)
{
off_t bpj;
if ((fs->fs_flags & FS_DOWAPBL) == 0)
return 0;
bpj = 1;
if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
switch (fs->fs_journal_location) {
case UFS_WAPBL_JOURNALLOC_END_PARTITION:
bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
break;
case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
break;
}
}
bpj /= fs->fs_bsize;
return (bpj > 0 ? bpj : 1);
}
#endif /* defined(FFS_NO_SNAPSHOT) */
/*
* Decrement extra reference on snapshot when last name is removed.
* It will not be freed until the last open reference goes away.
*/
void
ffs_snapgone(struct vnode *vp)
{
struct inode *xp, *ip = VTOI(vp);
struct mount *mp = spec_node_getmountedfs(ip->i_devvp);
struct fs *fs;
struct snap_info *si;
int snaploc;
si = VFSTOUFS(mp)->um_snapinfo;
/*
* Find snapshot in incore list.
*/
mutex_enter(&si->si_lock);
TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
if (xp == ip)
break;
mutex_exit(&si->si_lock);
if (xp != NULL)
vrele(ITOV(ip));
#ifdef DEBUG
else if (snapdebug)
printf("ffs_snapgone: lost snapshot vnode %llu\n",
(unsigned long long)ip->i_number);
#endif
/*
* Delete snapshot inode from superblock. Keep list dense.
*/
mutex_enter(&si->si_lock);
fs = ip->i_fs;
for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
if (fs->fs_snapinum[snaploc] == ip->i_number)
break;
if (snaploc < FSMAXSNAP) {
for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
if (fs->fs_snapinum[snaploc] == 0)
break;
fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
}
fs->fs_snapinum[snaploc - 1] = 0;
}
si->si_gen++;
mutex_exit(&si->si_lock);
}
/*
* Prepare a snapshot file for being removed.
*/
void
ffs_snapremove(struct vnode *vp)
{
struct inode *ip = VTOI(vp), *xp;
struct vnode *devvp = ip->i_devvp;
struct fs *fs = ip->i_fs;
struct mount *mp = spec_node_getmountedfs(devvp);
struct buf *ibp;
struct snap_info *si;
struct lwp *l = curlwp;
daddr_t numblks, blkno, dblk;
int error, loc, last;
si = VFSTOUFS(mp)->um_snapinfo;
/*
* If active, delete from incore list (this snapshot may
* already have been in the process of being deleted, so
* would not have been active).
*
* Clear copy-on-write flag if last snapshot.
*/
mutex_enter(&si->si_snaplock);
mutex_enter(&si->si_lock);
if (is_active_snapshot(si, ip)) {
TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
if (TAILQ_FIRST(&si->si_snapshots) != 0) {
/* Roll back the list of preallocated blocks. */
xp = TAILQ_LAST(&si->si_snapshots, inodelst);
si->si_snapblklist = xp->i_snapblklist;
si->si_gen++;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
} else {
si->si_snapblklist = 0;
si->si_gen++;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
fscow_disestablish(mp, ffs_copyonwrite, devvp);
}
if (ip->i_snapblklist != NULL) {
free(ip->i_snapblklist, M_UFSMNT);
ip->i_snapblklist = NULL;
}
} else {
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
}
/*
* Clear all BLK_NOCOPY fields. Pass any block claims to other
* snapshots that want them (see ffs_snapblkfree below).
*/
for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
dblk = db_get(ip, blkno);
if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
db_assign(ip, blkno, 0);
else if ((dblk == ffs_blkstofrags(fs, blkno) &&
ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
ip->i_number))) {
DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
db_assign(ip, blkno, 0);
}
}
numblks = howmany(ip->i_size, fs->fs_bsize);
for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) {
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
if (error)
continue;
if (fs->fs_size - blkno > FFS_NINDIR(fs))
last = FFS_NINDIR(fs);
else
last = fs->fs_size - blkno;
for (loc = 0; loc < last; loc++) {
dblk = idb_get(ip, ibp->b_data, loc);
if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
idb_assign(ip, ibp->b_data, loc, 0);
else if (dblk == ffs_blkstofrags(fs, blkno) &&
ffs_snapblkfree(fs, ip->i_devvp, dblk,
fs->fs_bsize, ip->i_number)) {
DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
idb_assign(ip, ibp->b_data, loc, 0);
}
}
bawrite(ibp);
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
KASSERT(error == 0);
}
/*
* Clear snapshot flag and drop reference.
*/
ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
DIP_ASSIGN(ip, flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
#if defined(QUOTA) || defined(QUOTA2)
chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
chkiq(ip, 1, l->l_cred, FORCE);
#endif
}
/*
* Notification that a block is being freed. Return zero if the free
* should be allowed to proceed. Return non-zero if the snapshot file
* wants to claim the block. The block will be claimed if it is an
* uncopied part of one of the snapshots. It will be freed if it is
* either a BLK_NOCOPY or has already been copied in all of the snapshots.
* If a fragment is being freed, then all snapshots that care about
* it must make a copy since a snapshot file can only claim full sized
* blocks. Note that if more than one snapshot file maps the block,
* we can pick one at random to claim it. Since none of the snapshots
* can change, we are assured that they will all see the same unmodified
* image. When deleting a snapshot file (see ffs_snapremove above), we
* must push any of these claimed blocks to one of the other snapshots
* that maps it. These claimed blocks are easily identified as they will
* have a block number equal to their logical block number within the
* snapshot. A copied block can never have this property because they
* must always have been allocated from a BLK_NOCOPY location.
*/
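/*
* Illustrative example (hypothetical numbers): if logical block 100 of a
* snapshot maps to exactly ffs_blkstofrags(fs, 100), the block was claimed
* on a free rather than copied; ffs_snapremove() above relies on exactly
* this equality to tell claimed blocks from copied ones.
*/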
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
long size, ino_t inum)
{
struct mount *mp = spec_node_getmountedfs(devvp);
struct buf *ibp;
struct inode *ip;
struct vnode *vp = NULL;
struct snap_info *si;
void *saved_data = NULL;
daddr_t lbn;
daddr_t blkno;
uint32_t gen;
int indiroff = 0, error = 0, claimedblk = 0;
si = VFSTOUFS(mp)->um_snapinfo;
lbn = ffs_fragstoblks(fs, bno);
mutex_enter(&si->si_snaplock);
mutex_enter(&si->si_lock);
si->si_owner = curlwp;
retry:
gen = si->si_gen;
TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
vp = ITOV(ip);
/*
* Lookup block being written.
*/
if (lbn < UFS_NDADDR) {
blkno = db_get(ip, lbn);
} else {
mutex_exit(&si->si_lock);
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
if (error) {
mutex_enter(&si->si_lock);
break;
}
indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs);
blkno = idb_get(ip, ibp->b_data, indiroff);
mutex_enter(&si->si_lock);
if (gen != si->si_gen) {
brelse(ibp, 0);
goto retry;
}
}
/*
* Check to see if block needs to be copied.
*/
if (blkno == 0) {
/*
* A block that we map is being freed. If it has not
* been claimed yet, we will claim or copy it (below).
*/
claimedblk = 1;
} else if (blkno == BLK_SNAP) {
/*
* No previous snapshot claimed the block,
* so it will be freed and become a BLK_NOCOPY
* (don't care) for us.
*/
if (claimedblk)
panic("snapblkfree: inconsistent block type");
if (lbn < UFS_NDADDR) {
db_assign(ip, lbn, BLK_NOCOPY);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
} else {
idb_assign(ip, ibp->b_data, indiroff,
BLK_NOCOPY);
mutex_exit(&si->si_lock);
if (ip->i_nlink > 0)
bwrite(ibp);
else
bdwrite(ibp);
mutex_enter(&si->si_lock);
if (gen != si->si_gen)
goto retry;
}
continue;
} else /* BLK_NOCOPY or default */ {
/*
* If the snapshot has already copied the block
* (default), or does not care about the block,
* it is not needed.
*/
if (lbn >= UFS_NDADDR)
brelse(ibp, 0);
continue;
}
/*
* If this is a full size block, we will just grab it
* and assign it to the snapshot inode. Otherwise we
* will proceed to copy it. See explanation for this
* routine as to why only a single snapshot needs to
* claim this block.
*/
if (size == fs->fs_bsize) {
#ifdef DEBUG
if (snapdebug)
printf("%s %llu lbn %" PRId64
"from inum %llu\n",
"Grabonremove: snapino",
(unsigned long long)ip->i_number,
lbn, (unsigned long long)inum);
#endif
mutex_exit(&si->si_lock);
if (lbn < UFS_NDADDR) {
db_assign(ip, lbn, bno);
} else {
idb_assign(ip, ibp->b_data, indiroff, bno);
if (ip->i_nlink > 0)
bwrite(ibp);
else
bdwrite(ibp);
}
DIP_ADD(ip, blocks, btodb(size));
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (ip->i_nlink > 0 && mp->mnt_wapbl)
error = syncsnap(vp);
else
error = 0;
mutex_enter(&si->si_lock);
si->si_owner = NULL;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
return (error == 0);
}
if (lbn >= UFS_NDADDR)
brelse(ibp, 0);
#ifdef DEBUG
if (snapdebug)
printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
"Copyonremove: snapino ",
(unsigned long long)ip->i_number,
lbn, "for inum", (unsigned long long)inum, size);
#endif
/*
* If we have already read the old block contents, then
* simply copy them to the new block. Note that we need
* to synchronously write snapshots that have not been
* unlinked, and hence will be visible after a crash,
* to ensure their integrity.
*/
mutex_exit(&si->si_lock);
if (saved_data == NULL) {
saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
error = rwfsblk(vp, B_READ, saved_data, lbn);
if (error) {
free(saved_data, M_UFSMNT);
saved_data = NULL;
mutex_enter(&si->si_lock);
break;
}
}
error = wrsnapblk(vp, saved_data, lbn);
if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
error = syncsnap(vp);
mutex_enter(&si->si_lock);
if (error)
break;
if (gen != si->si_gen)
goto retry;
}
si->si_owner = NULL;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
if (saved_data)
free(saved_data, M_UFSMNT);
/*
* If we have been unable to allocate a block in which to do
* the copy, then return non-zero so that the fragment will
* not be freed. Although space will be lost, the snapshot
* will stay consistent.
*/
return (error);
}
/*
* Associate snapshot files when mounting.
*/
void
ffs_snapshot_mount(struct mount *mp)
{
struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct lwp *l = curlwp;
struct vnode *vp;
struct inode *ip, *xp;
struct snap_info *si;
daddr_t snaplistsize, *snapblklist;
int i, error, ns __unused, snaploc, loc;
/*
* No persistent snapshots on apple ufs file systems.
*/
if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
return;
si = VFSTOUFS(mp)->um_snapinfo;
ns = UFS_FSNEEDSWAP(fs);
/*
* XXX The following needs to be set before ffs_truncate or
* VOP_READ can be called.
*/
mp->mnt_stat.f_iosize = fs->fs_bsize;
/*
* Process each snapshot listed in the superblock.
*/
vp = NULL;
mutex_enter(&si->si_lock);
for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
if (fs->fs_snapinum[snaploc] == 0)
break;
if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
LK_EXCLUSIVE, &vp)) != 0) {
printf("ffs_snapshot_mount: vget failed %d\n", error);
continue;
}
ip = VTOI(vp);
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
SF_SNAPSHOT) {
printf("ffs_snapshot_mount: non-snapshot inode %d\n",
fs->fs_snapinum[snaploc]);
vput(vp);
vp = NULL;
for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
if (fs->fs_snapinum[loc] == 0)
break;
fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
}
fs->fs_snapinum[loc - 1] = 0;
snaploc--;
continue;
}
/*
* Read the block hints list. Use an empty list on
* read errors.
*/
error = vn_rdwr(UIO_READ, vp,
(void *)&snaplistsize, sizeof(snaplistsize),
ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
l->l_cred, NULL, NULL);
if (error) {
printf("ffs_snapshot_mount: read_1 failed %d\n", error);
snaplistsize = 1;
} else
snaplistsize = ufs_rw64(snaplistsize, ns);
snapblklist = malloc(
snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
if (error)
snapblklist[0] = 1;
else {
error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
snaplistsize * sizeof(daddr_t),
ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
l->l_cred, NULL, NULL);
for (i = 0; i < snaplistsize; i++)
snapblklist[i] = ufs_rw64(snapblklist[i], ns);
if (error) {
printf("ffs_snapshot_mount: read_2 failed %d\n",
error);
snapblklist[0] = 1;
}
}
ip->i_snapblklist = &snapblklist[0];
/*
* Link it onto the active snapshot list.
*/
if (is_active_snapshot(si, ip))
panic("ffs_snapshot_mount: %"PRIu64" already on list",
ip->i_number);
else
TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
vp->v_vflag |= VV_SYSTEM;
VOP_UNLOCK(vp);
}
/*
* No usable snapshots found.
*/
if (vp == NULL) {
mutex_exit(&si->si_lock);
return;
}
/*
* Attach the block hints list. We always want to
* use the list from the newest snapshot.
*/
xp = TAILQ_LAST(&si->si_snapshots, inodelst);
si->si_snapblklist = xp->i_snapblklist;
fscow_establish(mp, ffs_copyonwrite, devvp);
si->si_gen++;
mutex_exit(&si->si_lock);
}
/*
* Disassociate snapshot files when unmounting.
*/
void
ffs_snapshot_unmount(struct mount *mp)
{
struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
struct inode *xp;
struct vnode *vp = NULL;
struct snap_info *si;
si = VFSTOUFS(mp)->um_snapinfo;
mutex_enter(&si->si_lock);
while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
vp = ITOV(xp);
TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
if (xp->i_snapblklist == si->si_snapblklist)
si->si_snapblklist = NULL;
free(xp->i_snapblklist, M_UFSMNT);
if (xp->i_nlink > 0) {
si->si_gen++;
mutex_exit(&si->si_lock);
vrele(vp);
mutex_enter(&si->si_lock);
}
}
si->si_gen++;
mutex_exit(&si->si_lock);
if (vp)
fscow_disestablish(mp, ffs_copyonwrite, devvp);
}
/*
* Check for need to copy block that is about to be written,
* copying the block if necessary.
*/
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
struct fs *fs;
struct inode *ip;
struct vnode *devvp = v, *vp = NULL;
struct mount *mp = spec_node_getmountedfs(devvp);
struct snap_info *si;
void *saved_data = NULL;
daddr_t lbn, blkno, *snapblklist;
uint32_t gen;
int lower, upper, mid, snapshot_locked = 0, error = 0;
/*
* Check for valid snapshots.
*/
si = VFSTOUFS(mp)->um_snapinfo;
mutex_enter(&si->si_lock);
ip = TAILQ_FIRST(&si->si_snapshots);
if (ip == NULL) {
mutex_exit(&si->si_lock);
return 0;
}
/*
* First check to see if it is after the file system,
* in the journal or in the preallocated list.
* By doing these checks we avoid several potential deadlocks.
*/
fs = ip->i_fs;
lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno));
if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) {
mutex_exit(&si->si_lock);
return 0;
}
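/*
* Writes that land inside the in-filesystem WAPBL journal can be skipped
* early: the journal inode was expunged from every snapshot's view when the
* snapshot was taken (see snapshot_expunge()), so no copy is needed and the
* deadlock-prone lookup below is avoided.
*/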
if ((fs->fs_flags & FS_DOWAPBL) &&
fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
off_t blk_off, log_start, log_end;
log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
blk_off = dbtob(bp->b_blkno);
if (blk_off >= log_start && blk_off < log_end) {
mutex_exit(&si->si_lock);
return 0;
}
}
snapblklist = si->si_snapblklist;
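/*
* si_snapblklist, as built by snapshot_expunge()/snapshot_expunge_snap(),
* keeps the entry count in element 0 and sorted logical block numbers in
* elements 1..snapblklist[0]-1, hence the search bounds below.
*/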
upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
lower = 1;
while (lower <= upper) {
mid = (lower + upper) / 2;
if (snapblklist[mid] == lbn)
break;
if (snapblklist[mid] < lbn)
lower = mid + 1;
else
upper = mid - 1;
}
if (lower <= upper) {
mutex_exit(&si->si_lock);
return 0;
}
/*
* Not in the precomputed list, so check the snapshots.
*/
if (si->si_owner != curlwp) {
if (!mutex_tryenter(&si->si_snaplock)) {
mutex_exit(&si->si_lock);
mutex_enter(&si->si_snaplock);
mutex_enter(&si->si_lock);
}
si->si_owner = curlwp;
snapshot_locked = 1;
}
if (data_valid && bp->b_bcount == fs->fs_bsize)
saved_data = bp->b_data;
retry:
gen = si->si_gen;
TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
vp = ITOV(ip);
/*
* We ensure that everything of our own that needs to be
* copied will be done at the time that ffs_snapshot is
* called. Thus we can skip the check here which can
* deadlock in doing the lookup in ffs_balloc.
*/
if (bp->b_vp == vp)
continue;
/*
* Check to see if block needs to be copied.
*/
if (lbn < UFS_NDADDR) {
blkno = db_get(ip, lbn);
} else {
mutex_exit(&si->si_lock);
blkno = 0; /* XXX: GCC */
if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
mutex_enter(&si->si_lock);
break;
}
mutex_enter(&si->si_lock);
if (gen != si->si_gen)
goto retry;
}
KASSERTMSG((blkno != BLK_SNAP || bp->b_lblkno < 0),
"ffs_copyonwrite: bad copy block: blkno %jd, lblkno %jd",
(intmax_t)blkno, (intmax_t)bp->b_lblkno);
if (blkno != 0)
continue;
if (curlwp == uvm.pagedaemon_lwp) {
error = ENOMEM;
break;
}
/* Only one level of recursion allowed. */
KASSERT(snapshot_locked);
/*
* Allocate the block into which to do the copy. Since
* multiple processes may all try to copy the same block,
* we have to recheck our need to do a copy if we sleep
* waiting for the lock.
*
* Because all snapshots on a filesystem share a single
* lock, we ensure that we will never be in competition
* with another process to allocate a block.
*/
#ifdef DEBUG
if (snapdebug) {
printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
(unsigned long long)ip->i_number, lbn);
if (bp->b_vp == devvp)
printf("fs metadata");
else
printf("inum %llu", (unsigned long long)
VTOI(bp->b_vp)->i_number);
printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
}
#endif
/*
* If we have already read the old block contents, then
* simply copy them to the new block. Note that we need
* to synchronously write snapshots that have not been
* unlinked, and hence will be visible after a crash,
* to ensure their integrity.
*/
mutex_exit(&si->si_lock);
if (saved_data == NULL) {
saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
error = rwfsblk(vp, B_READ, saved_data, lbn);
if (error) {
free(saved_data, M_UFSMNT);
saved_data = NULL;
mutex_enter(&si->si_lock);
break;
}
}
error = wrsnapblk(vp, saved_data, lbn);
if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
error = syncsnap(vp);
mutex_enter(&si->si_lock);
if (error)
break;
if (gen != si->si_gen)
goto retry;
}
/*
* Note that we need to synchronously write snapshots that
* have not been unlinked, and hence will be visible after
* a crash, to ensure their integrity.
*/
if (snapshot_locked) {
si->si_owner = NULL;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
} else
mutex_exit(&si->si_lock);
if (saved_data && saved_data != bp->b_data)
free(saved_data, M_UFSMNT);
return error;
}
/*
* Read from a snapshot.
*/
int
ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
{
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
struct buf *bp;
daddr_t lbn, nextlbn;
off_t fsbytes, bytesinfile;
long size, xfersize, blkoffset;
int error;
mutex_enter(&si->si_snaplock);
if (ioflag & IO_ALTSEMANTICS)
fsbytes = ip->i_size;
else
fsbytes = ffs_lfragtosize(fs, fs->fs_size);
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
bytesinfile = fsbytes - uio->uio_offset;
if (bytesinfile <= 0)
break;
lbn = ffs_lblkno(fs, uio->uio_offset);
nextlbn = lbn + 1;
size = fs->fs_bsize;
blkoffset = ffs_blkoff(fs, uio->uio_offset);
xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
bytesinfile);
if (ffs_lblktosize(fs, nextlbn + 1) >= fsbytes) {
if (ffs_lblktosize(fs, lbn) + size > fsbytes)
size = ffs_fragroundup(fs,
fsbytes - ffs_lblktosize(fs, lbn));
error = bread(vp, lbn, size, 0, &bp);
} else {
int nextsize = fs->fs_bsize;
error = breadn(vp, lbn,
size, &nextlbn, &nextsize, 1, 0, &bp);
}
if (error)
break;
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
* then we want to ensure that we do not uiomove bad
* or uninitialized data.
*/
size -= bp->b_resid;
if (size < blkoffset + xfersize) {
xfersize = size - blkoffset;
if (xfersize <= 0)
break;
}
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
if (error)
break;
brelse(bp, BC_AGE);
}
if (bp != NULL)
brelse(bp, BC_AGE);
mutex_exit(&si->si_snaplock);
return error;
}
/*
* Lookup a snapshots data block address.
* Simpler than UFS_BALLOC() as we know all metadata is already allocated
* and safe even for the pagedaemon where we cannot bread().
*/
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
struct indir indirs[UFS_NIADDR + 2];
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
struct buf *bp;
int error, num;
KASSERT(lbn >= 0);
if (lbn < UFS_NDADDR) {
*res = db_get(ip, lbn);
return 0;
}
if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
return error;
if (curlwp == uvm.pagedaemon_lwp) {
mutex_enter(&bufcache_lock);
bp = incore(vp, indirs[num-1].in_lbn);
if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
error = 0;
} else
error = ENOMEM;
mutex_exit(&bufcache_lock);
return error;
}
error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, 0, &bp);
if (error == 0) {
*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
brelse(bp, 0);
}
return error;
}
/*
* Read or write the specified block of the filesystem vp resides on
* from or to the disk bypassing the buffer cache.
*/
static int
rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
{
int error;
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
struct buf *nbp;
nbp = getiobuf(NULL, true);
nbp->b_flags = flags;
nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
nbp->b_error = 0;
nbp->b_data = data;
nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn));
nbp->b_proc = NULL;
nbp->b_dev = ip->i_devvp->v_rdev;
SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */
bdev_strategy(nbp);
error = biowait(nbp);
putiobuf(nbp);
return error;
}
/*
* Write all dirty buffers to disk and invalidate them.
*/
static int
syncsnap(struct vnode *vp)
{
int error;
buf_t *bp;
struct fs *fs = VTOI(vp)->i_fs;
mutex_enter(&bufcache_lock);
while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
error = bbusy(bp, false, 0, NULL);
if (error == EPASSTHROUGH)
continue;
else if (error != 0) {
mutex_exit(&bufcache_lock);
return error;
}
KASSERT(bp->b_bcount == fs->fs_bsize);
mutex_exit(&bufcache_lock);
error = rwfsblk(vp, B_WRITE, bp->b_data,
ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)));
brelse(bp, BC_INVAL | BC_VFLUSH);
if (error)
return error;
mutex_enter(&bufcache_lock);
}
mutex_exit(&bufcache_lock);
return 0;
}
/*
* Write the specified block to a snapshot.
*/
static int
wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
{
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
struct buf *bp;
int error;
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize,
FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
if (error)
return error;
memcpy(bp->b_data, data, fs->fs_bsize);
if (ip->i_nlink > 0)
error = bwrite(bp);
else
bawrite(bp);
return error;
}
/*
* Check if this inode is present on the active snapshot list.
* Must be called with snapinfo locked.
*/
static inline bool
is_active_snapshot(struct snap_info *si, struct inode *ip)
{
struct inode *xp;
KASSERT(mutex_owned(&si->si_lock));
TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
if (xp == ip)
return true;
return false;
}
/*
* Get/Put direct block from inode or buffer containing disk addresses. Take
* care for fs type (UFS1/UFS2) and byte swapping. These functions should go
* into a global include.
*/
static inline daddr_t
db_get(struct inode *ip, int loc)
{
if (ip->i_ump->um_fstype == UFS1)
return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
else
return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
}
static inline void
db_assign(struct inode *ip, int loc, daddr_t val)
{
if (ip->i_ump->um_fstype == UFS1)
ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
else
ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}
__unused static inline daddr_t
ib_get(struct inode *ip, int loc)
{
if (ip->i_ump->um_fstype == UFS1)
return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
else
return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
}
static inline daddr_t
idb_get(struct inode *ip, void *bf, int loc)
{
if (ip->i_ump->um_fstype == UFS1)
return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
else
return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
}
static inline void
idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
{
if (ip->i_ump->um_fstype == UFS1)
((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
else
((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}
/* $NetBSD: ufs_wapbl.h,v 1.19 2020/04/11 17:43:54 jdolecek Exp $ */
/*-
* Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _UFS_UFS_UFS_WAPBL_H_
#define _UFS_UFS_UFS_WAPBL_H_
#if defined(_KERNEL_OPT)
#include "opt_wapbl.h"
#endif
/*
* Information for the journal location stored in the superblock.
* We store the journal version, some flags, the journal location
* type, and some location specific "locators" that identify where
* the log itself is located.
*/
/* fs->fs_journal_version */
#define UFS_WAPBL_VERSION 1
/* fs->fs_journal_location */
#define UFS_WAPBL_JOURNALLOC_NONE 0
#define UFS_WAPBL_JOURNALLOC_END_PARTITION 1
#define UFS_WAPBL_EPART_ADDR 0 /* locator slots */
#define UFS_WAPBL_EPART_COUNT 1
#define UFS_WAPBL_EPART_BLKSZ 2
#define UFS_WAPBL_EPART_UNUSED 3
#define UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM 2
#define UFS_WAPBL_INFS_ADDR 0 /* locator slots */
#define UFS_WAPBL_INFS_COUNT 1
#define UFS_WAPBL_INFS_BLKSZ 2
#define UFS_WAPBL_INFS_INO 3
/* fs->fs_journal_flags */
#define UFS_WAPBL_FLAGS_CREATE_LOG 0x1
#define UFS_WAPBL_FLAGS_CLEAR_LOG 0x2
/*
* The journal size is limited to between 1MB and 64MB.
* The default journal size is the filesystem size divided by
* the scale factor - this is 1M of journal per 1GB of filesystem
* space.
*
* XXX: Is 64MB too limiting? If user explicitly asks for more, allow it?
*/
#define UFS_WAPBL_JOURNAL_SCALE 1024
#define UFS_WAPBL_MIN_JOURNAL_SIZE (1024 * 1024)
#define UFS_WAPBL_MAX_JOURNAL_SIZE (64 * 1024 * 1024)
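/*
* Example with these constants: a 16 GB filesystem gets 16 GB / 1024 = 16 MB
* of journal, well inside the [1 MB, 64 MB] clamp; a 512 MB filesystem would
* compute 512 KB and be raised to the 1 MB minimum.
*/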
#if defined(WAPBL)
static __inline int
ufs_wapbl_begin(struct mount *mp, const char *file, int line)
{
if (mp->mnt_wapbl) {
int error;
error = wapbl_begin(mp->mnt_wapbl, file, line);
if (error)
return error;
}
return 0;
}
static __inline void
ufs_wapbl_end(struct mount *mp)
{
if (mp->mnt_wapbl) {
wapbl_end(mp->mnt_wapbl);
}
}
#define UFS_WAPBL_BEGIN(mp) \
ufs_wapbl_begin(mp, __func__, __LINE__)
#define UFS_WAPBL_END(mp) ufs_wapbl_end(mp)
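/*
* Typical usage, mirroring the callers in ffs_snapshot.c above:
*
*	error = UFS_WAPBL_BEGIN(mp);
*	if (error)
*		return error;
*	... journalled metadata updates ...
*	UFS_WAPBL_END(mp);
*/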
#define UFS_WAPBL_UPDATE(vp, access, modify, flags) \
if ((vp)->v_mount->mnt_wapbl) { \
UFS_UPDATE(vp, access, modify, flags); \
}
#ifdef DIAGNOSTIC
#define UFS_WAPBL_JLOCK_ASSERT(mp) \
if (mp->mnt_wapbl) wapbl_jlock_assert(mp->mnt_wapbl)
#define UFS_WAPBL_JUNLOCK_ASSERT(mp) \
if (mp->mnt_wapbl) wapbl_junlock_assert(mp->mnt_wapbl)
#else
#define UFS_WAPBL_JLOCK_ASSERT(mp)
#define UFS_WAPBL_JUNLOCK_ASSERT(mp)
#endif
#define UFS_WAPBL_REGISTER_INODE(mp, ino, mode) \
if (mp->mnt_wapbl) wapbl_register_inode(mp->mnt_wapbl, ino, mode)
#define UFS_WAPBL_UNREGISTER_INODE(mp, ino, mode) \
if (mp->mnt_wapbl) wapbl_unregister_inode(mp->mnt_wapbl, ino, mode)
#define UFS_WAPBL_REGISTER_DEALLOCATION(mp, blk, len, cookiep) \
( \
(mp->mnt_wapbl) \
? wapbl_register_deallocation(mp->mnt_wapbl, blk, len, \
false, cookiep) \
: 0 \
)
#define UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(mp, blk, len) \
( \
(mp->mnt_wapbl) \
? wapbl_register_deallocation(mp->mnt_wapbl, blk, len, \
true, NULL) \
: 0 \
)
#define UFS_WAPBL_UNREGISTER_DEALLOCATION(mp, cookie) \
if (mp->mnt_wapbl) wapbl_unregister_deallocation(mp->mnt_wapbl, cookie)
#else /* ! WAPBL */
#define UFS_WAPBL_BEGIN(mp) (__USE(mp), 0)
#define UFS_WAPBL_END(mp) do { } while (0)
#define UFS_WAPBL_UPDATE(vp, access, modify, flags) do { } while (0)
#define UFS_WAPBL_JLOCK_ASSERT(mp)
#define UFS_WAPBL_JUNLOCK_ASSERT(mp)
#define UFS_WAPBL_REGISTER_INODE(mp, ino, mode) do { } while (0)
#define UFS_WAPBL_UNREGISTER_INODE(mp, ino, mode) do { } while (0)
#define UFS_WAPBL_REGISTER_DEALLOCATION(mp, blk, len, cookiep) 0
#define UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(mp, blk, len) 0
#define UFS_WAPBL_UNREGISTER_DEALLOCATION(mp, cookie) do { } while (0)
#endif
#endif /* !_UFS_UFS_UFS_WAPBL_H_ */
/* $NetBSD: vfs_mount.c,v 1.105 2024/04/19 00:45:41 riastradh Exp $ */
/*-
* Copyright (c) 1997-2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_mount.c,v 1.105 2024/04/19 00:45:41 riastradh Exp $");
#include "veriexec.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/fstrans.h>
#include <sys/namei.h>
#include <sys/extattr.h>
#include <sys/verified_exec.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vfs_syscalls.h>
#include <sys/vnode_impl.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm_swap.h>
enum mountlist_type {
ME_MOUNT,
ME_MARKER
};
struct mountlist_entry {
TAILQ_ENTRY(mountlist_entry) me_list; /* Mount list. */
struct mount *me_mount; /* Actual mount if ME_MOUNT,
otherwise the iterator's current mount. */
enum mountlist_type me_type; /* Mount or marker. */
};
struct mount_iterator {
struct mountlist_entry mi_entry;
};
static struct vnode *vfs_vnode_iterator_next1(struct vnode_iterator *,
bool (*)(void *, struct vnode *), void *, bool);
/* Root filesystem. */
vnode_t * rootvnode;
/* Mounted filesystem list. */
static TAILQ_HEAD(mountlist, mountlist_entry) mountlist;
static kmutex_t mountlist_lock __cacheline_aligned;
int vnode_offset_next_by_lru /* XXX: ugly hack for pstat.c */
= offsetof(vnode_impl_t, vi_lrulist.tqe_next);
kmutex_t vfs_list_lock __cacheline_aligned;
static specificdata_domain_t mount_specificdata_domain;
static kmutex_t mntid_lock;
static kmutex_t mountgen_lock __cacheline_aligned;
static uint64_t mountgen;
void
vfs_mount_sysinit(void)
{
TAILQ_INIT(&mountlist);
mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
mount_specificdata_domain = specificdata_domain_create();
mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
mountgen = 0;
}
struct mount *
vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp)
{
struct mount *mp;
int error __diagused;
mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
mp->mnt_op = vfsops;
mp->mnt_refcnt = 1;
TAILQ_INIT(&mp->mnt_vnodelist);
mp->mnt_renamelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
mp->mnt_vnodelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
mp->mnt_updating = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
mp->mnt_vnodecovered = vp;
mount_initspecific(mp);
error = fstrans_mount(mp);
KASSERT(error == 0);
mutex_enter(&mountgen_lock);
mp->mnt_gen = mountgen++;
mutex_exit(&mountgen_lock);
return mp;
}
/*
* vfs_rootmountalloc: lookup a filesystem type, and if found allocate and
* initialize a mount structure for it.
*
* Devname is usually updated by mount(8) after booting.
*/
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
struct mount **mpp)
{
struct vfsops *vfsp = NULL;
struct mount *mp;
int error __diagused;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(vfsp, &vfs_list, vfs_list)
if (!strncmp(vfsp->vfs_name, fstypename,
sizeof(mp->mnt_stat.f_fstypename)))
break;
if (vfsp == NULL) {
mutex_exit(&vfs_list_lock);
return (ENODEV);
}
vfsp->vfs_refcount++;
mutex_exit(&vfs_list_lock);
if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
return ENOMEM;
error = vfs_busy(mp);
KASSERT(error == 0);
mp->mnt_flag = MNT_RDONLY;
(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
sizeof(mp->mnt_stat.f_fstypename));
mp->mnt_stat.f_mntonname[0] = '/';
mp->mnt_stat.f_mntonname[1] = '\0';
mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
'\0';
(void)copystr(devname, mp->mnt_stat.f_mntfromname,
sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
*mpp = mp;
return 0;
}
/*
* vfs_getnewfsid: get a new unique fsid.
*/
void
vfs_getnewfsid(struct mount *mp)
{
static u_short xxxfs_mntid;
struct mountlist_entry *me;
fsid_t tfsid;
int mtype;
mutex_enter(&mntid_lock);
if (xxxfs_mntid == 0)
++xxxfs_mntid;
mtype = makefstype(mp->mnt_op->vfs_name);
tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
tfsid.__fsid_val[1] = mtype;
/* Always increment to not return the same fsid to parallel mounts. */
xxxfs_mntid++;
/*
* Directly walk mountlist to prevent deadlock through
* mountlist_iterator_next() -> vfs_busy().
*/
mutex_enter(&mountlist_lock);
for (me = TAILQ_FIRST(&mountlist); me != TAILQ_END(&mountlist); ) {
if (me->me_type == ME_MOUNT &&
me->me_mount->mnt_stat.f_fsidx.__fsid_val[0] ==
tfsid.__fsid_val[0] &&
me->me_mount->mnt_stat.f_fsidx.__fsid_val[1] ==
tfsid.__fsid_val[1]) {
tfsid.__fsid_val[0]++;
xxxfs_mntid++;
me = TAILQ_FIRST(&mountlist);
} else {
me = TAILQ_NEXT(me, me_list);
}
}
mutex_exit(&mountlist_lock);
mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
mp->mnt_stat.f_fsidx.__fsid_val[1] = tfsid.__fsid_val[1];
mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
mutex_exit(&mntid_lock);
}
/*
* Lookup a mount point by filesystem identifier.
*
* XXX Needs to add a reference to the mount point.
*/
struct mount *
vfs_getvfs(fsid_t *fsid)
{
mount_iterator_t *iter;
struct mount *mp;
mountlist_iterator_init(&iter);
while ((mp = mountlist_iterator_next(iter)) != NULL) {
if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
mountlist_iterator_destroy(iter);
return mp;
}
}
mountlist_iterator_destroy(iter);
return NULL;
}
/*
* Take a reference to a mount structure.
*/
void
vfs_ref(struct mount *mp)
{
KASSERT(mp->mnt_refcnt > 0 || mutex_owned(&mountlist_lock));
atomic_inc_uint(&mp->mnt_refcnt);
}
/*
* Drop a reference to a mount structure, freeing if the last reference.
*/
void
vfs_rele(struct mount *mp)
{
membar_release();
if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
return;
}
membar_acquire();
/*
* Nothing else has visibility of the mount: we can now
* free the data structures.
*/
KASSERT(mp->mnt_refcnt == 0);
specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
mutex_obj_free(mp->mnt_updating);
mutex_obj_free(mp->mnt_renamelock);
mutex_obj_free(mp->mnt_vnodelock);
if (mp->mnt_op != NULL) {
vfs_delref(mp->mnt_op);
}
fstrans_unmount(mp);
/*
* The final free of mp is done from fstrans_mount_dtor().
*
* This prevents the memory from being reused as a mount before
* fstrans releases all references to it.
*/
}
/*
* Mark a mount point as busy, and gain a new reference to it. Used to
* prevent the file system from being unmounted during critical sections.
*
* vfs_busy can be called multiple times and by multiple threads
* and must be accompanied by the same number of vfs_unbusy calls.
*
* => The caller must hold a pre-existing reference to the mount.
* => Will fail if the file system is being unmounted, or is unmounted.
*/
static inline int
_vfs_busy(struct mount *mp, bool wait)
{
KASSERT(mp->mnt_refcnt > 0);
if (wait) {
fstrans_start(mp);
} else {
if (fstrans_start_nowait(mp))
return EBUSY;
}
if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
fstrans_done(mp);
return ENOENT;
}
vfs_ref(mp);
return 0;
}
int
vfs_busy(struct mount *mp)
{
return _vfs_busy(mp, true);
}
int
vfs_trybusy(struct mount *mp)
{
return _vfs_busy(mp, false);
}
/*
* Unbusy a busy filesystem.
*
* Every successful vfs_busy() call must be undone by a vfs_unbusy() call.
*/
void
vfs_unbusy(struct mount *mp)
{
KASSERT(mp->mnt_refcnt > 0);
fstrans_done(mp);
vfs_rele(mp);
}
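/*
* Usage sketch of the pairing described above, assuming the caller
* already holds a reference (for example from vfs_ref() or a mountlist
* iterator):
*
*	vfs_ref(mp);
*	if (vfs_busy(mp) == 0) {
*		// ... the file system cannot be unmounted here ...
*		vfs_unbusy(mp);
*	}
*	vfs_rele(mp);
*/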
/*
* Change a file system's lower mount.
* Both the current and the new lower mount may be NULL. The caller
* guarantees exclusive access to the mount and holds a pre-existing
* reference to the new lower mount.
*/
int
vfs_set_lowermount(struct mount *mp, struct mount *lowermp)
{
struct mount *oldlowermp;
int error;
#ifdef DEBUG
/*
* Limit the depth of file system stack so kernel sanitizers
* may stress mount/unmount without exhausting the kernel stack.
*/
int depth;
struct mount *mp2;
for (depth = 0, mp2 = lowermp; mp2; depth++, mp2 = mp2->mnt_lower) {
if (depth == 23)
return EINVAL;
}
#endif
if (lowermp) {
if (lowermp == dead_rootmount)
return ENOENT;
error = vfs_busy(lowermp);
if (error)
return error;
vfs_ref(lowermp);
}
oldlowermp = mp->mnt_lower;
mp->mnt_lower = lowermp;
if (lowermp)
vfs_unbusy(lowermp);
if (oldlowermp)
vfs_rele(oldlowermp);
return 0;
}
struct vnode_iterator {
vnode_impl_t vi_vnode;
};
void
vfs_vnode_iterator_init(struct mount *mp, struct vnode_iterator **vnip)
{
vnode_t *vp;
vnode_impl_t *vip;
vp = vnalloc_marker(mp);
vip = VNODE_TO_VIMPL(vp);
mutex_enter(mp->mnt_vnodelock);
TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vip, vi_mntvnodes);
vp->v_usecount = 1;
mutex_exit(mp->mnt_vnodelock);
*vnip = (struct vnode_iterator *)vip;
}
void
vfs_vnode_iterator_destroy(struct vnode_iterator *vni)
{
vnode_impl_t *mvip = &vni->vi_vnode;
vnode_t *mvp = VIMPL_TO_VNODE(mvip);
kmutex_t *lock;
KASSERT(vnis_marker(mvp));
if (vrefcnt(mvp) != 0) {
lock = mvp->v_mount->mnt_vnodelock;
mutex_enter(lock);
TAILQ_REMOVE(&mvp->v_mount->mnt_vnodelist, mvip, vi_mntvnodes);
mvp->v_usecount = 0;
mutex_exit(lock);
}
vnfree_marker(mvp);
}
static struct vnode *
vfs_vnode_iterator_next1(struct vnode_iterator *vni,
bool (*f)(void *, struct vnode *), void *cl, bool do_wait)
{
vnode_impl_t *mvip = &vni->vi_vnode;
struct mount *mp = VIMPL_TO_VNODE(mvip)->v_mount;
vnode_t *vp;
vnode_impl_t *vip;
kmutex_t *lock;
int error;
KASSERT(vnis_marker(VIMPL_TO_VNODE(mvip)));
lock = mp->mnt_vnodelock;
do {
mutex_enter(lock);
vip = TAILQ_NEXT(mvip, vi_mntvnodes);
TAILQ_REMOVE(&mp->mnt_vnodelist, mvip, vi_mntvnodes);
VIMPL_TO_VNODE(mvip)->v_usecount = 0;
again:
if (vip == NULL) {
mutex_exit(lock);
return NULL;
}
vp = VIMPL_TO_VNODE(vip);
KASSERT(vp != NULL);
mutex_enter(vp->v_interlock);
if (vnis_marker(vp) ||
vdead_check(vp, (do_wait ? 0 : VDEAD_NOWAIT)) ||
(f && !(*f)(cl, vp))) {
mutex_exit(vp->v_interlock);
vip = TAILQ_NEXT(vip, vi_mntvnodes);
goto again;
}
TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vip, mvip, vi_mntvnodes);
VIMPL_TO_VNODE(mvip)->v_usecount = 1;
mutex_exit(lock);
error = vcache_vget(vp);
KASSERT(error == 0 || error == ENOENT);
} while (error != 0);
return vp;
}
struct vnode *
vfs_vnode_iterator_next(struct vnode_iterator *vni,
bool (*f)(void *, struct vnode *), void *cl)
{
return vfs_vnode_iterator_next1(vni, f, cl, false);
}
/*
* Move a vnode from one mount queue to another.
*/
void
vfs_insmntque(vnode_t *vp, struct mount *mp)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
struct mount *omp;
kmutex_t *lock;
KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 ||
vp->v_tag == VT_VFS);
/*
* Delete from old mount point vnode list, if on one.
*/
if ((omp = vp->v_mount) != NULL) {
lock = omp->mnt_vnodelock;
mutex_enter(lock);
TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vip, vi_mntvnodes);
mutex_exit(lock);
}
/*
* Insert into list of vnodes for the new mount point, if
* available. The caller must take a reference on the mount
* structure and donate it to the vnode.
*/
if ((vp->v_mount = mp) != NULL) {
lock = mp->mnt_vnodelock;
mutex_enter(lock);
TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vip, vi_mntvnodes);
mutex_exit(lock);
}
if (omp != NULL) {
/* Release reference to old mount. */
vfs_rele(omp);
}
}
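/*
* Sketch of the calling convention described above when moving a vnode
* onto a mount: the caller takes a reference on the mount and donates it
* to the vnode, while the reference on the old mount is dropped by
* vfs_insmntque() itself:
*
*	vfs_ref(mp);
*	vfs_insmntque(vp, mp);
*/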
/*
* Remove any vnodes in the vnode table belonging to mount point mp.
*
* If FORCECLOSE is not specified, there should not be any active vnodes;
* return an error if any are found (nb: this is a user error, not a
* return error if any are found (nb: this is a user error, not a
* system error). If FORCECLOSE is specified, detach any active vnodes
* that are found.
*
* If WRITECLOSE is set, only flush out regular file vnodes open for
* writing.
*
* SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
*/
#ifdef DEBUG
int busyprt = 0; /* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif
static vnode_t *
vflushnext(struct vnode_iterator *marker, int *when)
{
if (getticks() > *when) {
yield();
*when = getticks() + hz / 10;
}
preempt_point();
return vfs_vnode_iterator_next1(marker, NULL, NULL, true);
}
/*
* Flush one vnode. Referenced on entry, unreferenced on return.
*/
static int
vflush_one(vnode_t *vp, vnode_t *skipvp, int flags)
{
int error;
struct vattr vattr;
if (vp == skipvp ||
((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM))) {
vrele(vp);
return 0;
}
/*
* If WRITECLOSE is set, only flush out regular file
* vnodes open for writing or open and unlinked.
*/
if ((flags & WRITECLOSE)) {
if (vp->v_type != VREG) {
vrele(vp);
return 0;
}
error = vn_lock(vp, LK_EXCLUSIVE);
if (error) {
KASSERT(error == ENOENT);
vrele(vp);
return 0;
}
error = VOP_FSYNC(vp, curlwp->l_cred, FSYNC_WAIT, 0, 0);
if (error == 0)
error = VOP_GETATTR(vp, &vattr, curlwp->l_cred);
VOP_UNLOCK(vp);
if (error) {
vrele(vp);
return error;
}
if (vp->v_writecount == 0 && vattr.va_nlink > 0) {
vrele(vp);
return 0;
}
}
/*
* First try to recycle the vnode.
*/
if (vrecycle(vp))
return 0;
/*
* If FORCECLOSE is set, forcibly close the vnode.
* For block or character devices, revert to an
* anonymous device. For all other files, just
* kill them.
*/
if (flags & FORCECLOSE) {
if (vrefcnt(vp) > 1 &&
(vp->v_type == VBLK || vp->v_type == VCHR))
vcache_make_anon(vp);
else
vgone(vp);
return 0;
}
vrele(vp);
return EBUSY;
}
int
vflush(struct mount *mp, vnode_t *skipvp, int flags)
{
vnode_t *vp;
struct vnode_iterator *marker;
int busy, error, when, retries = 2;
do {
busy = error = when = 0;
/*
* First, flush out any vnode references from the
* deferred vrele list.
*/
vrele_flush(mp);
vfs_vnode_iterator_init(mp, &marker);
while ((vp = vflushnext(marker, &when)) != NULL) {
error = vflush_one(vp, skipvp, flags);
if (error == EBUSY) {
error = 0;
busy++;
#ifdef DEBUG
if (busyprt && retries == 0)
vprint("vflush: busy vnode", vp);
#endif
} else if (error != 0) {
break;
}
}
vfs_vnode_iterator_destroy(marker);
} while (error == 0 && busy > 0 && retries-- > 0);
if (error)
return error;
if (busy)
return EBUSY;
return 0;
}
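/*
* Sketch of a hypothetical caller: a file system's unmount path would
* typically flush its vnodes along these lines, requesting FORCECLOSE
* only for forced unmounts (names and error handling are illustrative):
*
*	int flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
*
*	error = vflush(mp, NULLVP, flags);
*	if (error)
*		return error;
*/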
/*
* Mount a file system.
*/
/*
* Scan all active processes to see if any of them have a current or root
* directory onto which the new filesystem has just been mounted. If so,
* replace them with the new mount point.
*/
static void
mount_checkdirs(vnode_t *olddp)
{
vnode_t *newdp, *rele1, *rele2;
struct cwdinfo *cwdi;
struct proc *p;
bool retry;
if (vrefcnt(olddp) == 1) {
return;
}
if (VFS_ROOT(olddp->v_mountedhere, LK_EXCLUSIVE, &newdp))
panic("mount: lost mount");
do {
retry = false;
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
if ((cwdi = p->p_cwdi) == NULL)
continue;
/*
* Cannot change to the old directory any more,
* so even if we see a stale value it is not a
* problem.
*/
if (cwdi->cwdi_cdir != olddp &&
cwdi->cwdi_rdir != olddp)
continue;
retry = true;
rele1 = NULL;
rele2 = NULL;
atomic_inc_uint(&cwdi->cwdi_refcnt);
mutex_exit(&proc_lock);
rw_enter(&cwdi->cwdi_lock, RW_WRITER);
if (cwdi->cwdi_cdir == olddp) {
rele1 = cwdi->cwdi_cdir;
vref(newdp);
cwdi->cwdi_cdir = newdp;
}
if (cwdi->cwdi_rdir == olddp) {
rele2 = cwdi->cwdi_rdir;
vref(newdp);
cwdi->cwdi_rdir = newdp;
}
rw_exit(&cwdi->cwdi_lock);
cwdfree(cwdi);
if (rele1 != NULL)
vrele(rele1);
if (rele2 != NULL)
vrele(rele2);
mutex_enter(&proc_lock);
break;
}
mutex_exit(&proc_lock);
} while (retry);
if (rootvnode == olddp) {
vrele(rootvnode);
vref(newdp);
rootvnode = newdp;
}
vput(newdp);
}
/*
* Start extended attributes
*/
static int
start_extattr(struct mount *mp)
{
int error;
error = VFS_EXTATTRCTL(mp, EXTATTR_CMD_START, NULL, 0, NULL);
if (error)
printf("%s: failed to start extattr: error = %d\n",
mp->mnt_stat.f_mntonname, error);
return error;
}
int
mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops,
const char *path, int flags, void *data, size_t *data_len)
{
vnode_t *vp = *vpp;
struct mount *mp;
struct pathbuf *pb;
struct nameidata nd;
int error, error2;
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
if (error) {
vfs_delref(vfsops);
return error;
}
/* Cannot make a non-dir a mount-point (from here anyway). */
if (vp->v_type != VDIR) {
vfs_delref(vfsops);
return ENOTDIR;
}
if (flags & MNT_EXPORTED) {
vfs_delref(vfsops);
return EINVAL;
}
if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) {
vfs_delref(vfsops);
return ENOMEM;
}
mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
/*
* The underlying file system may refuse the mount for
* various reasons. Allow the user to force it to happen.
*
* Set the mount level flags.
*/
mp->mnt_flag = flags & (MNT_BASIC_FLAGS | MNT_FORCE | MNT_IGNORE);
error = VFS_MOUNT(mp, path, data, data_len);
mp->mnt_flag &= ~MNT_OP_FLAGS;
if (error != 0) {
vfs_rele(mp);
return error;
}
/* Suspend new file system before taking mnt_updating. */
do {
error2 = vfs_suspend(mp, 0);
} while (error2 == EINTR || error2 == ERESTART);
KASSERT(error2 == 0 || error2 == EOPNOTSUPP);
mutex_enter(mp->mnt_updating);
/*
* Validate and prepare the mount point.
*/
error = pathbuf_copyin(path, &pb);
if (error != 0) {
goto err_mounted;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
error = namei(&nd);
pathbuf_destroy(pb);
if (error != 0) {
goto err_mounted;
}
if (nd.ni_vp != vp) {
vput(nd.ni_vp);
error = EINVAL;
goto err_mounted;
}
if (vp->v_mountedhere != NULL) {
vput(nd.ni_vp);
error = EBUSY;
goto err_mounted;
}
error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0);
if (error != 0) {
vput(nd.ni_vp);
goto err_mounted;
}
/*
* Put the new filesystem on the mount list after root.
*/
cache_purge(vp);
mp->mnt_iflag &= ~IMNT_WANTRDWR;
mountlist_append(mp);
if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
vfs_syncer_add_to_worklist(mp);
vp->v_mountedhere = mp;
vput(nd.ni_vp);
mount_checkdirs(vp);
mutex_exit(mp->mnt_updating);
if (error2 == 0)
vfs_resume(mp);
/* Hold an additional reference to the mount across VFS_START(). */
vfs_ref(mp);
(void) VFS_STATVFS(mp, &mp->mnt_stat);
error = VFS_START(mp, 0);
if (error) {
vrele(vp);
} else if (flags & MNT_EXTATTR) {
if (start_extattr(mp) != 0)
mp->mnt_flag &= ~MNT_EXTATTR;
}
/* Drop reference held for VFS_START(). */
vfs_rele(mp);
*vpp = NULL;
return error;
err_mounted:
if (VFS_UNMOUNT(mp, MNT_FORCE) != 0)
panic("Unmounting fresh file system failed");
mutex_exit(mp->mnt_updating);
if (error2 == 0)
vfs_resume(mp);
vfs_set_lowermount(mp, NULL);
vfs_rele(mp);
return error;
}
/*
* Do the actual file system unmount. File system is assumed to have
* been locked by the caller.
*
* => The caller holds a reference to the mount, explicitly for dounmount().
*/
int
dounmount(struct mount *mp, int flags, struct lwp *l)
{
struct vnode *coveredvp, *vp;
struct vnode_impl *vip;
int error, async, used_syncer, used_extattr;
const bool was_suspended = fstrans_is_owner(mp);
#if NVERIEXEC > 0
error = veriexec_unmountchk(mp);
if (error)
return (error);
#endif /* NVERIEXEC > 0 */
if (!was_suspended) {
error = vfs_suspend(mp, 0);
if (error) {
return error;
}
}
KASSERT((mp->mnt_iflag & IMNT_GONE) == 0);
used_syncer = (mp->mnt_iflag & IMNT_ONWORKLIST) != 0;
used_extattr = mp->mnt_flag & MNT_EXTATTR;
mp->mnt_iflag |= IMNT_UNMOUNT;
mutex_enter(mp->mnt_updating);
async = mp->mnt_flag & MNT_ASYNC;
mp->mnt_flag &= ~MNT_ASYNC;
cache_purgevfs(mp); /* remove cache entries for this file sys */
if (used_syncer)
vfs_syncer_remove_from_worklist(mp);
error = 0;
if (((mp->mnt_flag & MNT_RDONLY) == 0) && ((flags & MNT_FORCE) == 0)) {
error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
}
if (error == 0 || (flags & MNT_FORCE)) {
error = VFS_UNMOUNT(mp, flags);
}
if (error) {
mp->mnt_iflag &= ~IMNT_UNMOUNT;
if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
vfs_syncer_add_to_worklist(mp);
mp->mnt_flag |= async;
mutex_exit(mp->mnt_updating);
if (!was_suspended)
vfs_resume(mp);
if (used_extattr) {
if (start_extattr(mp) != 0)
mp->mnt_flag &= ~MNT_EXTATTR;
else
mp->mnt_flag |= MNT_EXTATTR;
}
return (error);
}
mutex_exit(mp->mnt_updating);
/*
* Mark the file system as gone to prevent further unmount attempts
* once the mnt_umounting lock is released; this also prevents
* vfs_busy() from succeeding.
*/
mp->mnt_iflag |= IMNT_GONE;
if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
coveredvp->v_mountedhere = NULL;
}
if (!was_suspended)
vfs_resume(mp);
mountlist_remove(mp);
if ((vip = TAILQ_FIRST(&mp->mnt_vnodelist)) != NULL) {
vp = VIMPL_TO_VNODE(vip);
vprint("dangling", vp);
panic("unmount: dangling vnode");
}
vfs_hooks_unmount(mp);
vfs_set_lowermount(mp, NULL);
vfs_rele(mp); /* reference from mount() */
if (coveredvp != NULLVP) {
vrele(coveredvp);
}
return (0);
}
/*
* Unmount all file systems.
* We traverse the list in reverse order under the assumption that doing so
* will avoid needing to worry about dependencies.
*/
bool
vfs_unmountall(struct lwp *l)
{
printf("unmounting file systems...\n");
return vfs_unmountall1(l, true, true);
}
static void
vfs_unmount_print(struct mount *mp, const char *pfx)
{
aprint_verbose("%sunmounted %s on %s type %s\n", pfx,
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
mp->mnt_stat.f_fstypename);
}
/*
* Return the mount with the highest generation less than "gen".
*/
static struct mount *
vfs_unmount_next(uint64_t gen)
{
mount_iterator_t *iter;
struct mount *mp, *nmp;
nmp = NULL;
mountlist_iterator_init(&iter);
while ((mp = mountlist_iterator_next(iter)) != NULL) {
if ((nmp == NULL || mp->mnt_gen > nmp->mnt_gen) &&
mp->mnt_gen < gen) {
if (nmp != NULL)
vfs_rele(nmp);
nmp = mp;
vfs_ref(nmp);
}
}
mountlist_iterator_destroy(iter);
return nmp;
}
bool
vfs_unmount_forceone(struct lwp *l)
{
struct mount *mp;
int error;
mp = vfs_unmount_next(mountgen);
if (mp == NULL) {
return false;
}
#ifdef DEBUG
printf("forcefully unmounting %s (%s)...\n",
mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
if ((error = dounmount(mp, MNT_FORCE, l)) == 0) {
vfs_unmount_print(mp, "forcefully ");
return true;
} else {
vfs_rele(mp);
}
#ifdef DEBUG
printf("forceful unmount of %s failed with error %d\n",
mp->mnt_stat.f_mntonname, error);
#endif
return false;
}
bool
vfs_unmountall1(struct lwp *l, bool force, bool verbose)
{
struct mount *mp;
mount_iterator_t *iter;
bool any_error = false, progress = false;
uint64_t gen;
int error;
gen = mountgen;
for (;;) {
mp = vfs_unmount_next(gen);
if (mp == NULL)
break;
gen = mp->mnt_gen;
#ifdef DEBUG
printf("unmounting %p %s (%s)...\n",
(void *)mp, mp->mnt_stat.f_mntonname,
mp->mnt_stat.f_mntfromname);
#endif
if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
vfs_unmount_print(mp, "");
progress = true;
} else {
vfs_rele(mp);
if (verbose) {
printf("unmount of %s failed with error %d\n",
mp->mnt_stat.f_mntonname, error);
}
any_error = true;
}
}
if (verbose) {
printf("unmounting done\n");
}
if (any_error && verbose) {
printf("WARNING: some file systems would not unmount\n");
}
/* If the mountlist is empty it is time to remove swap. */
mountlist_iterator_init(&iter);
if (mountlist_iterator_next(iter) == NULL) {
uvm_swap_shutdown(l);
}
mountlist_iterator_destroy(iter);
return progress;
}
void
vfs_sync_all(struct lwp *l)
{
printf("syncing disks... ");
/* remove user processes from run queue */
suspendsched();
(void)spl0();
/* avoid coming back this way again if we panic. */
doing_shutdown = 1;
do_sys_sync(l);
/* Wait for sync to finish. */
if (vfs_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
Debugger();
#endif
printf("giving up\n");
return;
} else
printf("done\n");
}
/*
* Sync and unmount file systems before shutting down.
*/
void
vfs_shutdown(void)
{
lwp_t *l = curlwp;
vfs_sync_all(l);
/*
* If we have panicked - do not make the situation potentially
* worse by unmounting the file systems.
*/
if (panicstr != NULL) {
return;
}
/* Unmount file systems. */
vfs_unmountall(l);
}
/*
* Print a list of supported file system types (used by vfs_mountroot)
*/
static void
vfs_print_fstypes(void)
{
struct vfsops *v;
int cnt = 0;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list)
++cnt;
mutex_exit(&vfs_list_lock);
if (cnt == 0) {
printf("WARNING: No file system modules have been loaded.\n");
return;
}
printf("Supported file systems:");
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
printf(" %s", v->vfs_name);
}
mutex_exit(&vfs_list_lock);
printf("\n");
}
/*
* Mount the root file system. If the operator didn't specify a
* file system to use, try all possible file systems until one
* succeeds.
*/
int
vfs_mountroot(void)
{
struct vfsops *v;
int error = ENODEV;
if (root_device == NULL)
panic("vfs_mountroot: root device unknown");
switch (device_class(root_device)) {
case DV_IFNET:
if (rootdev != NODEV)
panic("vfs_mountroot: rootdev set for DV_IFNET "
"(0x%llx -> %llu,%llu)",
(unsigned long long)rootdev,
(unsigned long long)major(rootdev),
(unsigned long long)minor(rootdev));
break;
case DV_DISK:
if (rootdev == NODEV)
panic("vfs_mountroot: rootdev not set for DV_DISK");
if (bdevvp(rootdev, &rootvp))
panic("vfs_mountroot: can't get vnode for rootdev");
vn_lock(rootvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_OPEN(rootvp, FREAD, FSCRED);
VOP_UNLOCK(rootvp);
if (error) {
printf("vfs_mountroot: can't open root device\n");
return (error);
}
break;
case DV_VIRTUAL:
break;
default:
printf("%s: inappropriate for root file system\n",
device_xname(root_device));
return (ENODEV);
}
/*
* If user specified a root fs type, use it. Make sure the
* specified type exists and has a mount_root()
*/
if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
v = vfs_getopsbyname(rootfstype);
error = EFTYPE;
if (v != NULL) {
if (v->vfs_mountroot != NULL) {
error = (v->vfs_mountroot)();
}
v->vfs_refcount--;
}
goto done;
}
/*
* Try each file system currently configured into the kernel.
*/
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (v->vfs_mountroot == NULL)
continue;
#ifdef DEBUG
aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
v->vfs_refcount++;
mutex_exit(&vfs_list_lock);
error = (*v->vfs_mountroot)();
mutex_enter(&vfs_list_lock);
v->vfs_refcount--;
if (!error) {
aprint_normal("root file system type: %s\n",
v->vfs_name);
break;
}
}
mutex_exit(&vfs_list_lock);
if (v == NULL) {
vfs_print_fstypes();
printf("no file system for %s", device_xname(root_device));
if (device_class(root_device) == DV_DISK)
printf(" (dev 0x%llx)", (unsigned long long)rootdev);
printf("\n");
error = EFTYPE;
}
done:
if (error && device_class(root_device) == DV_DISK) {
vn_lock(rootvp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(rootvp, FREAD, FSCRED);
VOP_UNLOCK(rootvp);
vrele(rootvp);
}
if (error == 0) {
mount_iterator_t *iter;
struct mount *mp;
mountlist_iterator_init(&iter);
mp = mountlist_iterator_next(iter);
KASSERT(mp != NULL);
mountlist_iterator_destroy(iter);
mp->mnt_flag |= MNT_ROOTFS;
mp->mnt_op->vfs_refcount++;
/*
* Get the vnode for '/'. Set cwdi0.cwdi_cdir to
* reference it, and donate it the reference grabbed
* with VFS_ROOT().
*/
error = VFS_ROOT(mp, LK_NONE, &rootvnode);
if (error)
panic("cannot find root vnode, error=%d", error);
cwdi0.cwdi_cdir = rootvnode;
cwdi0.cwdi_rdir = NULL;
/*
* Now that root is mounted, we can fixup initproc's CWD
* info. All other processes are kthreads, which merely
* share proc0's CWD info.
*/
initproc->p_cwdi->cwdi_cdir = rootvnode;
vref(initproc->p_cwdi->cwdi_cdir);
initproc->p_cwdi->cwdi_rdir = NULL;
/*
* Enable loading of modules from the filesystem
*/
module_load_vfs_init();
}
return (error);
}
/*
* mount_specific_key_create --
* Create a key for subsystem mount-specific data.
*/
int
mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
{
return specificdata_key_create(mount_specificdata_domain, keyp, dtor);
}
/*
* mount_specific_key_delete --
* Delete a key for subsystem mount-specific data.
*/
void
mount_specific_key_delete(specificdata_key_t key)
{
specificdata_key_delete(mount_specificdata_domain, key);
}
/*
* mount_initspecific --
* Initialize a mount's specificdata container.
*/
void
mount_initspecific(struct mount *mp)
{
int error __diagused;
error = specificdata_init(mount_specificdata_domain,
&mp->mnt_specdataref);
KASSERT(error == 0);
}
/*
* mount_finispecific --
* Finalize a mount's specificdata container.
*/
void
mount_finispecific(struct mount *mp)
{
specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
}
/*
* mount_getspecific --
* Return mount-specific data corresponding to the specified key.
*/
void *
mount_getspecific(struct mount *mp, specificdata_key_t key)
{
return specificdata_getspecific(mount_specificdata_domain,
&mp->mnt_specdataref, key);
}
/*
* mount_setspecific --
* Set mount-specific data corresponding to the specified key.
*/
void
mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
{
specificdata_setspecific(mount_specificdata_domain,
&mp->mnt_specdataref, key, data);
}
/*
* Check to see if a filesystem is mounted on a block device.
*/
int
vfs_mountedon(vnode_t *vp)
{
vnode_t *vq;
int error = 0;
if (vp->v_type != VBLK)
return ENOTBLK;
if (spec_node_getmountedfs(vp) != NULL)
return EBUSY;
if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, VDEAD_NOWAIT, &vq)
== 0) {
if (spec_node_getmountedfs(vq) != NULL)
error = EBUSY;
vrele(vq);
}
return error;
}
/*
* Check if a device pointed to by vp is mounted.
*
* Returns:
* EINVAL if it's not a disk
* EBUSY if it's a disk and mounted
* 0 if it's a disk and not mounted
*/
int
rawdev_mounted(vnode_t *vp, vnode_t **bvpp)
{
vnode_t *bvp;
dev_t dev;
int d_type;
bvp = NULL;
d_type = D_OTHER;
if (iskmemvp(vp))
return EINVAL;
switch (vp->v_type) {
case VCHR: {
const struct cdevsw *cdev;
dev = vp->v_rdev;
cdev = cdevsw_lookup(dev);
if (cdev != NULL) {
dev_t blkdev;
blkdev = devsw_chr2blk(dev);
if (blkdev != NODEV) {
if (vfinddev(blkdev, VBLK, &bvp) != 0) {
d_type = (cdev->d_flag & D_TYPEMASK);
/* XXX: what if bvp disappears? */
vrele(bvp);
}
}
}
break;
}
case VBLK: {
const struct bdevsw *bdev;
dev = vp->v_rdev;
bdev = bdevsw_lookup(dev);
if (bdev != NULL)
d_type = (bdev->d_flag & D_TYPEMASK);
bvp = vp;
break;
}
default:
break;
}
if (d_type != D_DISK)
return EINVAL;
if (bvpp != NULL)
*bvpp = bvp;
/*
* XXX: This is bogus. We should be failing the request
* XXX: not only if this specific slice is mounted, but
* XXX: if it's on a disk with any other mounted slice.
*/
if (vfs_mountedon(bvp))
return EBUSY;
return 0;
}
/*
* Make a 'unique' number from a mount type name.
*/
long
makefstype(const char *type)
{
long rv;
for (rv = 0; *type; type++) {
rv <<= 2;
rv ^= *type;
}
return rv;
}
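/*
* Worked example: each character shifts the accumulator left by two bits
* and is XORed in, so for e.g. the type name "ffs" ('f' == 0x66,
* 's' == 0x73) the result is (((0x66 << 2) ^ 0x66) << 2) ^ 0x73 == 0x78b.
* Distinct names are not guaranteed to map to distinct values.
*/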
static struct mountlist_entry *
mountlist_alloc(enum mountlist_type type, struct mount *mp)
{
struct mountlist_entry *me;
me = kmem_zalloc(sizeof(*me), KM_SLEEP);
me->me_mount = mp;
me->me_type = type;
return me;
}
static void
mountlist_free(struct mountlist_entry *me)
{
kmem_free(me, sizeof(*me));
}
void
mountlist_iterator_init(mount_iterator_t **mip)
{
struct mountlist_entry *me;
me = mountlist_alloc(ME_MARKER, NULL);
mutex_enter(&mountlist_lock);
TAILQ_INSERT_HEAD(&mountlist, me, me_list);
mutex_exit(&mountlist_lock);
*mip = (mount_iterator_t *)me;
}
void
mountlist_iterator_destroy(mount_iterator_t *mi)
{
struct mountlist_entry *marker = &mi->mi_entry;
if (marker->me_mount != NULL)
vfs_unbusy(marker->me_mount);
mutex_enter(&mountlist_lock);
TAILQ_REMOVE(&mountlist, marker, me_list);
mutex_exit(&mountlist_lock);
mountlist_free(marker);
}
/*
* Return the next mount or NULL for this iterator.
* Mark it busy on success.
*/
static inline struct mount *
_mountlist_iterator_next(mount_iterator_t *mi, bool wait)
{
struct mountlist_entry *me, *marker = &mi->mi_entry;
struct mount *mp;
int error;
if (marker->me_mount != NULL) {
vfs_unbusy(marker->me_mount);
marker->me_mount = NULL;
}
mutex_enter(&mountlist_lock);
for (;;) {
KASSERT(marker->me_type == ME_MARKER);
me = TAILQ_NEXT(marker, me_list);
if (me == NULL) {
/* End of list: keep marker and return. */
mutex_exit(&mountlist_lock);
return NULL;
}
TAILQ_REMOVE(&mountlist, marker, me_list);
TAILQ_INSERT_AFTER(&mountlist, me, marker, me_list);
/* Skip other markers. */
if (me->me_type != ME_MOUNT)
continue;
/* Take an initial reference for vfs_busy() below. */
mp = me->me_mount;
KASSERT(mp != NULL);
vfs_ref(mp);
mutex_exit(&mountlist_lock);
/* Try to mark this mount busy and return on success. */
if (wait)
error = vfs_busy(mp);
else
error = vfs_trybusy(mp);
if (error == 0) {
vfs_rele(mp);
marker->me_mount = mp;
return mp;
}
vfs_rele(mp);
mutex_enter(&mountlist_lock);
}
}
struct mount *
mountlist_iterator_next(mount_iterator_t *mi)
{
return _mountlist_iterator_next(mi, true);
}
struct mount *
mountlist_iterator_trynext(mount_iterator_t *mi)
{
return _mountlist_iterator_next(mi, false);
}
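/*
* Sketch of the iterator protocol, mirroring its use in vfs_getvfs()
* above: every mount returned is already marked busy and is unbusied
* when the next mount is fetched or the iterator is destroyed.
*
*	mount_iterator_t *iter;
*	struct mount *mp;
*
*	mountlist_iterator_init(&iter);
*	while ((mp = mountlist_iterator_next(iter)) != NULL) {
*		// ... examine mp; it cannot be unmounted here ...
*	}
*	mountlist_iterator_destroy(iter);
*/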
/*
* Attach new mount to the end of the mount list.
*/
void
mountlist_append(struct mount *mp)
{
struct mountlist_entry *me;
me = mountlist_alloc(ME_MOUNT, mp);
mutex_enter(&mountlist_lock);
TAILQ_INSERT_TAIL(&mountlist, me, me_list);
mutex_exit(&mountlist_lock);
}
/*
* Remove mount from mount list.
*/
void
mountlist_remove(struct mount *mp)
{
struct mountlist_entry *me;
mutex_enter(&mountlist_lock);
TAILQ_FOREACH(me, &mountlist, me_list)
if (me->me_type == ME_MOUNT && me->me_mount == mp)
break;
KASSERT(me != NULL);
TAILQ_REMOVE(&mountlist, me, me_list);
mutex_exit(&mountlist_lock);
mountlist_free(me);
}
/*
* Unlocked variant to traverse the mountlist.
* To be used from DDB only.
*/
struct mount *
_mountlist_next(struct mount *mp)
{
struct mountlist_entry *me;
if (mp == NULL) {
me = TAILQ_FIRST(&mountlist);
} else {
TAILQ_FOREACH(me, &mountlist, me_list)
if (me->me_type == ME_MOUNT && me->me_mount == mp)
break;
if (me != NULL)
me = TAILQ_NEXT(me, me_list);
}
while (me != NULL && me->me_type != ME_MOUNT)
me = TAILQ_NEXT(me, me_list);
return (me ? me->me_mount : NULL);
}
/* $NetBSD: wsmux.c,v 1.66 2022/03/28 12:38:58 riastradh Exp $ */
/*
* Copyright (c) 1998, 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Author: Lennart Augustsson <lennart@augustsson.net>
* Carlstedt Research & Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* wscons mux device.
*
* The mux device is a collection of real mice and keyboards and acts as
* a merge point for all the events from the different real devices.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: wsmux.c,v 1.66 2022/03/28 12:38:58 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_modular.h"
#endif
#include "wsdisplay.h"
#include "wsmux.h"
#include "wskbd.h"
#include "wsmouse.h"
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/signalvar.h>
#include <sys/device.h>
#include <sys/device_impl.h> /* XXX autoconf abuse */
#include "opt_wsdisplay_compat.h"
#include <dev/wscons/wsconsio.h>
#include <dev/wscons/wsksymdef.h>
#include <dev/wscons/wseventvar.h>
#include <dev/wscons/wscons_callbacks.h>
#include <dev/wscons/wsmuxvar.h>
#include "ioconf.h"
#ifdef WSMUX_DEBUG
#define DPRINTF(x) if (wsmuxdebug) printf x
#define DPRINTFN(n,x) if (wsmuxdebug > (n)) printf x
int wsmuxdebug = 0;
#else
#define DPRINTF(x)
#define DPRINTFN(n,x)
#endif
/*
* The wsmux pseudo device is used to multiplex events from several wsmouse,
* wskbd, and/or wsmux devices together.
* The devices connected together form a tree with muxes in the interior
* and real devices (mouse and kbd) at the leaves. The special case of
* a tree with one node (mux or other) is supported as well.
* Only the device at the root of the tree can be opened (if a non-root
* device is opened the subtree rooted at that point is severed from the
* containing tree). When the root is opened it allocates a wseventvar
* struct to which all the nodes in the tree send their events.
* An ioctl() performed on the root is propagated to all the nodes.
* There are also ioctl() operations to add and remove nodes from a tree.
*/
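/*
* Sketch of how such a tree is built from userland (the device path and
* error handling are illustrative assumptions): the root mux is opened
* and children are attached by type and unit with WSMUXIO_ADD_DEVICE:
*
*	struct wsmux_device d = { .type = WSMUX_KBD, .idx = 0 };
*	int fd = open("/dev/wsmux0", O_RDWR);
*	if (fd != -1 && ioctl(fd, WSMUXIO_ADD_DEVICE, &d) == -1)
*		warn("WSMUXIO_ADD_DEVICE");
*
* WSMUXIO_REMOVE_DEVICE takes the same struct wsmux_device argument;
* WSMUXIO_LIST_DEVICES fills in a struct wsmux_device_list instead (see
* wsmux_do_ioctl() below).
*/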
static int wsmux_mux_open(struct wsevsrc *, struct wseventvar *);
static int wsmux_mux_close(struct wsevsrc *);
static void wsmux_do_open(struct wsmux_softc *, struct wseventvar *);
static void wsmux_do_close(struct wsmux_softc *);
#if NWSDISPLAY > 0
static int wsmux_evsrc_set_display(device_t, struct wsevsrc *);
#else
#define wsmux_evsrc_set_display NULL
#endif
static int wsmux_do_displayioctl(device_t dev, u_long cmd,
void *data, int flag, struct lwp *l);
static int wsmux_do_ioctl(device_t, u_long, void *,int,struct lwp *);
static int wsmux_add_mux(int, struct wsmux_softc *);
#define WSMUXDEV(n) ((n) & 0x7f)
#define WSMUXCTL(n) ((n) & 0x80)
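/*
* Example: the low seven bits of the minor number select the mux unit
* and bit 0x80 selects the control device, so a minor of 0x85 refers to
* unit 5 opened through its control node.
*/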
dev_type_open(wsmuxopen);
dev_type_close(wsmuxclose);
dev_type_read(wsmuxread);
dev_type_ioctl(wsmuxioctl);
dev_type_poll(wsmuxpoll);
dev_type_kqfilter(wsmuxkqfilter);
const struct cdevsw wsmux_cdevsw = {
.d_open = wsmuxopen,
.d_close = wsmuxclose,
.d_read = wsmuxread,
.d_write = nowrite,
.d_ioctl = wsmuxioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = wsmuxpoll,
.d_mmap = nommap,
.d_kqfilter = wsmuxkqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER
};
struct wssrcops wsmux_srcops = {
WSMUX_MUX,
wsmux_mux_open, wsmux_mux_close, wsmux_do_ioctl, wsmux_do_displayioctl,
wsmux_evsrc_set_display
};
/* From upper level */
void
wsmuxattach(int n)
{
}
/* Keep track of all muxes that have been allocated */
static struct wsmux_softc **wsmuxdevs = NULL;
static int nwsmux = 0;
/* Return mux n, create if necessary */
struct wsmux_softc *
wsmux_getmux(int n)
{
struct wsmux_softc *sc;
n = WSMUXDEV(n); /* limit range */
/* Make sure there is room for mux n in the table */
if (n >= nwsmux) {
void *new;
new = realloc(wsmuxdevs, (n + 1) * sizeof(*wsmuxdevs),
M_DEVBUF, M_ZERO | M_WAITOK);
wsmuxdevs = new;
nwsmux = n + 1;
}
sc = wsmuxdevs[n];
if (sc == NULL) {
sc = wsmux_create("wsmux", n);
wsmuxdevs[n] = sc;
}
return (sc);
}
/*
* open() of the pseudo device from device table.
*/
int
wsmuxopen(dev_t dev, int flags, int mode, struct lwp *l)
{
struct wsmux_softc *sc;
struct wseventvar *evar;
int minr, unit;
minr = minor(dev);
unit = WSMUXDEV(minr);
sc = wsmux_getmux(unit);
if (sc == NULL)
return (ENXIO);
DPRINTF(("wsmuxopen: %s: sc=%p l=%p\n",
device_xname(sc->sc_base.me_dv), sc, l));
if (WSMUXCTL(minr)) {
/* This is the control device which does not allow reads. */
if (flags & FREAD)
return (EINVAL);
return (0);
}
if ((flags & (FREAD | FWRITE)) == FWRITE)
/* Allow write only open */
return (0);
if (sc->sc_base.me_parent != NULL) {
/* Grab the mux out of the greedy hands of the parent mux. */
DPRINTF(("wsmuxopen: detach\n"));
wsmux_detach_sc(&sc->sc_base);
}
if (sc->sc_base.me_evp != NULL)
/* Already open. */
return (EBUSY);
evar = &sc->sc_base.me_evar;
wsevent_init(evar, l->l_proc);
#ifdef WSDISPLAY_COMPAT_RAWKBD
sc->sc_rawkbd = 0;
#endif
wsmux_do_open(sc, evar);
return (0);
}
/*
* Open of a mux via the parent mux.
*/
int
wsmux_mux_open(struct wsevsrc *me, struct wseventvar *evar)
{
struct wsmux_softc *sc = (struct wsmux_softc *)me;
#ifdef DIAGNOSTIC
if (sc->sc_base.me_evp != NULL) {
printf("wsmux_mux_open: busy\n");
return (EBUSY);
}
if (sc->sc_base.me_parent == NULL) {
printf("wsmux_mux_open: no parent\n");
return (EINVAL);
}
#endif
wsmux_do_open(sc, evar);
return (0);
}
/* Common part of opening a mux. */
void
wsmux_do_open(struct wsmux_softc *sc, struct wseventvar *evar)
{
struct wsevsrc *me;
sc->sc_base.me_evp = evar; /* remember event variable, mark as open */
/* Open all children. */
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
DPRINTF(("wsmuxopen: %s: m=%p dev=%s\n",
device_xname(sc->sc_base.me_dv), me,
device_xname(me->me_dv)));
#ifdef DIAGNOSTIC
if (me->me_evp != NULL) {
printf("wsmuxopen: dev already in use\n");
continue;
}
if (me->me_parent != sc) {
printf("wsmux_do_open: bad child=%p\n", me);
continue;
}
{
int error = wsevsrc_open(me, evar);
if (error) {
DPRINTF(("wsmuxopen: open failed %d\n", error));
}
}
#else
/* ignore errors, failing children will not be marked open */
(void)wsevsrc_open(me, evar);
#endif
}
}
/*
* close() of the pseudo device from device table.
*/
int
wsmuxclose(dev_t dev, int flags, int mode,
struct lwp *l)
{
int minr = minor(dev);
struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)];
struct wseventvar *evar = sc->sc_base.me_evp;
if (WSMUXCTL(minr))
/* control device */
return (0);
if (evar == NULL)
/* Not open for read */
return (0);
wsmux_do_close(sc);
sc->sc_base.me_evp = NULL;
wsevent_fini(evar);
return (0);
}
/*
* Close of a mux via the parent mux.
*/
int
wsmux_mux_close(struct wsevsrc *me)
{
me->me_evp = NULL;
wsmux_do_close((struct wsmux_softc *)me);
return (0);
}
/* Common part of closing a mux. */
void
wsmux_do_close(struct wsmux_softc *sc)
{
struct wsevsrc *me;
DPRINTF(("wsmuxclose: %s: sc=%p\n",
device_xname(sc->sc_base.me_dv), sc));
/* Close all the children. */
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
DPRINTF(("wsmuxclose %s: m=%p dev=%s\n",
device_xname(sc->sc_base.me_dv), me,
device_xname(me->me_dv)));
#ifdef DIAGNOSTIC
if (me->me_parent != sc) {
printf("wsmuxclose: bad child=%p\n", me);
continue;
}
#endif
(void)wsevsrc_close(me);
me->me_evp = NULL;
}
}
/*
* read() of the pseudo device from device table.
*/
int
wsmuxread(dev_t dev, struct uio *uio, int flags)
{
int minr = minor(dev);
struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)];
struct wseventvar *evar;
int error;
if (WSMUXCTL(minr)) {
/* control device */
return (EINVAL);
}
evar = sc->sc_base.me_evp;
if (evar == NULL) {
#ifdef DIAGNOSTIC
/* XXX can we get here? */
printf("wsmuxread: not open\n");
#endif
return (EINVAL);
}
DPRINTFN(5,("wsmuxread: %s event read evar=%p\n",
device_xname(sc->sc_base.me_dv), evar));
error = wsevent_read(evar, uio, flags);
DPRINTFN(5,("wsmuxread: %s event read ==> error=%d\n",
device_xname(sc->sc_base.me_dv), error));
return (error);
}
/*
* ioctl of the pseudo device from device table.
*/
int
wsmuxioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
int u = WSMUXDEV(minor(dev));
return wsmux_do_ioctl(wsmuxdevs[u]->sc_base.me_dv, cmd, data, flag, l);
}
/*
* ioctl of a mux via the parent mux, continuation of wsmuxioctl().
*/
int
wsmux_do_ioctl(device_t dv, u_long cmd, void *data, int flag,
struct lwp *lwp)
{
struct wsmux_softc *sc = device_private(dv);
struct wsevsrc *me;
int error, ok;
int s, n;
struct wseventvar *evar;
struct wscons_event event;
struct wsmux_device_list *l;
DPRINTF(("wsmux_do_ioctl: %s: enter sc=%p, cmd=%08lx\n",
device_xname(sc->sc_base.me_dv), sc, cmd));
switch (cmd) {
#if defined(COMPAT_50) || defined(MODULAR)
case WSMUXIO_OINJECTEVENT:
#endif /* defined(COMPAT_50) || defined(MODULAR) */
case WSMUXIO_INJECTEVENT:
/* Inject an event, e.g., from moused. */
DPRINTF(("%s: inject\n", device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL) {
/* No event sink, so ignore it. */
DPRINTF(("wsmux_do_ioctl: event ignored\n"));
return (0);
}
s = spltty();
event.type = ((struct wscons_event *)data)->type;
event.value = ((struct wscons_event *)data)->value;
error = wsevent_inject(evar, &event, 1);
splx(s);
return error;
case WSMUXIO_ADD_DEVICE:
#define d ((struct wsmux_device *)data)
DPRINTF(("%s: add type=%d, no=%d\n",
device_xname(sc->sc_base.me_dv), d->type, d->idx));
switch (d->type) {
#if NWSMOUSE > 0
case WSMUX_MOUSE:
return (wsmouse_add_mux(d->idx, sc));
#endif
#if NWSKBD > 0
case WSMUX_KBD:
return (wskbd_add_mux(d->idx, sc));
#endif
case WSMUX_MUX:
return (wsmux_add_mux(d->idx, sc));
case WSMUX_BELL:
return (wsbell_add_mux(d->idx, sc));
default:
return (EINVAL);
}
case WSMUXIO_REMOVE_DEVICE:
DPRINTF(("%s: rem type=%d, no=%d\n",
device_xname(sc->sc_base.me_dv), d->type, d->idx));
/* Locate the device */
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
if (me->me_ops->type == d->type &&
device_unit(me->me_dv) == d->idx) {
DPRINTF(("wsmux_do_ioctl: detach\n"));
wsmux_detach_sc(me);
return (0);
}
}
return (EINVAL);
#undef d
case WSMUXIO_LIST_DEVICES:
DPRINTF(("%s: list\n", device_xname(sc->sc_base.me_dv)));
l = (struct wsmux_device_list *)data;
n = 0;
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
if (n >= WSMUX_MAXDEV)
break;
l->devices[n].type = me->me_ops->type;
l->devices[n].idx = device_unit(me->me_dv);
n++;
}
l->ndevices = n;
return (0);
#ifdef WSDISPLAY_COMPAT_RAWKBD
case WSKBDIO_SETMODE:
sc->sc_rawkbd = *(int *)data;
DPRINTF(("wsmux_do_ioctl: save rawkbd = %d\n", sc->sc_rawkbd));
break;
#endif
case WSKBDIO_SETVERSION:
case WSMOUSEIO_SETVERSION:
case WSDISPLAYIO_SETVERSION:
DPRINTF(("%s: WSxxxIO_SETVERSION\n",
device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL)
return (EINVAL);
return wsevent_setversion(evar, *(int *)data);
case FIONBIO:
DPRINTF(("%s: FIONBIO\n", device_xname(sc->sc_base.me_dv)));
return (0);
case FIOASYNC:
DPRINTF(("%s: FIOASYNC\n", device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL)
return (EINVAL);
evar->async = *(int *)data != 0;
return (0);
case FIOSETOWN:
DPRINTF(("%s: FIOSETOWN\n", device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL)
return (EINVAL);
if (-*(int *)data != evar->io->p_pgid
&& *(int *)data != evar->io->p_pid)
return (EPERM);
return (0);
case TIOCSPGRP:
DPRINTF(("%s: TIOCSPGRP\n", device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL)
return (EINVAL);
if (*(int *)data != evar->io->p_pgid)
return (EPERM);
return (0);
default:
DPRINTF(("%s: unknown\n", device_xname(sc->sc_base.me_dv)));
break;
}
if (sc->sc_base.me_evp == NULL
#if NWSDISPLAY > 0
&& sc->sc_base.me_dispdv == NULL
#endif
)
return (EACCES);
/* Return 0 if any of the ioctl() succeeds, otherwise the last error */
error = 0;
ok = 0;
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
#ifdef DIAGNOSTIC
/* XXX check evp? */
if (me->me_parent != sc) {
printf("wsmux_do_ioctl: bad child %p\n", me);
continue;
}
#endif
error = wsevsrc_ioctl(me, cmd, data, flag, lwp);
DPRINTF(("wsmux_do_ioctl: %s: me=%p dev=%s ==> %d\n",
device_xname(sc->sc_base.me_dv), me,
device_xname(me->me_dv), error));
if (!error)
ok = 1;
}
if (ok) {
error = 0;
if (cmd == WSKBDIO_SETENCODING) {
sc->sc_kbd_layout = *((kbd_t *)data);
}
}
return (error);
}
/*
* poll() of the pseudo device from device table.
*/
int
wsmuxpoll(dev_t dev, int events, struct lwp *l)
{
int minr = minor(dev);
struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)];
if (WSMUXCTL(minr)) {
/* control device */
return (0);
}
if (sc->sc_base.me_evp == NULL) {
#ifdef DIAGNOSTIC
printf("wsmuxpoll: not open\n");
#endif
return (POLLHUP);
}
return (wsevent_poll(sc->sc_base.me_evp, events, l));
}
/*
* kqfilter() of the pseudo device from device table.
*/
int
wsmuxkqfilter(dev_t dev, struct knote *kn)
{
int minr = minor(dev);
struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)];
if (WSMUXCTL(minr)) {
/* control device */
return (1);
}
if (sc->sc_base.me_evp == NULL) {
#ifdef DIAGNOSTIC
printf("wsmuxkqfilter: not open\n");
#endif
return (1);
}
return (wsevent_kqfilter(sc->sc_base.me_evp, kn));
}
/*
* Add mux unit as a child to muxsc.
*/
int
wsmux_add_mux(int unit, struct wsmux_softc *muxsc)
{
struct wsmux_softc *sc, *m;
sc = wsmux_getmux(unit);
if (sc == NULL)
return (ENXIO);
DPRINTF(("wsmux_add_mux: %s(%p) to %s(%p)\n",
device_xname(sc->sc_base.me_dv), sc,
device_xname(muxsc->sc_base.me_dv), muxsc));
if (sc->sc_base.me_parent != NULL || sc->sc_base.me_evp != NULL)
return (EBUSY);
/* The mux we are adding must not be an ancestor of itself. */
for (m = muxsc; m != NULL ; m = m->sc_base.me_parent)
if (m == sc)
return (EINVAL);
return (wsmux_attach_sc(muxsc, &sc->sc_base));
}
/* Create a new mux softc. */
struct wsmux_softc *
wsmux_create(const char *name, int unit)
{
struct wsmux_softc *sc;
/* XXX This is wrong -- should use autoconfiguration framework */
DPRINTF(("wsmux_create: allocating\n"));
sc = malloc(sizeof *sc, M_DEVBUF, M_WAITOK|M_ZERO);
sc->sc_base.me_dv = malloc(sizeof(struct device), M_DEVBUF,
M_WAITOK|M_ZERO);
TAILQ_INIT(&sc->sc_cld);
snprintf(sc->sc_base.me_dv->dv_xname,
sizeof sc->sc_base.me_dv->dv_xname, "%s%d", name, unit);
sc->sc_base.me_dv->dv_private = sc;
sc->sc_base.me_dv->dv_unit = unit;
sc->sc_base.me_ops = &wsmux_srcops;
sc->sc_kbd_layout = KB_NONE;
return (sc);
}
/* Attach me as a child to sc. */
int
wsmux_attach_sc(struct wsmux_softc *sc, struct wsevsrc *me)
{
int error;
if (sc == NULL)
return (EINVAL);
DPRINTF(("wsmux_attach_sc: %s(%p): type=%d\n",
device_xname(sc->sc_base.me_dv), sc, me->me_ops->type));
#ifdef DIAGNOSTIC
if (me->me_parent != NULL) {
printf("wsmux_attach_sc: busy\n");
return (EBUSY);
}
#endif
me->me_parent = sc;
TAILQ_INSERT_TAIL(&sc->sc_cld, me, me_next);
error = 0;
#if NWSDISPLAY > 0
if (sc->sc_base.me_dispdv != NULL) {
/* This is a display mux, so attach the new device to it. */
DPRINTF(("wsmux_attach_sc: %s: set display %p\n",
device_xname(sc->sc_base.me_dv),
sc->sc_base.me_dispdv));
if (me->me_ops->dsetdisplay != NULL) {
error = wsevsrc_set_display(me, &sc->sc_base);
/* Ignore that the console already has a display. */
if (error == EBUSY)
error = 0;
if (!error) {
#ifdef WSDISPLAY_COMPAT_RAWKBD
DPRINTF(("wsmux_attach_sc: %s set rawkbd=%d\n",
device_xname(me->me_dv),
sc->sc_rawkbd));
(void)wsevsrc_ioctl(me, WSKBDIO_SETMODE,
&sc->sc_rawkbd, 0, 0);
#endif
if (sc->sc_kbd_layout != KB_NONE)
(void)wsevsrc_ioctl(me,
WSKBDIO_SETENCODING,
&sc->sc_kbd_layout, FWRITE, 0);
}
}
}
#endif
if (sc->sc_base.me_evp != NULL) {
/* Mux is open, so open the new subdevice */
DPRINTF(("wsmux_attach_sc: %s: calling open of %s\n",
device_xname(sc->sc_base.me_dv),
device_xname(me->me_dv)));
error = wsevsrc_open(me, sc->sc_base.me_evp);
} else {
DPRINTF(("wsmux_attach_sc: %s not open\n",
device_xname(sc->sc_base.me_dv)));
}
if (error) {
me->me_parent = NULL;
TAILQ_REMOVE(&sc->sc_cld, me, me_next);
}
DPRINTF(("wsmux_attach_sc: %s(%p) done, error=%d\n",
device_xname(sc->sc_base.me_dv), sc, error));
return (error);
}
/* Remove me from the parent. */
void
wsmux_detach_sc(struct wsevsrc *me)
{
struct wsmux_softc *sc = me->me_parent;
DPRINTF(("wsmux_detach_sc: %s(%p) parent=%p\n",
device_xname(me->me_dv), me, sc));
#ifdef DIAGNOSTIC
if (sc == NULL) {
printf("wsmux_detach_sc: %s has no parent\n",
device_xname(me->me_dv));
return;
}
#endif
#if NWSDISPLAY > 0
if (sc->sc_base.me_dispdv != NULL) {
if (me->me_ops->dsetdisplay != NULL)
/* ignore error, there's nothing we can do */
(void)wsevsrc_set_display(me, NULL);
} else
#endif
if (me->me_evp != NULL) {
DPRINTF(("wsmux_detach_sc: close\n"));
/* mux device is open, so close multiplexee */
(void)wsevsrc_close(me);
}
TAILQ_REMOVE(&sc->sc_cld, me, me_next);
me->me_parent = NULL;
DPRINTF(("wsmux_detach_sc: done sc=%p\n", sc));
}
/*
* Display ioctl() of a mux via the parent mux.
*/
int
wsmux_do_displayioctl(device_t dv, u_long cmd, void *data, int flag,
struct lwp *l)
{
struct wsmux_softc *sc = device_private(dv);
struct wsevsrc *me;
int error, ok;
DPRINTF(("wsmux_displayioctl: %s: sc=%p, cmd=%08lx\n",
device_xname(sc->sc_base.me_dv), sc, cmd));
#ifdef WSDISPLAY_COMPAT_RAWKBD
if (cmd == WSKBDIO_SETMODE) {
sc->sc_rawkbd = *(int *)data;
DPRINTF(("wsmux_displayioctl: rawkbd = %d\n", sc->sc_rawkbd));
}
#endif
/*
* Return 0 if any of the ioctl() succeeds, otherwise the last error.
* Return EPASSTHROUGH if no mux component accepts the ioctl.
*/
error = EPASSTHROUGH;
ok = 0;
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
DPRINTF(("wsmux_displayioctl: me=%p\n", me));
#ifdef DIAGNOSTIC
if (me->me_parent != sc) {
printf("wsmux_displayioctl: bad child %p\n", me);
continue;
}
#endif
if (me->me_ops->ddispioctl != NULL) {
error = wsevsrc_display_ioctl(me, cmd, data, flag, l);
DPRINTF(("wsmux_displayioctl: me=%p dev=%s ==> %d\n",
me, device_xname(me->me_dv), error));
if (!error)
ok = 1;
}
}
if (ok)
error = 0;
return (error);
}
#if NWSDISPLAY > 0
/*
* Set display of a mux via the parent mux.
*/
int
wsmux_evsrc_set_display(device_t dv, struct wsevsrc *ame)
{
struct wsmux_softc *muxsc = (struct wsmux_softc *)ame;
struct wsmux_softc *sc = device_private(dv);
device_t displaydv = muxsc ? muxsc->sc_base.me_dispdv : NULL;
DPRINTF(("wsmux_set_display: %s: displaydv=%p\n",
device_xname(sc->sc_base.me_dv), displaydv));
if (displaydv != NULL) {
if (sc->sc_base.me_dispdv != NULL)
return (EBUSY);
} else {
if (sc->sc_base.me_dispdv == NULL)
return (ENXIO);
}
return wsmux_set_display(sc, displaydv);
}
int
wsmux_set_display(struct wsmux_softc *sc, device_t displaydv)
{
device_t odisplaydv;
struct wsevsrc *me;
struct wsmux_softc *nsc = displaydv ? sc : NULL;
int error, ok;
odisplaydv = sc->sc_base.me_dispdv;
sc->sc_base.me_dispdv = displaydv;
if (displaydv)
aprint_verbose_dev(sc->sc_base.me_dv, "connecting to %s\n",
device_xname(displaydv));
ok = 0;
error = 0;
TAILQ_FOREACH(me, &sc->sc_cld,me_next) {
#ifdef DIAGNOSTIC
if (me->me_parent != sc) {
printf("wsmux_set_display: bad child parent %p\n", me);
continue;
}
#endif
if (me->me_ops->dsetdisplay != NULL) {
error = wsevsrc_set_display(me, &nsc->sc_base);
DPRINTF(("wsmux_set_display: m=%p dev=%s error=%d\n",
me, device_xname(me->me_dv), error));
if (!error) {
ok = 1;
#ifdef WSDISPLAY_COMPAT_RAWKBD
DPRINTF(("wsmux_set_display: %s set rawkbd=%d\n",
device_xname(me->me_dv), sc->sc_rawkbd));
(void)wsevsrc_ioctl(me, WSKBDIO_SETMODE,
&sc->sc_rawkbd, 0, 0);
#endif
}
}
}
if (ok)
error = 0;
if (displaydv == NULL)
aprint_verbose("%s: disconnecting from %s\n",
device_xname(sc->sc_base.me_dv),
device_xname(odisplaydv));
return (error);
}
#endif /* NWSDISPLAY > 0 */
/* $NetBSD: tcp_var.h,v 1.198 2022/10/28 05:18:39 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*-
* Copyright (c) 1997, 1998, 1999, 2001, 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_var.h 8.4 (Berkeley) 5/24/95
*/
#ifndef _NETINET_TCP_VAR_H_
#define _NETINET_TCP_VAR_H_
#if defined(_KERNEL_OPT)
#include "opt_inet.h"
#include "opt_mbuftrace.h"
#endif
/*
* TCP kernel structures and variables.
*/
#include <sys/callout.h>
#ifdef TCP_SIGNATURE
/*
* Defines which are needed by the xform_tcp module and tcp_[in|out]put
* for SADB verification and lookup.
*/
#define TCP_SIGLEN 16 /* length of computed digest in bytes */
#define TCP_KEYLEN_MIN 1 /* minimum length of TCP-MD5 key */
#define TCP_KEYLEN_MAX 80 /* maximum length of TCP-MD5 key */
/*
* Only a single SA per host may be specified at this time. An SPI is
* needed in order for the KEY_LOOKUP_SA() lookup to work.
*/
#define TCP_SIG_SPI 0x1000
#endif /* TCP_SIGNATURE */
/*
* Tcp+ip header, after ip options removed.
*/
struct tcpiphdr {
struct ipovly ti_i; /* overlaid ip structure */
struct tcphdr ti_t; /* tcp header */
};
#ifdef CTASSERT
CTASSERT(sizeof(struct tcpiphdr) == 40);
#endif
#define ti_x1 ti_i.ih_x1
#define ti_pr ti_i.ih_pr
#define ti_len ti_i.ih_len
#define ti_src ti_i.ih_src
#define ti_dst ti_i.ih_dst
#define ti_sport ti_t.th_sport
#define ti_dport ti_t.th_dport
#define ti_seq ti_t.th_seq
#define ti_ack ti_t.th_ack
#define ti_x2 ti_t.th_x2
#define ti_off ti_t.th_off
#define ti_flags ti_t.th_flags
#define ti_win ti_t.th_win
#define ti_sum ti_t.th_sum
#define ti_urp ti_t.th_urp
/*
* SACK option block.
*/
struct sackblk {
tcp_seq left; /* Left edge of sack block. */
tcp_seq right; /* Right edge of sack block. */
};
TAILQ_HEAD(sackhead, sackhole);
struct sackhole {
tcp_seq start;
tcp_seq end;
tcp_seq rxmit;
TAILQ_ENTRY(sackhole) sackhole_q;
};
struct syn_cache;
/*
* Tcp control block, one per tcp; fields:
*/
struct tcpcb {
int t_family; /* address family on the wire */
struct ipqehead segq; /* sequencing queue */
int t_segqlen; /* length of the above */
callout_t t_timer[TCPT_NTIMERS];/* tcp timers */
short t_state; /* state of this connection */
short t_rxtshift; /* log(2) of rexmt exp. backoff */
uint32_t t_rxtcur; /* current retransmit value */
short t_dupacks; /* consecutive dup acks recd */
/*
* t_partialacks:
* <0 not in fast recovery.
* ==0 in fast recovery. has not received partial acks
* >0 in fast recovery. has received partial acks
*/
short t_partialacks; /* partial acks during fast rexmit */
u_short t_peermss; /* peer's maximum segment size */
u_short t_ourmss; /* our maximum segment size */
u_short t_segsz; /* current segment size in use */
char t_force; /* 1 if forcing out a byte */
u_int t_flags;
#define TF_ACKNOW 0x0001 /* ack peer immediately */
#define TF_DELACK 0x0002 /* ack, but try to delay it */
#define TF_NODELAY 0x0004 /* don't delay packets to coalesce */
#define TF_NOOPT 0x0008 /* don't use tcp options */
#define TF_REQ_SCALE 0x0020 /* have/will request window scaling */
#define TF_RCVD_SCALE 0x0040 /* other side has requested scaling */
#define TF_REQ_TSTMP 0x0080 /* have/will request timestamps */
#define TF_RCVD_TSTMP 0x0100 /* a timestamp was received in SYN */
#define TF_SACK_PERMIT 0x0200 /* other side said I could SACK */
#define TF_SYN_REXMT 0x0400 /* rexmit timer fired on SYN */
#define TF_WILL_SACK 0x0800 /* try to use SACK */
#define TF_REASSEMBLING 0x1000 /* we're busy reassembling */
#define TF_DEAD 0x2000 /* dead and to-be-released */
#define TF_PMTUD_PEND 0x4000 /* Path MTU Discovery pending */
#define TF_ECN_PERMIT 0x10000 /* other side said is ECN-ready */
#define TF_ECN_SND_CWR 0x20000 /* ECN CWR in queue */
#define TF_ECN_SND_ECE 0x40000 /* ECN ECE in queue */
#define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */
struct mbuf *t_template; /* skeletal packet for transmit */
struct inpcb *t_inpcb; /* back pointer to internet pcb */
callout_t t_delack_ch; /* delayed ACK callout */
/*
* The following fields are used as in the protocol specification.
* See RFC793, Dec. 1981, page 21.
*/
/* send sequence variables */
tcp_seq snd_una; /* send unacknowledged */
tcp_seq snd_nxt; /* send next */
tcp_seq snd_up; /* send urgent pointer */
tcp_seq snd_wl1; /* window update seg seq number */
tcp_seq snd_wl2; /* window update seg ack number */
tcp_seq iss; /* initial send sequence number */
u_long snd_wnd; /* send window */
/*
* snd_recover
* it's basically the same as the "recover" variable in RFC 2582 (NewReno).
* when entering fast retransmit, it's set to snd_max.
* newreno uses this to detect partial ack.
* snd_high
* it's basically the same as the "send_high" variable in RFC 2582 (NewReno).
* on each RTO, it's set to snd_max.
* newreno uses this to avoid false fast retransmits.
*/
tcp_seq snd_recover;
tcp_seq snd_high;
/* receive sequence variables */
u_long rcv_wnd; /* receive window */
tcp_seq rcv_nxt; /* receive next */
tcp_seq rcv_up; /* receive urgent pointer */
tcp_seq irs; /* initial receive sequence number */
/*
* Additional variables for this implementation.
*/
/* receive variables */
tcp_seq rcv_adv; /* advertised window */
/*
* retransmit variables
*
* snd_max
* the highest sequence number we've ever sent.
* used to recognize retransmits.
*/
tcp_seq snd_max;
/* congestion control (for slow start, source quench, retransmit after loss) */
u_long snd_cwnd; /* congestion-controlled window */
u_long snd_ssthresh; /* snd_cwnd size threshold for
* slow start exponential to
* linear switch
*/
/* auto-sizing variables */
u_int rfbuf_cnt; /* recv buffer autoscaling byte count */
uint32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
/*
* transmit timing stuff. See below for scale of srtt and rttvar.
* "Variance" is actually smoothed difference.
*/
uint32_t t_rcvtime; /* time last segment received */
uint32_t t_rtttime; /* time we started measuring rtt */
tcp_seq t_rtseq; /* sequence number being timed */
int32_t t_srtt; /* smoothed round-trip time */
int32_t t_rttvar; /* variance in round-trip time */
uint32_t t_rttmin; /* minimum rtt allowed */
u_long max_sndwnd; /* largest window peer has offered */
/* out-of-band data */
char t_oobflags; /* have some */
char t_iobc; /* input character */
#define TCPOOB_HAVEDATA 0x01
#define TCPOOB_HADDATA 0x02
short t_softerror; /* possible error not yet reported */
/* RFC 1323 variables */
u_char snd_scale; /* window scaling for send window */
u_char rcv_scale; /* window scaling for recv window */
u_char request_r_scale; /* pending window scaling */
u_char requested_s_scale;
u_int32_t ts_recent; /* timestamp echo data */
u_int32_t ts_recent_age; /* when last updated */
u_int32_t ts_timebase; /* our timebase */
tcp_seq last_ack_sent;
/* RFC 3465 variables */
u_long t_bytes_acked; /* ABC "bytes_acked" parameter */
/* SACK stuff */
#define TCP_SACK_MAX 3
#define TCPSACK_NONE 0
#define TCPSACK_HAVED 1
u_char rcv_sack_flags; /* SACK flags. */
struct sackblk rcv_dsack_block; /* RX D-SACK block. */
struct ipqehead timeq; /* time sequenced queue. */
struct sackhead snd_holes; /* TX SACK holes. */
int snd_numholes; /* Number of TX SACK holes. */
tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/
tcp_seq sack_newdata; /* New data xmitted in this recovery
episode starts at this seq number*/
tcp_seq snd_fack; /* FACK TCP. Forward-most data held by
peer. */
/* CUBIC variables */
ulong snd_cubic_wmax; /* W_max */
ulong snd_cubic_wmax_last; /* Used for fast convergence */
ulong snd_cubic_ctime; /* Last congestion time */
/* pointer for syn cache entries*/
LIST_HEAD(, syn_cache) t_sc; /* list of entries by this tcb */
/* prediction of next mbuf when using large window sizes */
struct mbuf *t_lastm; /* last mbuf that data was sent from */
int t_inoff; /* data offset in previous mbuf */
int t_lastoff; /* last data address in mbuf chain */
int t_lastlen; /* last length read from mbuf chain */
/* Path-MTU discovery blackhole detection */
int t_mtudisc; /* perform mtudisc for this tcb */
/* Path-MTU Discovery Information */
u_int t_pmtud_mss_acked; /* MSS acked, lower bound for MTU */
u_int t_pmtud_mtu_sent; /* MTU used, upper bound for MTU */
tcp_seq t_pmtud_th_seq; /* TCP SEQ from ICMP payload */
u_int t_pmtud_nextmtu; /* Advertised Next-Hop MTU from ICMP */
u_short t_pmtud_ip_len; /* IP length from ICMP payload */
u_short t_pmtud_ip_hl; /* IP header length from ICMP payload */
uint8_t t_ecn_retries; /* # of ECN setup retries */
const struct tcp_congctl *t_congctl; /* per TCB congctl algorithm */
/* Keepalive per socket */
u_int t_keepinit;
u_int t_keepidle;
u_int t_keepintvl;
u_int t_keepcnt;
u_int t_maxidle; /* t_keepcnt * t_keepintvl */
u_int t_msl; /* MSL to use for this connexion */
/* maintain a few stats per connection: */
uint32_t t_rcvoopack; /* out-of-order packets received */
uint32_t t_sndrexmitpack; /* retransmit packets sent */
uint32_t t_sndzerowin; /* zero-window updates sent */
};
/*
* Macros to aid ECN TCP.
*/
#define TCP_ECN_ALLOWED(tp) (tp->t_flags & TF_ECN_PERMIT)
/*
* Macros to aid SACK/FACK TCP.
*/
#define TCP_SACK_ENABLED(tp) (tp->t_flags & TF_WILL_SACK)
#define TCP_FACK_FASTRECOV(tp) \
(TCP_SACK_ENABLED(tp) && \
(SEQ_GT(tp->snd_fack, tp->snd_una + tcprexmtthresh * tp->t_segsz)))
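/*
 * That is, fast recovery is entered once the forward-most SACKed data
 * (snd_fack) lies more than tcprexmtthresh segments beyond snd_una,
 * which is the usual duplicate-ACK threshold expressed in sequence
 * space rather than in ACK counts.
 */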
#ifdef _KERNEL
/*
* TCP reassembly queue locks.
*/
static __inline int tcp_reass_lock_try (struct tcpcb *)
__unused;
static __inline void tcp_reass_unlock (struct tcpcb *)
__unused;
static __inline int
tcp_reass_lock_try(struct tcpcb *tp)
{
int s;
/*
* Use splvm() -- we're blocking things that would cause
* mbuf allocation.
*/
s = splvm();
if (tp->t_flags & TF_REASSEMBLING) {
splx(s);
return (0);
}
tp->t_flags |= TF_REASSEMBLING;
splx(s);
return (1);
}
static __inline void
tcp_reass_unlock(struct tcpcb *tp)
{
int s;
s = splvm();
KASSERT((tp->t_flags & TF_REASSEMBLING) != 0);
tp->t_flags &= ~TF_REASSEMBLING;
splx(s);
}
#ifdef DIAGNOSTIC
#define TCP_REASS_LOCK(tp) \
do { \
if (tcp_reass_lock_try(tp) == 0) { \
printf("%s:%d: tcpcb %p reass already locked\n", \
__FILE__, __LINE__, tp); \
panic("tcp_reass_lock"); \
} \
} while (/*CONSTCOND*/ 0)
#define TCP_REASS_LOCK_CHECK(tp) \
do { \
if (((tp)->t_flags & TF_REASSEMBLING) == 0) { \
printf("%s:%d: tcpcb %p reass lock not held\n", \
__FILE__, __LINE__, tp); \
panic("tcp reass lock check"); \
} \
} while (/*CONSTCOND*/ 0)
#else
#define TCP_REASS_LOCK(tp) (void) tcp_reass_lock_try((tp))
#define TCP_REASS_LOCK_CHECK(tp) /* nothing */
#endif
#define TCP_REASS_UNLOCK(tp) tcp_reass_unlock((tp))
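/*
 * A minimal sketch of the intended locking pattern around reassembly
 * (the function name below is hypothetical; only the macros above are
 * taken from this header):
 */
#if 0 /* example only */
static void
example_reass(struct tcpcb *tp)
{
TCP_REASS_LOCK(tp); /* panics under DIAGNOSTIC if already locked */
/* ... manipulate tp->segq / tp->t_segqlen here ... */
TCP_REASS_LOCK_CHECK(tp); /* asserts TF_REASSEMBLING is still set */
TCP_REASS_UNLOCK(tp);
}
#endif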
#endif /* _KERNEL */
/*
* Queue for delayed ACK processing.
*/
#ifdef _KERNEL
extern int tcp_delack_ticks;
void tcp_delack(void *);
#define TCP_RESTART_DELACK(tp) \
callout_reset(&(tp)->t_delack_ch, tcp_delack_ticks, \
tcp_delack, tp)
#define TCP_SET_DELACK(tp) \
do { \
if (((tp)->t_flags & TF_DELACK) == 0) { \
(tp)->t_flags |= TF_DELACK; \
TCP_RESTART_DELACK(tp); \
} \
} while (/*CONSTCOND*/0)
#define TCP_CLEAR_DELACK(tp) \
do { \
if ((tp)->t_flags & TF_DELACK) { \
(tp)->t_flags &= ~TF_DELACK; \
callout_stop(&(tp)->t_delack_ch); \
} \
} while (/*CONSTCOND*/0)
#endif /* _KERNEL */
/*
* Compute the current timestamp for a connection.
*/
#define TCP_TIMESTAMP(tp) (tcp_now - (tp)->ts_timebase)
/*
* Handy way of passing around TCP option info.
*/
struct tcp_opt_info {
int ts_present;
u_int32_t ts_val;
u_int32_t ts_ecr;
u_int16_t maxseg;
};
#define TOF_SIGNATURE 0x0040 /* signature option present */
#define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
#define sototcpcb(so) (intotcpcb(sotoinpcb(so)))
/*
* See RFC2988 for a discussion of RTO calculation; comments assume
* familiarity with that document.
*
* The smoothed round-trip time and estimated variance are stored as
* fixed point numbers. Historically, srtt was scaled by
* TCP_RTT_SHIFT bits, and rttvar by TCP_RTTVAR_SHIFT bits. Because
* the values coincide with the alpha and beta parameters suggested
* for RTO calculation (1/8 for srtt, 1/4 for rttvar), the combination
* of computing 1/8 of the new value and transforming it to the
* fixed-point representation required zero instructions. However,
* the storage representations no longer coincide with the alpha/beta
* shifts; instead, more fractional bits are present.
*
* The storage representation of srtt is 1/32 slow ticks, or 1/64 s.
* (The assumption that a slow tick is 500 ms should not be present in
* the code.)
*
* The storage representation of rttvar is 1/16 slow ticks, or 1/32 s.
* There may be some confusion about this in the code.
*
* For historical reasons, these scales are also used in smoothing the
* average (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed).
* This results in alpha of 0.125 and beta of 0.25, following RFC2988
* section 2.3
*
* XXX Change SHIFT values to LGWEIGHT and REP_SHIFT, and adjust
* the code to use the correct ones.
*/
#define TCP_RTT_SHIFT 3 /* shift for srtt; 3 bits frac. */
#define TCP_RTTVAR_SHIFT 2 /* multiplier for rttvar; 2 bits */
/*
* Compute TCP retransmission timer, following RFC2988.
* This macro returns a value in slow timeout ticks.
*
* Section 2.2 requires that the RTO value be
* srtt + max(G, 4*RTTVAR)
* where G is the clock granularity.
*
* This comment has not necessarily been updated for the new storage
* representation:
*
* Because of the way we do the smoothing, srtt and rttvar
* will each average +1/2 tick of bias. When we compute
* the retransmit timer, we want 1/2 tick of rounding and
* 1 extra tick because of +-1/2 tick uncertainty in the
* firing of the timer. The bias will give us exactly the
* 1.5 tick we need. But, because the bias is
* statistical, we have to test that we don't drop below
* the minimum feasible timer (which is 2 ticks).
* This macro assumes that the value of 1<<TCP_RTTVAR_SHIFT
* is the same as the multiplier for rttvar.
*
* This macro appears to be wrong; it should be checking rttvar*4 in
* ticks and making sure we use 1 instead if rttvar*4 rounds to 0. It
* appears to be treating srtt as being in the old storage
* representation, resulting in a factor of 4 extra.
*/
#define TCP_REXMTVAL(tp) \
((((tp)->t_srtt >> TCP_RTT_SHIFT) + (tp)->t_rttvar) >> 2)
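/*
 * Worked example of the scaling above, assuming the storage representation
 * described earlier (srtt in 1/32 slow ticks, rttvar in 1/16 slow ticks):
 * with t_srtt == 64 (2 slow ticks) and t_rttvar == 8 (1/2 slow tick),
 * TCP_REXMTVAL yields ((64 >> 3) + 8) >> 2 == 4 slow timeout ticks.
 */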
/*
* Compute the initial window for slow start.
*/
#define TCP_INITIAL_WINDOW(iw, segsz) \
uimin((iw) * (segsz), uimax(2 * (segsz), tcp_init_win_max[(iw)]))
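/*
 * That is, an initial window of "iw" segments, capped at
 * uimax(2 * segsz, tcp_init_win_max[iw]) bytes; the cap itself therefore
 * never drops below two segments.
 */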
/*
* TCP statistics.
* Each counter is an unsigned 64-bit value.
*
* Many of these should be kept per connection, but that's inconvenient
* at the moment.
*/
#define TCP_STAT_CONNATTEMPT 0 /* connections initiated */
#define TCP_STAT_ACCEPTS 1 /* connections accepted */
#define TCP_STAT_CONNECTS 2 /* connections established */
#define TCP_STAT_DROPS 3 /* connections dropped */
#define TCP_STAT_CONNDROPS 4 /* embryonic connections dropped */
#define TCP_STAT_CLOSED 5 /* conn. closed (includes drops) */
#define TCP_STAT_SEGSTIMED 6 /* segs where we tried to get rtt */
#define TCP_STAT_RTTUPDATED 7 /* times we succeeded */
#define TCP_STAT_DELACK 8 /* delayed ACKs sent */
#define TCP_STAT_TIMEOUTDROP 9 /* conn. dropped in rxmt timeout */
#define TCP_STAT_REXMTTIMEO 10 /* retransmit timeouts */
#define TCP_STAT_PERSISTTIMEO 11 /* persist timeouts */
#define TCP_STAT_KEEPTIMEO 12 /* keepalive timeouts */
#define TCP_STAT_KEEPPROBE 13 /* keepalive probes sent */
#define TCP_STAT_KEEPDROPS 14 /* connections dropped in keepalive */
#define TCP_STAT_PERSISTDROPS 15 /* connections dropped in persist */
#define TCP_STAT_CONNSDRAINED 16 /* connections drained due to memory
shortage */
#define TCP_STAT_PMTUBLACKHOLE 17 /* PMTUD blackhole detected */
#define TCP_STAT_SNDTOTAL 18 /* total packets sent */
#define TCP_STAT_SNDPACK 19 /* data packets sent */
#define TCP_STAT_SNDBYTE 20 /* data bytes sent */
#define TCP_STAT_SNDREXMITPACK 21 /* data packets retransmitted */
#define TCP_STAT_SNDREXMITBYTE 22 /* data bytes retransmitted */
#define TCP_STAT_SNDACKS 23 /* ACK-only packets sent */
#define TCP_STAT_SNDPROBE 24 /* window probes sent */
#define TCP_STAT_SNDURG 25 /* packets sent with URG only */
#define TCP_STAT_SNDWINUP 26 /* window update-only packets sent */
#define TCP_STAT_SNDCTRL 27 /* control (SYN|FIN|RST) packets sent */
#define TCP_STAT_RCVTOTAL 28 /* total packets received */
#define TCP_STAT_RCVPACK 29 /* packets received in sequence */
#define TCP_STAT_RCVBYTE 30 /* bytes received in sequence */
#define TCP_STAT_RCVBADSUM 31 /* packets received with cksum errs */
#define TCP_STAT_RCVBADOFF 32 /* packets received with bad offset */
#define TCP_STAT_RCVMEMDROP 33 /* packets dropped for lack of memory */
#define TCP_STAT_RCVSHORT 34 /* packets received too short */
#define TCP_STAT_RCVDUPPACK 35 /* duplicate-only packets received */
#define TCP_STAT_RCVDUPBYTE 36 /* duplicate-only bytes received */
#define TCP_STAT_RCVPARTDUPPACK 37 /* packets with some duplicate data */
#define TCP_STAT_RCVPARTDUPBYTE 38 /* dup. bytes in part-dup. packets */
#define TCP_STAT_RCVOOPACK 39 /* out-of-order packets received */
#define TCP_STAT_RCVOOBYTE 40 /* out-of-order bytes received */
#define TCP_STAT_RCVPACKAFTERWIN 41 /* packets with data after window */
#define TCP_STAT_RCVBYTEAFTERWIN 42 /* bytes received after window */
#define TCP_STAT_RCVAFTERCLOSE 43 /* packets received after "close" */
#define TCP_STAT_RCVWINPROBE 44 /* rcvd window probe packets */
#define TCP_STAT_RCVDUPACK 45 /* rcvd duplicate ACKs */
#define TCP_STAT_RCVACKTOOMUCH 46 /* rcvd ACKs for unsent data */
#define TCP_STAT_RCVACKPACK 47 /* rcvd ACK packets */
#define TCP_STAT_RCVACKBYTE 48 /* bytes ACKed by rcvd ACKs */
#define TCP_STAT_RCVWINUPD 49 /* rcvd window update packets */
#define TCP_STAT_PAWSDROP 50 /* segments dropped due to PAWS */
#define TCP_STAT_PREDACK 51 /* times hdr predict OK for ACKs */
#define TCP_STAT_PREDDAT 52 /* times hdr predict OK for data pkts */
#define TCP_STAT_PCBHASHMISS 53 /* input packets missing PCB hash */
#define TCP_STAT_NOPORT 54 /* no socket on port */
#define TCP_STAT_BADSYN 55 /* received ACK for which we have
no SYN in compressed state */
#define TCP_STAT_DELAYED_FREE 56 /* delayed pool_put() of tcpcb */
#define TCP_STAT_SC_ADDED 57 /* # of sc entries added */
#define TCP_STAT_SC_COMPLETED 58 /* # of sc connections completed */
#define TCP_STAT_SC_TIMED_OUT 59 /* # of sc entries timed out */
#define TCP_STAT_SC_OVERFLOWED 60 /* # of sc drops due to overflow */
#define TCP_STAT_SC_RESET 61 /* # of sc drops due to RST */
#define TCP_STAT_SC_UNREACH 62 /* # of sc drops due to ICMP unreach */
#define TCP_STAT_SC_BUCKETOVERFLOW 63 /* # of sc drops due to bucket ovflow */
#define TCP_STAT_SC_ABORTED 64 /* # of sc entries aborted (no mem) */
#define TCP_STAT_SC_DUPESYN 65 /* # of duplicate SYNs received */
#define TCP_STAT_SC_DROPPED 66 /* # of SYNs dropped (no route/mem) */
#define TCP_STAT_SC_COLLISIONS 67 /* # of sc hash collisions */
#define TCP_STAT_SC_RETRANSMITTED 68 /* # of sc retransmissions */
#define TCP_STAT_SC_DELAYED_FREE 69 /* # of delayed pool_put()s */
#define TCP_STAT_SELFQUENCH 70 /* # of ENOBUFS we get on output */
#define TCP_STAT_BADSIG 71 /* # of drops due to bad signature */
#define TCP_STAT_GOODSIG 72 /* # of packets with good signature */
#define TCP_STAT_ECN_SHS 73 /* # of successful ECN handshakes */
#define TCP_STAT_ECN_CE 74 /* # of packets with CE bit */
#define TCP_STAT_ECN_ECT 75 /* # of packets with ECT(0) bit */
#define TCP_NSTATS 76
/*
* Names for TCP sysctl objects.
*/
#define TCPCTL_RFC1323 1 /* RFC1323 timestamps/scaling */
#define TCPCTL_SENDSPACE 2 /* default send buffer */
#define TCPCTL_RECVSPACE 3 /* default recv buffer */
#define TCPCTL_MSSDFLT 4 /* default seg size */
#define TCPCTL_SYN_CACHE_LIMIT 5 /* max size of comp. state engine */
#define TCPCTL_SYN_BUCKET_LIMIT 6 /* max size of hash bucket */
#if 0 /*obsoleted*/
#define TCPCTL_SYN_CACHE_INTER 7 /* interval of comp. state timer */
#endif
#define TCPCTL_INIT_WIN 8 /* initial window */
#define TCPCTL_MSS_IFMTU 9 /* mss from interface, not in_maxmtu */
#define TCPCTL_SACK 10 /* RFC2018 selective acknowledgement */
#define TCPCTL_WSCALE 11 /* RFC1323 window scaling */
#define TCPCTL_TSTAMP 12 /* RFC1323 timestamps */
#if 0 /*obsoleted*/
#define TCPCTL_COMPAT_42 13 /* 4.2BSD TCP bug work-arounds */
#endif
#define TCPCTL_CWM 14 /* Congestion Window Monitoring */
#define TCPCTL_CWM_BURSTSIZE 15 /* burst size allowed by CWM */
#define TCPCTL_ACK_ON_PUSH 16 /* ACK immediately on PUSH */
#define TCPCTL_KEEPIDLE 17 /* keepalive idle time */
#define TCPCTL_KEEPINTVL 18 /* keepalive probe interval */
#define TCPCTL_KEEPCNT 19 /* keepalive count */
#define TCPCTL_SLOWHZ 20 /* PR_SLOWHZ (read-only) */
#define TCPCTL_NEWRENO 21 /* NewReno Congestion Control */
#define TCPCTL_LOG_REFUSED 22 /* Log refused connections */
#if 0 /*obsoleted*/
#define TCPCTL_RSTRATELIMIT 23 /* RST rate limit */
#endif
#define TCPCTL_RSTPPSLIMIT 24 /* RST pps limit */
#define TCPCTL_DELACK_TICKS 25 /* # ticks to delay ACK */
#define TCPCTL_INIT_WIN_LOCAL 26 /* initial window for local nets */
#define TCPCTL_IDENT 27 /* rfc 931 identd */
#define TCPCTL_ACKDROPRATELIMIT 28 /* SYN/RST -> ACK rate limit */
#define TCPCTL_LOOPBACKCKSUM 29 /* do TCP checksum on loopback */
#define TCPCTL_STATS 30 /* TCP statistics */
#define TCPCTL_DEBUG 31 /* TCP debug sockets */
#define TCPCTL_DEBX 32 /* # of tcp debug sockets */
#define TCPCTL_DROP 33 /* drop tcp connection */
#define TCPCTL_MSL 34 /* Max Segment Life */
#ifdef _KERNEL
extern struct inpcbtable tcbtable; /* head of queue of active tcpcb's */
extern const struct pr_usrreqs tcp_usrreqs;
extern u_int32_t tcp_now; /* for RFC 1323 timestamps */
extern int tcp_do_rfc1323; /* enabled/disabled? */
extern int tcp_do_sack; /* SACK enabled/disabled? */
extern int tcp_do_win_scale; /* RFC1323 window scaling enabled/disabled? */
extern int tcp_do_timestamps; /* RFC1323 timestamps enabled/disabled? */
extern int tcp_mssdflt; /* default seg size */
extern int tcp_minmss; /* minimal seg size */
extern int tcp_msl; /* max segment life */
extern int tcp_init_win; /* initial window */
extern int tcp_init_win_local; /* initial window for local nets */
extern int tcp_init_win_max[11];/* max sizes for values of tcp_init_win_* */
extern int tcp_mss_ifmtu; /* take MSS from interface, not in_maxmtu */
extern int tcp_cwm; /* enable Congestion Window Monitoring */
extern int tcp_cwm_burstsize; /* burst size allowed by CWM */
extern int tcp_ack_on_push; /* ACK immediately on PUSH */
extern int tcp_log_refused; /* log refused connections */
extern int tcp_do_ecn; /* TCP ECN enabled/disabled? */
extern int tcp_ecn_maxretries; /* Max ECN setup retries */
extern int tcp_do_rfc1948; /* ISS by cryptographic hash */
extern int tcp_sack_tp_maxholes; /* Max holes per connection. */
extern int tcp_sack_globalmaxholes; /* Max holes per system. */
extern int tcp_sack_globalholes; /* Number of holes present. */
extern int tcp_do_abc; /* RFC3465 ABC enabled/disabled? */
extern int tcp_abc_aggressive; /* 1: L=2*SMSS 0: L=1*SMSS */
extern int tcp_msl_enable; /* enable TIME_WAIT truncation */
extern int tcp_msl_loop; /* MSL for loopback */
extern int tcp_msl_local; /* MSL for 'local' */
extern int tcp_msl_remote; /* MSL otherwise */
extern int tcp_msl_remote_threshold; /* RTT threshold */
extern int tcp_rttlocal; /* Use RTT to decide who's 'local' */
extern int tcp4_vtw_enable;
extern int tcp6_vtw_enable;
extern int tcp_vtw_was_enabled;
extern int tcp_vtw_entries;
extern int tcp_rst_ppslim;
extern int tcp_ackdrop_ppslim;
#ifdef MBUFTRACE
extern struct mowner tcp_rx_mowner;
extern struct mowner tcp_tx_mowner;
extern struct mowner tcp_reass_mowner;
extern struct mowner tcp_sock_mowner;
extern struct mowner tcp_sock_rx_mowner;
extern struct mowner tcp_sock_tx_mowner;
extern struct mowner tcp_mowner;
#endif
extern int tcp_do_autorcvbuf;
extern int tcp_autorcvbuf_inc;
extern int tcp_autorcvbuf_max;
extern int tcp_do_autosndbuf;
extern int tcp_autosndbuf_inc;
extern int tcp_autosndbuf_max;
struct secasvar;
void tcp_canceltimers(struct tcpcb *);
struct tcpcb *
tcp_close(struct tcpcb *);
int tcp_isdead(struct tcpcb *);
#ifdef INET6
void *tcp6_ctlinput(int, const struct sockaddr *, void *);
#endif
void *tcp_ctlinput(int, const struct sockaddr *, void *);
int tcp_ctloutput(int, struct socket *, struct sockopt *);
struct tcpcb *
tcp_disconnect1(struct tcpcb *);
struct tcpcb *
tcp_drop(struct tcpcb *, int);
#ifdef TCP_SIGNATURE
int tcp_signature_apply(void *, void *, u_int);
struct secasvar *tcp_signature_getsav(struct mbuf *);
int tcp_signature(struct mbuf *, struct tcphdr *, int, struct secasvar *,
char *);
#endif
void tcp_drain(void);
void tcp_drainstub(void);
void tcp_established(struct tcpcb *);
void tcp_init(void);
void tcp_init_common(unsigned);
#ifdef INET6
int tcp6_input(struct mbuf **, int *, int);
#endif
void tcp_input(struct mbuf *, int, int);
u_int tcp_hdrsz(struct tcpcb *);
u_long tcp_mss_to_advertise(const struct ifnet *, int);
void tcp_mss_from_peer(struct tcpcb *, int);
void tcp_tcpcb_template(void);
struct tcpcb *
tcp_newtcpcb(int, struct inpcb *);
void tcp_notify(struct inpcb *, int);
u_int tcp_optlen(struct tcpcb *);
int tcp_output(struct tcpcb *);
void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
void tcp_quench(struct inpcb *);
void tcp_mtudisc(struct inpcb *, int);
#ifdef INET6
void tcp6_mtudisc_callback(struct in6_addr *);
#endif
void tcpipqent_init(void);
struct ipqent *tcpipqent_alloc(void);
void tcpipqent_free(struct ipqent *);
int tcp_respond(struct tcpcb *, struct mbuf *, struct mbuf *,
struct tcphdr *, tcp_seq, tcp_seq, int);
void tcp_rmx_rtt(struct tcpcb *);
void tcp_setpersist(struct tcpcb *);
#ifdef TCP_SIGNATURE
int tcp_signature_compute(struct mbuf *, struct tcphdr *, int, int,
int, u_char *, u_int);
#endif
void tcp_fasttimo(void);
struct mbuf *
tcp_template(struct tcpcb *);
void tcp_trace(short, short, struct tcpcb *, struct mbuf *, int);
struct tcpcb *
tcp_usrclosed(struct tcpcb *);
void tcp_usrreq_init(void);
void tcp_xmit_timer(struct tcpcb *, uint32_t);
tcp_seq tcp_new_iss(struct tcpcb *);
tcp_seq tcp_new_iss1(void *, void *, u_int16_t, u_int16_t, size_t);
void tcp_sack_init(void);
void tcp_new_dsack(struct tcpcb *, tcp_seq, u_int32_t);
void tcp_sack_option(struct tcpcb *, const struct tcphdr *,
const u_char *, int);
void tcp_del_sackholes(struct tcpcb *, const struct tcphdr *);
void tcp_free_sackholes(struct tcpcb *);
void tcp_sack_adjust(struct tcpcb *tp);
struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
int tcp_sack_numblks(const struct tcpcb *);
#define TCP_SACK_OPTLEN(nblks) ((nblks) * 8 + 2 + 2)
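/*
 * 8 bytes per SACK block, plus 2 for the kind/length octets and 2 more
 * commonly spent on NOP padding for option alignment.
 */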
void tcp_statinc(u_int);
void tcp_statadd(u_int, uint64_t);
int tcp_input_checksum(int, struct mbuf *, const struct tcphdr *, int, int,
int);
int tcp_dooptions(struct tcpcb *, const u_char *, int,
struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *);
#endif
#endif /* !_NETINET_TCP_VAR_H_ */
/* $NetBSD: uvm_readahead.c,v 1.16 2023/09/23 18:21:12 ad Exp $ */
/*-
* Copyright (c)2003, 2005, 2009 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* uvm_object read-ahead
*
* TODO:
* - tune.
* - handle multiple streams.
* - find a better way to deal with PGO_LOCKED pager requests.
* (currently just ignored)
* - consider the amount of memory in the system.
* - consider the speed of the underlying device.
* - consider filesystem block size / block layout.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_readahead.c,v 1.16 2023/09/23 18:21:12 ad Exp $");
#include <sys/param.h>
#include <sys/kmem.h>
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#if defined(READAHEAD_DEBUG)
#define DPRINTF(a) printf a
#else /* defined(READAHEAD_DEBUG) */
#define DPRINTF(a) /* nothing */
#endif /* defined(READAHEAD_DEBUG) */
/*
* uvm_ractx: read-ahead context.
*/
struct uvm_ractx {
int ra_flags;
#define RA_VALID 1
off_t ra_winstart; /* window start offset */
size_t ra_winsize; /* window size */
off_t ra_next; /* next offset to read-ahead */
};
#if defined(sun2) || defined(sun3)
/* XXX: on sun2 and sun3 MAXPHYS is 0xe000 */
#undef MAXPHYS
#define MAXPHYS 0x8000 /* XXX */
#endif
#define RA_WINSIZE_INIT MAXPHYS /* initial window size */
#define RA_WINSIZE_MAX (MAXPHYS * 16) /* max window size */
#define RA_WINSIZE_SEQENTIAL RA_WINSIZE_MAX /* fixed window size used for
SEQUENTIAL hint */
#define RA_MINSIZE (MAXPHYS * 2) /* min size to start i/o */
#define RA_IOCHUNK MAXPHYS /* read-ahead i/o chunk size */
static off_t ra_startio(struct uvm_object *, off_t, size_t);
static struct uvm_ractx *ra_allocctx(void);
static void ra_freectx(struct uvm_ractx *);
/*
* uvm_ra_init: initialize readahead module.
*/
void
uvm_ra_init(void)
{
}
static struct uvm_ractx *
ra_allocctx(void)
{
return kmem_alloc(sizeof(struct uvm_ractx), KM_NOSLEEP);
}
static void
ra_freectx(struct uvm_ractx *ra)
{
kmem_free(ra, sizeof(struct uvm_ractx));
}
/*
* ra_startio: start i/o for read-ahead.
*
* => start i/o for each RA_IOCHUNK sized chunk.
* => return offset to which we started i/o.
*/
static off_t
ra_startio(struct uvm_object *uobj, off_t off, size_t sz)
{
const off_t endoff = off + sz;
DPRINTF(("%s: uobj=%p, off=%" PRIu64 ", endoff=%" PRIu64 "\n",
__func__, uobj, off, endoff));
KASSERT(rw_write_held(uobj->vmobjlock));
/*
* Don't issue read-ahead if the last page of the range is already cached.
* The assumption is that since the access is sequential, the intermediate
* pages would have similar LRU stats, and hence are likely to still be in cache
* too. This speeds up I/O using cache, since it avoids lookups and temporary
* allocations done by full pgo_get.
*/
struct vm_page *pg = uvm_pagelookup(uobj, trunc_page(endoff - 1));
if (pg != NULL) {
DPRINTF(("%s: off=%" PRIu64 ", sz=%zu already cached\n",
__func__, off, sz));
return endoff;
}
off = trunc_page(off);
while (off < endoff) {
const size_t chunksize = RA_IOCHUNK;
int error;
size_t donebytes;
int npages;
int orignpages;
size_t bytelen;
KASSERT((chunksize & (chunksize - 1)) == 0);
KASSERT((off & PAGE_MASK) == 0);
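/*
 * Trim the first chunk so that i/o ends on a chunksize boundary;
 * e.g. with chunksize 0x10000 and off 0x13000 this gives
 * bytelen = 0x20000 - 0x13000 = 0xd000. Later iterations start
 * chunk-aligned and read exactly chunksize bytes.
 */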
bytelen = ((off + chunksize) & -(off_t)chunksize) - off;
KASSERT((bytelen & PAGE_MASK) == 0);
npages = orignpages = bytelen >> PAGE_SHIFT;
KASSERT(npages != 0);
/*
* use UVM_ADV_RANDOM to avoid recursion.
*/
error = (*uobj->pgops->pgo_get)(uobj, off, NULL,
&npages, 0, VM_PROT_READ, UVM_ADV_RANDOM, PGO_NOTIMESTAMP);
rw_enter(uobj->vmobjlock, RW_WRITER);
DPRINTF(("%s: off=%" PRIu64 ", bytelen=%zu -> %d\n",
__func__, off, bytelen, error));
if (error != 0 && error != EBUSY) {
if (error != EINVAL) { /* maybe past EOF */
DPRINTF(("%s: error=%d\n", __func__, error));
}
break;
}
KASSERT(orignpages == npages);
donebytes = orignpages << PAGE_SHIFT;
off += donebytes;
}
return off;
}
/* ------------------------------------------------------------ */
/*
* uvm_ra_allocctx: allocate a context.
*/
struct uvm_ractx *
uvm_ra_allocctx(void)
{
struct uvm_ractx *ra;
ra = ra_allocctx();
if (ra != NULL) {
ra->ra_flags = 0;
}
return ra;
}
/*
* uvm_ra_freectx: free a context.
*/
void
uvm_ra_freectx(struct uvm_ractx *ra)
{
KASSERT(ra != NULL);
ra_freectx(ra);
}
/*
* uvm_ra_request: update a read-ahead context and start i/o if appropriate.
*
* => called when [reqoff, reqoff+reqsize) is requested.
* => object must be locked by caller, will return locked.
*/
void
uvm_ra_request(struct uvm_ractx *ra, int advice, struct uvm_object *uobj,
off_t reqoff, size_t reqsize)
{
KASSERT(rw_write_held(uobj->vmobjlock));
if (ra == NULL || advice == UVM_ADV_RANDOM) {
return;
}
if (advice == UVM_ADV_SEQUENTIAL) {
/*
* always do read-ahead with a large window.
*/
if ((ra->ra_flags & RA_VALID) == 0) {
ra->ra_winstart = ra->ra_next = 0;
ra->ra_flags |= RA_VALID;
}
if (reqoff < ra->ra_winstart) {
ra->ra_next = reqoff;
}
ra->ra_winsize = RA_WINSIZE_SEQENTIAL;
goto do_readahead;
}
/*
* a request with UVM_ADV_NORMAL hint. (ie. no hint)
*
* we keep a sliding window in order to determine:
* - if the previous read-ahead was successful or not.
* - how many bytes to read-ahead.
*/
/*
* if it's the first request for this context,
* initialize context and return.
*/
if ((ra->ra_flags & RA_VALID) == 0) {
initialize:
ra->ra_winstart = ra->ra_next = reqoff + reqsize;
ra->ra_winsize = RA_WINSIZE_INIT;
ra->ra_flags |= RA_VALID;
goto done;
}
/*
* if it isn't in our window,
* initialize context and return.
* (read-ahead miss)
*/
if (reqoff < ra->ra_winstart ||
ra->ra_winstart + ra->ra_winsize < reqoff) {
/*
* ... unless we seem to be reading the same chunk repeatedly.
*
* XXX should have some margin?
*/
if (reqoff + reqsize == ra->ra_winstart) {
DPRINTF(("%s: %p: same block: off=%" PRIu64
", size=%zd, winstart=%" PRIu64 "\n",
__func__, ra, reqoff, reqsize, ra->ra_winstart));
goto done;
}
goto initialize;
}
/*
* it's in our window. (read-ahead hit)
* - start read-ahead i/o if appropriate.
* - advance and enlarge window.
*/
do_readahead:
/*
* don't bother to read-ahead behind current request.
*/
if (reqoff > ra->ra_next) {
ra->ra_next = reqoff;
}
/*
* try to make [reqoff, reqoff+ra_winsize) in-core.
* note that [reqoff, ra_next) is considered already done.
*/
if (reqoff + ra->ra_winsize > ra->ra_next) {
off_t raoff = MAX(reqoff, ra->ra_next);
size_t rasize = reqoff + ra->ra_winsize - ra->ra_next;
#if defined(DIAGNOSTIC)
if (rasize > RA_WINSIZE_MAX) {
printf("%s: corrupted context", __func__);
rasize = RA_WINSIZE_MAX;
}
#endif /* defined(DIAGNOSTIC) */
/*
* issue read-ahead only if we can start big enough i/o.
* otherwise we end up with a stream of small i/o.
*/
if (rasize >= RA_MINSIZE) {
off_t next;
next = ra_startio(uobj, raoff, rasize);
ra->ra_next = next;
}
}
/*
* update window.
*
* enlarge window by reqsize, so that it grows in a predictable manner
* regardless of the size of each read(2).
*/
ra->ra_winstart = reqoff + reqsize;
ra->ra_winsize = MIN(RA_WINSIZE_MAX, ra->ra_winsize + reqsize);
done:;
}
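/*
 * A minimal sketch of the intended calling pattern (the function name is
 * hypothetical; only the uvm_ra_* calls, the advice values and the
 * vmobjlock protocol are taken from the code above):
 */
#if 0 /* example only */
static void
example_read(struct uvm_object *uobj, struct uvm_ractx *ra,
off_t off, size_t len)
{
/* ra would have been allocated once with uvm_ra_allocctx() */
rw_enter(uobj->vmobjlock, RW_WRITER);
/* UVM_ADV_RANDOM disables read-ahead; UVM_ADV_SEQUENTIAL forces a large window */
uvm_ra_request(ra, UVM_ADV_NORMAL, uobj, off, len);
/* ... fetch [off, off + len) via pgo_get as usual ... */
rw_exit(uobj->vmobjlock);
/* uvm_ra_freectx(ra) is called once the consumer is done with the context */
}
#endif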
int
uvm_readahead(struct uvm_object *uobj, off_t off, off_t size)
{
/*
* don't allow too much read-ahead.
*/
if (size > RA_WINSIZE_MAX) {
size = RA_WINSIZE_MAX;
}
rw_enter(uobj->vmobjlock, RW_WRITER);
ra_startio(uobj, off, size);
rw_exit(uobj->vmobjlock);
return 0;
}
/* $NetBSD: exec_elf32.c,v 1.143 2019/11/20 19:37:53 pgoyette Exp $ */
/*
* Copyright (c) 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou
* for the NetBSD Project.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_elf32.c,v 1.143 2019/11/20 19:37:53 pgoyette Exp $");
#define ELFSIZE 32
#include "exec_elf.c"
#include <sys/module.h>
#define ELF32_AUXSIZE (ELF_AUX_ENTRIES * sizeof(Aux32Info) \
+ MAXPATHLEN + ALIGN(1))
MODULE(MODULE_CLASS_EXEC, exec_elf32, NULL);
static struct execsw exec_elf32_execsw[] = {
{
.es_hdrsz = sizeof (Elf32_Ehdr),
.es_makecmds = exec_elf32_makecmds,
.u = {
.elf_probe_func = netbsd_elf32_probe,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_FIRST,
.es_arglen = ELF32_AUXSIZE,
.es_copyargs = elf32_copyargs,
.es_setregs = NULL,
.es_coredump = coredump_elf32,
.es_setup_stack = exec_setup_stack,
},
#if EXEC_ELF_NOTELESS
{
.es_hdrsz = sizeof (Elf32_Ehdr),
.es_makecmds = exec_elf32_makecmds,
.u = {
.elf_probe_func = NULL,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_LAST,
.es_arglen = ELF32_AUXSIZE,
.es_copyargs = elf32_copyargs,
.es_setregs = NULL,
.es_coredump = coredump_elf32,
.es_setup_stack = exec_setup_stack,
},
#endif
};
static int
exec_elf32_modcmd(modcmd_t cmd, void *arg)
{
#if ARCH_ELFSIZE == 64
/*
* If we are on a 64bit system, we don't want the 32bit execsw[] to be
* added in the global array, because the exec_elf32 module only works
* on 32bit systems.
*
* However, we need the exec_elf32 module, because it will make the 32bit
* functions available for netbsd32 and linux32.
*
* Therefore, allow this module on 64bit systems, but make it dormant.
*/
(void)exec_elf32_execsw; /* unused */
switch (cmd) {
case MODULE_CMD_INIT:
case MODULE_CMD_FINI:
return 0;
default:
return ENOTTY;
}
#else /* ARCH_ELFSIZE == 64 */
switch (cmd) {
case MODULE_CMD_INIT:
return exec_add(exec_elf32_execsw,
__arraycount(exec_elf32_execsw));
case MODULE_CMD_FINI:
return exec_remove(exec_elf32_execsw,
__arraycount(exec_elf32_execsw));
default:
return ENOTTY;
}
#endif /* ARCH_ELFSIZE == 64 */
}
/* $NetBSD: in6_src.c,v 1.92 2023/08/03 04:24:55 ozaki-r Exp $ */
/* $KAME: in6_src.c,v 1.159 2005/10/19 01:40:32 t-momose Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6_src.c,v 1.92 2023/08/03 04:24:55 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/portalgo.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/nd6.h>
#include <netinet6/scope6_var.h>
#ifdef MIP6
#include <netinet6/mip6.h>
#include <netinet6/mip6_var.h>
#include "mip.h"
#if NMIP > 0
#include <net/if_mip.h>
#endif /* NMIP > 0 */
#endif /* MIP6 */
#include <netinet/tcp_vtw.h>
#define ADDR_LABEL_NOTAPP (-1)
struct in6_addrpolicy defaultaddrpolicy;
int ip6_prefer_tempaddr = 0;
static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route *, struct ifnet **, struct psref *);
static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *);
static void init_policy_queue(void);
static int add_addrsel_policyent(struct in6_addrpolicy *);
static int delete_addrsel_policyent(struct in6_addrpolicy *);
static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *),
void *);
static int dump_addrsel_policyent(struct in6_addrpolicy *, void *);
static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *);
#define IFA6_IS_VALIDATED(ia) \
(((ia)->ia6_flags & (IN6_IFF_TENTATIVE | IN6_IFF_DETACHED)) == 0)
/*
* Return an IPv6 address, which is the most appropriate for a given
* destination and user specified options.
* If necessary, this function looks up the routing table and returns
* an entry to the caller for later use.
*/
#if 0 /* disabled ad-hoc */
#define REPLACE(r) do {\
char _buf1[INET6_ADDRSTRLEN], _buf2[INET6_ADDRSTRLEN]; \
if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \
sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \
ip6stat.ip6s_sources_rule[(r)]++; \
printf("%s: replace %s with %s by %d\n", __func__, ia_best ? \
IN6_PRINT(_buf1, &ia_best->ia_addr.sin6_addr) : "none", \
IN6_PRINT(_buf2, &ia->ia_addr.sin6_addr), (r)); \
goto replace; \
} while(/*CONSTCOND*/0)
#define NEXT(r) do {\
char _buf1[INET6_ADDRSTRLEN], _buf2[INET6_ADDRSTRLEN]; \
if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \
sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \
ip6stat.ip6s_sources_rule[(r)]++; \
printf("%s: keep %s against %s by %d\n", __func__, ia_best ? \
IN6_PRINT(_buf1, &ia_best->ia_addr.sin6_addr) : "none", \
IN6_PRINT(_buf2, &ia->ia_addr.sin6_addr), (r)); \
goto next; /* XXX: we can't use 'continue' here */ \
} while(/*CONSTCOND*/0)
#define BREAK(r) do { \
if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \
sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \
ip6stat.ip6s_sources_rule[(r)]++; \
goto out; /* XXX: we can't use 'break' here */ \
} while(/*CONSTCOND*/0)
#else
#define REPLACE(r) goto replace
#define NEXT(r) goto next
#define BREAK(r) goto out
#endif
/*
* Called inside pserialize critical section. Don't sleep/block.
*/
static struct in6_ifaddr *
in6_select_best_ia(struct sockaddr_in6 *dstsock, struct in6_addr *dst,
const struct ifnet *ifp, const struct ip6_pktopts *opts,
const u_int32_t odstzone)
{
struct in6_ifaddr *ia, *ia_best = NULL;
int dst_scope = -1, best_scope = -1, best_matchlen = -1;
struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL;
IN6_ADDRLIST_READER_FOREACH(ia) {
int new_scope = -1, new_matchlen = -1;
struct in6_addrpolicy *new_policy = NULL;
u_int32_t srczone, osrczone, dstzone;
struct in6_addr src;
struct ifnet *ifp1 = ia->ia_ifp;
int prefer_tempaddr;
/*
* We'll never take an address that breaks the scope zone
* of the destination. We also skip an address if its zone
* does not contain the outgoing interface.
* XXX: we should probably use sin6_scope_id here.
*/
if (in6_setscope(dst, ifp1, &dstzone) ||
odstzone != dstzone) {
continue;
}
src = ia->ia_addr.sin6_addr;
/* Skip the scope test in impossible cases */
if (!(ifp->if_flags & IFF_LOOPBACK) &&
IN6_IS_ADDR_LOOPBACK(&src))
continue;
if (in6_setscope(&src, ifp, &osrczone) ||
in6_setscope(&src, ifp1, &srczone) ||
osrczone != srczone) {
continue;
}
/* avoid unusable addresses */
if ((ia->ia6_flags & (IN6_IFF_DUPLICATED | IN6_IFF_ANYCAST)))
continue;
if (!ip6_use_deprecated && IFA6_IS_DEPRECATED(ia))
continue;
#if defined(MIP6) && NMIP > 0
/* avoid unusable home addresses. */
if ((ia->ia6_flags & IN6_IFF_HOME) &&
!mip6_ifa6_is_addr_valid_hoa(ia))
continue;
#endif /* MIP6 && NMIP > 0 */
/* Rule 1: Prefer same address */
if (IN6_ARE_ADDR_EQUAL(dst, &ia->ia_addr.sin6_addr)) {
ia_best = ia;
BREAK(1); /* there should be no better candidate */
}
if (ia_best == NULL)
REPLACE(1);
/* Rule 2: Prefer appropriate scope */
if (dst_scope < 0)
dst_scope = in6_addrscope(dst);
new_scope = in6_addrscope(&ia->ia_addr.sin6_addr);
if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0)
REPLACE(2);
NEXT(2);
} else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0)
NEXT(2);
REPLACE(2);
}
/*
* Rule 3: Avoid deprecated addresses. Note that the case of
* !ip6_use_deprecated is already rejected above.
* Treat unvalidated addresses as deprecated here.
*/
if (IFA6_IS_VALIDATED(ia_best) && !IFA6_IS_VALIDATED(ia))
NEXT(3);
if (!IFA6_IS_VALIDATED(ia_best) && IFA6_IS_VALIDATED(ia))
REPLACE(3);
if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia))
NEXT(3);
if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia))
REPLACE(3);
/* Rule 4: Prefer home addresses */
#if defined(MIP6) && NMIP > 0
if (!MIP6_IS_MN)
goto skip_rule4;
if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 &&
(ia->ia6_flags & IN6_IFF_HOME) == 0) {
/* neither address is a home address. */
goto skip_rule4;
}
/*
* If SA is simultaneously a home address and care-of
* address and SB is not, then prefer SA. Similarly,
* if SB is simultaneously a home address and care-of
* address and SA is not, then prefer SB.
*/
if (((ia_best->ia6_flags & IN6_IFF_HOME) != 0 &&
ia_best->ia_ifp->if_type != IFT_MIP)
&&
((ia->ia6_flags & IN6_IFF_HOME) != 0 &&
ia->ia_ifp->if_type == IFT_MIP))
NEXT(4);
if (((ia_best->ia6_flags & IN6_IFF_HOME) != 0 &&
ia_best->ia_ifp->if_type == IFT_MIP)
&&
((ia->ia6_flags & IN6_IFF_HOME) != 0 &&
ia->ia_ifp->if_type != IFT_MIP))
REPLACE(4);
if (ip6po_usecoa == 0) {
/*
* If SA is just a home address and SB is just
* a care-of address, then prefer
* SA. Similarly, if SB is just a home address
* and SA is just a care-of address, then
* prefer SB.
*/
if ((ia_best->ia6_flags & IN6_IFF_HOME) != 0 &&
(ia->ia6_flags & IN6_IFF_HOME) == 0) {
NEXT(4);
}
if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 &&
(ia->ia6_flags & IN6_IFF_HOME) != 0) {
REPLACE(4);
}
} else {
/*
* the sender doesn't want to use a home address
* because:
*
* 1) we cannot use it (e.g. NS or NA to global
* addresses), or
*
* 2) the user asked not to use it
* (e.g. mip6control -u).
*/
if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 &&
(ia->ia6_flags & IN6_IFF_HOME) != 0) {
/* XXX breaks stat */
NEXT(0);
}
if ((ia_best->ia6_flags & IN6_IFF_HOME) != 0 &&
(ia->ia6_flags & IN6_IFF_HOME) == 0) {
/* XXX breaks stat */
REPLACE(0);
}
}
skip_rule4:
#endif /* MIP6 && NMIP > 0 */
/* Rule 5: Prefer outgoing interface */
if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp)
NEXT(5);
if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp)
REPLACE(5);
/*
* Rule 6: Prefer matching label
* Note that best_policy should be non-NULL here.
*/
if (dst_policy == NULL)
dst_policy = lookup_addrsel_policy(dstsock);
if (dst_policy->label != ADDR_LABEL_NOTAPP) {
new_policy = lookup_addrsel_policy(&ia->ia_addr);
if (dst_policy->label == best_policy->label &&
dst_policy->label != new_policy->label)
NEXT(6);
if (dst_policy->label != best_policy->label &&
dst_policy->label == new_policy->label)
REPLACE(6);
}
/*
* Rule 7: Prefer public addresses.
* We allow users to reverse the logic by configuring
* a sysctl variable, so that privacy conscious users can
* always prefer temporary addresses.
*/
if (opts == NULL ||
opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) {
prefer_tempaddr = ip6_prefer_tempaddr;
} else if (opts->ip6po_prefer_tempaddr ==
IP6PO_TEMPADDR_NOTPREFER) {
prefer_tempaddr = 0;
} else
prefer_tempaddr = 1;
if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
REPLACE(7);
else
NEXT(7);
}
if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
!(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
NEXT(7);
else
REPLACE(7);
}
/*
* Rule 8: prefer addresses on alive interfaces.
* This is a KAME specific rule.
*/
if ((ia_best->ia_ifp->if_flags & IFF_UP) &&
!(ia->ia_ifp->if_flags & IFF_UP))
NEXT(8);
if (!(ia_best->ia_ifp->if_flags & IFF_UP) &&
(ia->ia_ifp->if_flags & IFF_UP))
REPLACE(8);
/*
* Rule 9: prefer addresses on "preferred" interfaces.
* This is a KAME specific rule.
*/
#ifdef notyet /* until introducing address selection */
#define NDI_BEST ND_IFINFO(ia_best->ia_ifp)
#define NDI_NEW ND_IFINFO(ia->ia_ifp)
if ((NDI_BEST->flags & ND6_IFF_PREFER_SOURCE) &&
!(NDI_NEW->flags & ND6_IFF_PREFER_SOURCE))
NEXT(9);
if (!(NDI_BEST->flags & ND6_IFF_PREFER_SOURCE) &&
(NDI_NEW->flags & ND6_IFF_PREFER_SOURCE))
REPLACE(9);
#undef NDI_BEST
#undef NDI_NEW
#endif
/*
* Rule 14: Use longest matching prefix.
* Note: in the address selection draft, this rule is
* documented as "Rule 8". However, since it is also
* documented that this rule can be overridden, we assign
* a large number so that it is easy to assign smaller numbers
* to more preferred rules.
*/
new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, dst);
if (best_matchlen < new_matchlen)
REPLACE(14);
if (new_matchlen < best_matchlen)
NEXT(14);
/* Rule 15 is reserved. */
/*
* Last resort: just keep the current candidate.
* Or, do we need more rules?
*/
continue;
replace:
ia_best = ia;
best_scope = (new_scope >= 0 ? new_scope :
in6_addrscope(&ia_best->ia_addr.sin6_addr));
best_policy = (new_policy ? new_policy :
lookup_addrsel_policy(&ia_best->ia_addr));
best_matchlen = (new_matchlen >= 0 ? new_matchlen :
in6_matchlen(&ia_best->ia_addr.sin6_addr,
dst));
next:
continue;
out:
break;
}
return ia_best;
}
#undef REPLACE
#undef BREAK
#undef NEXT
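/*
 * Source address selection for an IPv6 destination: a source supplied
 * via the IPV6_PKTINFO option and an already-bound local address take
 * precedence; otherwise the best address on the outgoing interface is
 * chosen by in6_select_best_ia().  The result is stored in *ret_ia6;
 * if ifpp is non-NULL, the outgoing interface is returned there with a
 * reference held via *psref.
 */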
int
in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route *ro, struct in6_addr *laddr,
struct ifnet **ifpp, struct psref *psref, struct in6_addr *ret_ia6)
{
struct in6_addr dst;
struct ifnet *ifp = NULL;
struct in6_ifaddr *ia = NULL;
struct in6_pktinfo *pi = NULL;
u_int32_t odstzone;
int error = 0, iferror;
#if defined(MIP6) && NMIP > 0
u_int8_t ip6po_usecoa = 0;
#endif /* MIP6 && NMIP > 0 */
struct psref local_psref;
int bound = curlwp_bind();
#define PSREF (psref == NULL) ? &local_psref : psref
int s;
KASSERT((ifpp != NULL && psref != NULL) ||
(ifpp == NULL && psref == NULL));
dst = dstsock->sin6_addr; /* make a copy for local operation */
if (ifpp)
*ifpp = NULL;
/*
* Try to determine the outgoing interface for the given destination.
* We do this regardless of whether the socket is bound, since the
* caller may need this information as a side effect of the call
* to this function (e.g., for identifying the appropriate scope zone
* ID).
*/
iferror = in6_selectif(dstsock, opts, mopts, ro, &ifp, PSREF);
if (ifpp != NULL)
*ifpp = ifp;
/*
* If the source address is explicitly specified by the caller,
* check if the requested source address is indeed a unicast address
* assigned to the node, and can be used as the packet's source
* address. If everything is okay, use the address as source.
*/
if (opts && (pi = opts->ip6po_pktinfo) &&
!IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) {
struct sockaddr_in6 srcsock;
struct in6_ifaddr *ia6;
int _s;
struct ifaddr *ifa;
/*
* Determine the appropriate zone id of the source based on
* the zone of the destination and the outgoing interface.
* If the specified address is ambiguous wrt the scope zone,
* the interface must be specified; otherwise, ifa_ifwithaddr()
* will fail matching the address.
*/
memset(&srcsock, 0, sizeof(srcsock));
srcsock.sin6_family = AF_INET6;
srcsock.sin6_len = sizeof(srcsock);
srcsock.sin6_addr = pi->ipi6_addr;
if (ifp) {
error = in6_setscope(&srcsock.sin6_addr, ifp, NULL);
if (error != 0)
goto exit;
}
_s = pserialize_read_enter();
ifa = ifa_ifwithaddr(sin6tosa(&srcsock));
if ((ia6 = ifatoia6(ifa)) == NULL ||
ia6->ia6_flags &
(IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) {
pserialize_read_exit(_s);
error = EADDRNOTAVAIL;
goto exit;
}
pi->ipi6_addr = srcsock.sin6_addr; /* XXX: this overrides pi */
if (ifpp)
*ifpp = ifp;
*ret_ia6 = ia6->ia_addr.sin6_addr;
pserialize_read_exit(_s);
goto exit;
}
/*
* If the socket has already bound the source, just use it. We don't
* care at the moment whether in6_selectif() succeeded above, even
* though it would eventually cause an error.
*/
if (laddr && !IN6_IS_ADDR_UNSPECIFIED(laddr)) {
*ret_ia6 = *laddr;
goto exit;
}
/*
* The outgoing interface is crucial in the general selection procedure
* below. If it is not known at this point, we fail.
*/
if (ifp == NULL) {
error = iferror;
goto exit;
}
/*
* If the address is not yet determined, choose the best one based on
* the outgoing interface and the destination address.
*/
#if defined(MIP6) && NMIP > 0
/*
* A caller can specify IP6PO_USECOA to avoid using a home
* address, for example when performing neighbour
* unreachability detection toward a global address.
*/
if (opts != NULL &&
(opts->ip6po_flags & IP6PO_USECOA) != 0) {
ip6po_usecoa = 1;
}
#endif /* MIP6 && NMIP > 0 */
error = in6_setscope(&dst, ifp, &odstzone);
if (error != 0)
goto exit;
s = pserialize_read_enter();
ia = in6_select_best_ia(dstsock, &dst, ifp, opts, odstzone);
if (ia == NULL) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
goto exit;
}
*ret_ia6 = ia->ia_addr.sin6_addr;
pserialize_read_exit(s);
exit:
if (ifpp == NULL)
if_put(ifp, PSREF);
curlwp_bindx(bound);
return error;
#undef PSREF
}
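/*
 * Route selection for an IPv6 destination: if the caller specified a
 * next hop (IP6PO_NEXTHOP), it must be an on-link IPv6 neighbor and its
 * cached route is used; otherwise the route cache in *ro is consulted
 * (using the embedded IPv4 destination for v4-mapped addresses).  On
 * success the route is returned in *retrt; EHOSTUNREACH failures bump
 * the ip6 "no route" statistic, and discarded packets are counted per
 * interface when count_discard is set.
 */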
int
in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct route **ro, struct rtentry **retrt, bool count_discard)
{
int error = 0;
struct rtentry *rt = NULL;
union {
struct sockaddr dst;
struct sockaddr_in dst4;
struct sockaddr_in6 dst6;
} u;
KASSERT(ro != NULL);
KASSERT(*ro != NULL);
KASSERT(retrt != NULL);
#if 0
if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
dstsock->sin6_addr.s6_addr32[1] == 0 &&
!IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
char ip6buf[INET6_ADDRSTRLEN];
printf("%s: strange destination %s\n", __func__,
IN6_PRINT(ip6buf, &dstsock->sin6_addr));
} else {
char ip6buf[INET6_ADDRSTRLEN];
printf("%s: destination = %s%%%d\n", __func__,
IN6_PRINT(ip6buf, &dstsock->sin6_addr),
dstsock->sin6_scope_id); /* for debug */
}
#endif
/*
* If the next hop address for the packet is specified by the caller,
* use it as the gateway.
*/
if (opts && opts->ip6po_nexthop) {
struct route *ron;
struct sockaddr_in6 *sin6_next;
sin6_next = satosin6(opts->ip6po_nexthop);
/* at this moment, we only support AF_INET6 next hops */
if (sin6_next->sin6_family != AF_INET6) {
IP6_STATINC(IP6_STAT_ODROPPED);
error = EAFNOSUPPORT; /* or should we proceed? */
goto done;
}
/*
* If the next hop is an IPv6 address, then the node identified
* by that address must be a neighbor of the sending host.
*/
ron = &opts->ip6po_nextroute;
rt = rtcache_lookup(ron, sin6tosa(sin6_next));
if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) != 0 ||
!nd6_is_addr_neighbor(sin6_next, rt->rt_ifp)) {
if (rt != NULL) {
if (count_discard)
in6_ifstat_inc(rt->rt_ifp,
ifs6_out_discard);
rtcache_unref(rt, ron);
rt = NULL;
}
rtcache_free(ron);
error = EHOSTUNREACH;
goto done;
}
*ro = ron;
goto done;
}
/*
* Use a cached route if it exists and is valid, else try to allocate
* a new one. Note that we should check the address family of the
* cached destination, in case of sharing the cache with IPv4.
*
* for V4 mapped addresses we want to pick up the v4 route
* see PR kern/56348
*/
if (IN6_IS_ADDR_V4MAPPED(&dstsock->sin6_addr)) {
in6_sin6_2_sin(&u.dst4, dstsock);
} else {
u.dst6 = *dstsock;
u.dst6.sin6_scope_id = 0;
}
rt = rtcache_lookup1(*ro, &u.dst, 1);
if (rt == NULL)
error = EHOSTUNREACH;
/*
* Check if the outgoing interface conflicts with
* the interface specified by ipi6_ifindex (if specified).
* Note that loopback interface is always okay.
* (this may happen when we are sending a packet to one of
* our own addresses.)
*/
if (opts && opts->ip6po_pktinfo && opts->ip6po_pktinfo->ipi6_ifindex) {
if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_LOOPBACK) &&
rt->rt_ifp->if_index != opts->ip6po_pktinfo->ipi6_ifindex) {
if (count_discard)
in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard);
error = EHOSTUNREACH;
rtcache_unref(rt, *ro);
rt = NULL;
}
}
done:
if (error == EHOSTUNREACH)
IP6_STATINC(IP6_STAT_NOROUTE);
*retrt = rt;
return error;
}
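/*
 * Determine the outgoing interface for a destination, in order of
 * preference: the interface named by IPV6_PKTINFO, the multicast
 * interface from the socket's multicast options, and finally the
 * interface of the selected route.  The interface is returned in
 * *retifp with a psref reference held.
 */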
static int
in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route *ro, struct ifnet **retifp,
struct psref *psref)
{
int error = 0;
struct rtentry *rt = NULL;
struct in6_addr *dst;
struct in6_pktinfo *pi = NULL;
KASSERT(retifp != NULL);
*retifp = NULL;
dst = &dstsock->sin6_addr;
/* If the caller specifies the outgoing interface explicitly, use it. */
if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
/* XXX boundary check is assumed to be already done. */
*retifp = if_get_byindex(pi->ipi6_ifindex, psref);
if (*retifp != NULL)
return 0;
goto getroute;
}
/*
* If the destination address is a multicast address and the outgoing
* interface for the address is specified by the caller, use it.
*/
if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL) {
*retifp = if_get_byindex(mopts->im6o_multicast_if_index, psref);
if (*retifp != NULL)
return 0; /* we do not need a route for multicast. */
}
getroute:
error = in6_selectroute(dstsock, opts, &ro, &rt, false);
if (error != 0)
return error;
*retifp = if_get_byindex(rt->rt_ifp->if_index, psref);
/*
* do not use a rejected or black hole route.
* XXX: this check should be done in the L2 output routine.
* However, if we skipped this check here, we'd see the following
* scenario:
* - install a rejected route for a scoped address prefix
* (like fe80::/10)
* - send a packet to a destination that matches the scoped prefix,
* with ambiguity about the scope zone.
* - pick the outgoing interface from the route, and disambiguate the
* scope zone with the interface.
* - ip6_output() would try to get another route with the "new"
* destination, which may be valid.
* - we'd see no error on output.
* Although this may not be very harmful, it can still be confusing.
* We thus reject the case here.
*/
if ((rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) {
error = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
/* XXX: ifp can be returned with psref even if error */
goto out;
}
/*
* Adjust the "outgoing" interface. If we're going to loop the packet
* back to ourselves, the ifp would be the loopback interface.
* However, we'd rather know the interface associated to the
* destination address (which should probably be one of our own
* addresses.)
*/
if (rt->rt_ifa->ifa_ifp != *retifp &&
!if_is_deactivated(rt->rt_ifa->ifa_ifp)) {
if_put(*retifp, psref);
*retifp = rt->rt_ifa->ifa_ifp;
if_acquire(*retifp, psref);
}
out:
rtcache_unref(rt, ro);
return error;
}
/*
* Default hop limit selection. The precedence is as follows:
* 1. Hoplimit value specified via ioctl.
* 2. (If the outgoing interface is detected) the current
* hop limit of the interface specified by router advertisement.
* 3. The system default hoplimit.
*/
int
in6pcb_selecthlim(struct inpcb *inp, struct ifnet *ifp)
{
if (inp && in6p_hops6(inp) >= 0)
return in6p_hops6(inp);
else if (ifp)
return (ND_IFINFO(ifp)->chlim);
else
return (ip6_defhlim);
}
int
in6pcb_selecthlim_rt(struct inpcb *inp)
{
struct rtentry *rt;
if (inp == NULL)
return in6pcb_selecthlim(inp, NULL);
rt = rtcache_validate(&inp->inp_route);
if (rt != NULL) {
int ret = in6pcb_selecthlim(inp, rt->rt_ifp);
rtcache_unref(rt, &inp->inp_route);
return ret;
} else
return in6pcb_selecthlim(inp, NULL);
}
/*
* Find an empty port and set it to the specified PCB.
*/
int
in6pcb_set_port(struct sockaddr_in6 *sin6, struct inpcb *inp, struct lwp *l)
{
struct socket *so = inp->inp_socket;
struct inpcbtable *table = inp->inp_table;
u_int16_t lport, *lastport;
enum kauth_network_req req;
int error = 0;
if (inp->inp_flags & IN6P_LOWPORT) {
#ifndef IPNOPRIVPORTS
req = KAUTH_REQ_NETWORK_BIND_PRIVPORT;
#else
req = KAUTH_REQ_NETWORK_BIND_PORT;
#endif
lastport = &table->inpt_lastlow;
} else {
req = KAUTH_REQ_NETWORK_BIND_PORT;
lastport = &table->inpt_lastport;
}
/* XXX-kauth: KAUTH_REQ_NETWORK_BIND_AUTOASSIGN_{,PRIV}PORT */
error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_BIND, req, so,
sin6, NULL);
if (error)
return (EACCES);
/*
* Use RFC6056 randomized port selection
*/
error = portalgo_randport(&lport, inp, l->l_cred);
if (error)
return error;
inp->inp_flags |= IN6P_ANONPORT;
*lastport = lport;
inp->inp_lport = htons(lport);
in6pcb_set_state(inp, INP_BOUND);
return (0); /* success */
}
void
addrsel_policy_init(void)
{
init_policy_queue();
/* initialize the "last resort" policy */
memset(&defaultaddrpolicy, 0, sizeof(defaultaddrpolicy));
defaultaddrpolicy.label = ADDR_LABEL_NOTAPP;
}
/*
* XXX: NOMPSAFE if a policy is set
*/
static struct in6_addrpolicy *
lookup_addrsel_policy(struct sockaddr_in6 *key)
{
struct in6_addrpolicy *match = NULL;
match = match_addrsel_policy(key);
if (match == NULL)
match = &defaultaddrpolicy;
else
match->use++;
return (match);
}
/*
* Subroutines to manage the address selection policy table via sysctl.
*/
struct sel_walkarg {
size_t w_total;
size_t w_given;
void * w_where;
void *w_limit;
};
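/*
 * The walk argument records how much buffer space the caller supplied
 * (w_given/w_where/w_limit) and how much a full dump would need
 * (w_total).  Following the usual sysctl convention, a read with
 * oldp == NULL only reports the required size; a subsequent read with
 * a buffer that turns out to be too small gets ENOMEM.
 */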
int sysctl_net_inet6_addrctlpolicy(SYSCTLFN_ARGS);
int
sysctl_net_inet6_addrctlpolicy(SYSCTLFN_ARGS)
{
int error = 0;
int s;
s = splsoftnet();
if (newp) {
error = EPERM;
goto end;
}
if (oldp && oldlenp == NULL) {
error = EINVAL;
goto end;
}
if (oldp || oldlenp) {
struct sel_walkarg w;
size_t oldlen = *oldlenp;
memset(&w, 0, sizeof(w));
w.w_given = oldlen;
w.w_where = oldp;
if (oldp)
w.w_limit = (char *)oldp + oldlen;
error = walk_addrsel_policy(dump_addrsel_policyent, &w);
*oldlenp = w.w_total;
if (oldp && w.w_total > oldlen && error == 0)
error = ENOMEM;
}
end:
splx(s);
return (error);
}
int
in6_src_ioctl(u_long cmd, void *data)
{
int i;
struct in6_addrpolicy ent0;
if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY)
return (EOPNOTSUPP); /* check for safety */
ent0 = *(struct in6_addrpolicy *)data;
if (ent0.label == ADDR_LABEL_NOTAPP)
return (EINVAL);
/* check if the prefix mask is consecutive. */
if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0)
return (EINVAL);
/* clear trailing garbage (if any) in the prefix address. */
for (i = 0; i < 4; i++) {
ent0.addr.sin6_addr.s6_addr32[i] &=
ent0.addrmask.sin6_addr.s6_addr32[i];
}
ent0.use = 0;
switch (cmd) {
case SIOCAADDRCTL_POLICY:
return (add_addrsel_policyent(&ent0));
case SIOCDADDRCTL_POLICY:
return (delete_addrsel_policyent(&ent0));
}
return (0); /* XXX: quiet compiler warnings */
}
/*
* The following is an implementation of the policy table using a
* simple tail queue.
* XXX such details should be hidden.
* XXX an implementation using a binary tree would be more efficient.
*/
struct addrsel_policyent {
TAILQ_ENTRY(addrsel_policyent) ape_entry;
struct in6_addrpolicy ape_policy;
};
TAILQ_HEAD(addrsel_policyhead, addrsel_policyent);
struct addrsel_policyhead addrsel_policytab;
static void
init_policy_queue(void)
{
TAILQ_INIT(&addrsel_policytab);
}
static int
add_addrsel_policyent(struct in6_addrpolicy *newpolicy)
{
struct addrsel_policyent *newpol, *pol;
/* duplication check */
TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) {
if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
return (EEXIST); /* or override it? */
}
}
newpol = malloc(sizeof(*newpol), M_IFADDR, M_WAITOK|M_ZERO);
/* XXX: should validate entry */
newpol->ape_policy = *newpolicy;
TAILQ_INSERT_TAIL(&addrsel_policytab, newpol, ape_entry);
return (0);
}
static int
delete_addrsel_policyent(struct in6_addrpolicy *key)
{
struct addrsel_policyent *pol;
/* search for the entry in the table */
for (pol = TAILQ_FIRST(&addrsel_policytab); pol;
pol = TAILQ_NEXT(pol, ape_entry)) {
if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
break;
}
}
if (pol == NULL) {
return (ESRCH);
}
TAILQ_REMOVE(&addrsel_policytab, pol, ape_entry);
return (0);
}
static int
walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w)
{
struct addrsel_policyent *pol;
int error = 0;
TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) {
if ((error = (*callback)(&pol->ape_policy, w)) != 0)
return error;
}
return error;
}
static int
dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg)
{
int error = 0;
struct sel_walkarg *w = arg;
if (w->w_where && (char *)w->w_where + sizeof(*pol) <= (char *)w->w_limit) {
if ((error = copyout(pol, w->w_where, sizeof(*pol))) != 0)
return error;
w->w_where = (char *)w->w_where + sizeof(*pol);
}
w->w_total += sizeof(*pol);
return error;
}
static struct in6_addrpolicy *
match_addrsel_policy(struct sockaddr_in6 *key)
{
struct addrsel_policyent *pent;
struct in6_addrpolicy *bestpol = NULL, *pol;
int matchlen, bestmatchlen = -1;
u_char *mp, *ep, *k, *p, m;
for (pent = TAILQ_FIRST(&addrsel_policytab); pent;
pent = TAILQ_NEXT(pent, ape_entry)) {
matchlen = 0;
pol = &pent->ape_policy;
mp = (u_char *)&pol->addrmask.sin6_addr;
ep = mp + 16; /* XXX: scope field? */
k = (u_char *)&key->sin6_addr;
p = (u_char *)&pol->addr.sin6_addr;
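/*
 * Compare the key against the policy prefix byte by byte under the
 * mask, counting the leading one bits of the mask as the match
 * length for the longest-prefix selection below.
 */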
for (; mp < ep && *mp; mp++, k++, p++) {
m = *mp;
if ((*k & m) != *p)
goto next; /* no match */
if (m == 0xff) /* short cut for a typical case */
matchlen += 8;
else {
while (m >= 0x80) {
matchlen++;
m <<= 1;
}
}
}
/* matched. check if this is better than the current best. */
if (bestpol == NULL ||
matchlen > bestmatchlen) {
bestpol = pol;
bestmatchlen = matchlen;
}
next:
continue;
}
return (bestpol);
}
/* $NetBSD: dtrace_bsd.h,v 1.9 2018/04/19 21:19:07 christos Exp $ */
/*-
* Copyright (c) 2007-2008 John Birrell (jb@freebsd.org)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: src/sys/sys/dtrace_bsd.h,v 1.3.2.1 2009/08/03 08:13:06 kensmith Exp $
*
* This file contains BSD shims for Sun's DTrace code.
*/
#ifndef _SYS_DTRACE_BSD_H
#define _SYS_DTRACE_BSD_H
#if defined(_KERNEL_OPT)
#include "opt_dtrace.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/proc.h>
/* Forward definitions: */
struct mbuf;
struct trapframe;
struct lwp;
struct vattr;
struct vnode;
struct ucred;
/*
* Cyclic clock function type definition used to hook the cyclic
* subsystem into the appropriate timer interrupt.
*/
typedef void (*cyclic_clock_func_t)(struct clockframe *);
extern cyclic_clock_func_t cyclic_clock_func[];
/*
* The dtrace module handles traps that occur during a DTrace probe.
* This type definition is used in the trap handler to provide a
* hook for the dtrace module to register its handler with.
*/
typedef int (*dtrace_trap_func_t)(struct trapframe *, u_int);
int dtrace_trap(struct trapframe *, u_int);
extern dtrace_trap_func_t dtrace_trap_func;
/* Used by the machine dependent trap() code. */
typedef int (*dtrace_invop_func_t)(uintptr_t, uintptr_t *, uintptr_t);
typedef void (*dtrace_doubletrap_func_t)(void);
/* Global variables in trap.c */
extern dtrace_invop_func_t dtrace_invop_func;
extern dtrace_doubletrap_func_t dtrace_doubletrap_func;
/* Virtual time hook function type. */
typedef void (*dtrace_vtime_switch_func_t)(struct lwp *);
extern int dtrace_vtime_active;
extern dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
/* The fasttrap module hooks into the fork, exec and exit routines. */
typedef void (*dtrace_fork_func_t)(struct proc *, struct proc *);
typedef void (*dtrace_execexit_func_t)(struct proc *);
/* Global variable in kern_fork.c */
extern dtrace_fork_func_t dtrace_fasttrap_fork;
/* Global variable in kern_exec.c */
extern dtrace_execexit_func_t dtrace_fasttrap_exec;
/* Global variable in kern_exit.c */
extern dtrace_execexit_func_t dtrace_fasttrap_exit;
/* The dtmalloc provider hooks into malloc. */
typedef void (*dtrace_malloc_probe_func_t)(u_int32_t, uintptr_t arg0,
uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4);
extern dtrace_malloc_probe_func_t dtrace_malloc_probe;
/* dtnfsclient NFSv3 access cache provider hooks. */
typedef void (*dtrace_nfsclient_accesscache_flush_probe_func_t)(uint32_t,
struct vnode *);
extern dtrace_nfsclient_accesscache_flush_probe_func_t
dtrace_nfsclient_accesscache_flush_done_probe;
typedef void (*dtrace_nfsclient_accesscache_get_probe_func_t)(uint32_t,
struct vnode *, uid_t, uint32_t);
extern dtrace_nfsclient_accesscache_get_probe_func_t
dtrace_nfsclient_accesscache_get_hit_probe,
dtrace_nfsclient_accesscache_get_miss_probe;
typedef void (*dtrace_nfsclient_accesscache_load_probe_func_t)(uint32_t,
struct vnode *, uid_t, uint32_t, int);
extern dtrace_nfsclient_accesscache_load_probe_func_t
dtrace_nfsclient_accesscache_load_done_probe;
/* dtnfsclient NFSv[23] attribute cache provider hooks. */
typedef void (*dtrace_nfsclient_attrcache_flush_probe_func_t)(uint32_t,
struct vnode *);
extern dtrace_nfsclient_attrcache_flush_probe_func_t
dtrace_nfsclient_attrcache_flush_done_probe;
typedef void (*dtrace_nfsclient_attrcache_get_hit_probe_func_t)(uint32_t,
struct vnode *, struct vattr *);
extern dtrace_nfsclient_attrcache_get_hit_probe_func_t
dtrace_nfsclient_attrcache_get_hit_probe;
typedef void (*dtrace_nfsclient_attrcache_get_miss_probe_func_t)(uint32_t,
struct vnode *);
extern dtrace_nfsclient_attrcache_get_miss_probe_func_t
dtrace_nfsclient_attrcache_get_miss_probe;
typedef void (*dtrace_nfsclient_attrcache_load_probe_func_t)(uint32_t,
struct vnode *, struct vattr *, int);
extern dtrace_nfsclient_attrcache_load_probe_func_t
dtrace_nfsclient_attrcache_load_done_probe;
/* dtnfsclient NFSv[23] RPC provider hooks. */
typedef void (*dtrace_nfsclient_nfs23_start_probe_func_t)(uint32_t,
struct vnode *, struct mbuf *, struct ucred *, int);
extern dtrace_nfsclient_nfs23_start_probe_func_t
dtrace_nfsclient_nfs23_start_probe;
typedef void (*dtrace_nfsclient_nfs23_done_probe_func_t)(uint32_t,
struct vnode *, struct mbuf *, struct ucred *, int, int);
extern dtrace_nfsclient_nfs23_done_probe_func_t
dtrace_nfsclient_nfs23_done_probe;
/*
* OpenSolaris compatible time functions returning nanoseconds.
* On OpenSolaris these return hrtime_t which we define as uint64_t.
*/
uint64_t dtrace_gethrtime(void);
uint64_t dtrace_gethrestime(void);
/* sizes based on DTrace structure requirements */
#define KDTRACE_PROC_SIZE 64
#define KDTRACE_PROC_ZERO 8
#define KDTRACE_THREAD_SIZE 256
#define KDTRACE_THREAD_ZERO 64
/*
* Functions for managing the opaque DTrace memory areas for
* processes and lwps.
*/
static __inline size_t kdtrace_proc_size(void);
static __inline void kdtrace_proc_ctor(void *, struct proc *);
static __inline void kdtrace_proc_dtor(void *, struct proc *);
static __inline size_t kdtrace_thread_size(void);
static __inline void kdtrace_thread_ctor(void *, struct lwp *);
static __inline void kdtrace_thread_dtor(void *, struct lwp *);
/* Return the DTrace process data size compiled in the kernel hooks. */
static __inline size_t
kdtrace_proc_size(void)
{
return KDTRACE_PROC_SIZE;
}
/* Return the DTrace thread data size compiled in the kernel hooks. */
static __inline size_t
kdtrace_thread_size(void)
{
return KDTRACE_THREAD_SIZE;
}
static __inline void
kdtrace_proc_ctor(void *arg, struct proc *p)
{
#ifdef KDTRACE_HOOKS
p->p_dtrace = kmem_zalloc(KDTRACE_PROC_SIZE, KM_SLEEP);
#endif
}
static __inline void
kdtrace_proc_dtor(void *arg, struct proc *p)
{
#ifdef KDTRACE_HOOKS
if (p->p_dtrace != NULL) {
kmem_free(p->p_dtrace, KDTRACE_PROC_SIZE);
p->p_dtrace = NULL;
}
#endif
}
static __inline void
kdtrace_thread_ctor(void *arg, struct lwp *l)
{
#ifdef KDTRACE_HOOKS
l->l_dtrace = kmem_zalloc(KDTRACE_THREAD_SIZE, KM_SLEEP);
#endif
}
static __inline void
kdtrace_thread_dtor(void *arg, struct lwp *l)
{
#ifdef KDTRACE_HOOKS
if (l->l_dtrace != NULL) {
kmem_free(l->l_dtrace, KDTRACE_THREAD_SIZE);
l->l_dtrace = NULL;
}
#endif
}
#endif /* _SYS_DTRACE_BSD_H */
/* $NetBSD: raw_cb.c,v 1.24 2017/09/25 01:56:22 ozaki-r Exp $ */
/*
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_cb.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: raw_cb.c,v 1.24 2017/09/25 01:56:22 ozaki-r Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/kmem.h>
#include <net/if.h>
#include <net/route.h>
#include <net/raw_cb.h>
#include <netinet/in.h>
/*
* Routines to manage the raw protocol control blocks.
*
* TODO:
* hash lookups by protocol family/protocol + address family
* take care of unique address problems per AF?
* redo address binding to allow wildcards
*/
static u_long raw_sendspace = RAWSNDQ;
static u_long raw_recvspace = RAWRCVQ;
/*
* Allocate a nominal amount of buffer space for the socket.
*/
int
raw_attach(struct socket *so, int proto, struct rawcbhead *rawcbhead)
{
struct rawcb *rp;
int error;
/*
* It is assumed that raw_attach() is called after space has been
* allocated for the rawcb; consumer protocols may simply allocate
* type struct rawcb, or a wrapper data structure that begins with a
* struct rawcb.
*/
rp = sotorawcb(so);
KASSERT(rp != NULL);
sosetlock(so);
if ((error = soreserve(so, raw_sendspace, raw_recvspace)) != 0) {
return error;
}
rp->rcb_socket = so;
rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family;
rp->rcb_proto.sp_protocol = proto;
LIST_INSERT_HEAD(rawcbhead, rp, rcb_list);
KASSERT(solocked(so));
return 0;
}
/*
* Detach the raw connection block and discard socket resources.
*/
void
raw_detach(struct socket *so)
{
struct rawcb *rp = sotorawcb(so);
const size_t rcb_len = rp->rcb_len;
KASSERT(rp != NULL);
KASSERT(solocked(so));
/* Remove the last reference. */
LIST_REMOVE(rp, rcb_list);
so->so_pcb = NULL;
/* Note: sofree() drops the socket's lock. */
sofree(so);
kmem_free(rp, rcb_len);
if (so->so_lock != softnet_lock) {
so->so_lock = softnet_lock;
mutex_obj_hold(softnet_lock);
}
mutex_enter(softnet_lock);
}
/*
* Disconnect and possibly release resources.
*/
void
raw_disconnect(struct rawcb *rp)
{
struct socket *so = rp->rcb_socket;
if (so->so_state & SS_NOFDREF) {
raw_detach(so);
}
}
/* $NetBSD: ipi.c,v 1.30 2019/12/01 15:34:46 ad Exp $ */
/*-
* Copyright (c) 2000, 2008, 2009, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by RedBack Networks Inc.
*
* Author: Bill Sommerfeld
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ipi.c,v 1.30 2019/12/01 15:34:46 ad Exp $");
#include "opt_mtrr.h"
#include <sys/param.h>
#include <sys/device.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/intr.h>
#include <sys/ipi.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#ifdef MULTIPROCESSOR
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/i82093var.h>
#include <machine/i82489reg.h>
#include <machine/i82489var.h>
#include <machine/mtrr.h>
#include <machine/gdt.h>
#include "acpica.h"
#include <x86/fpu.h>
static void x86_ipi_ast(struct cpu_info *);
static void x86_ipi_halt(struct cpu_info *);
static void x86_ipi_kpreempt(struct cpu_info *);
static void x86_ipi_xcall(struct cpu_info *);
static void x86_ipi_generic(struct cpu_info *);
#ifdef MTRR
static void x86_ipi_reload_mtrr(struct cpu_info *);
#else
#define x86_ipi_reload_mtrr NULL
#endif
#if NACPICA > 0
void acpi_cpu_sleep(struct cpu_info *);
#else
#define acpi_cpu_sleep NULL
#endif
static void x86_ipi_synch_fpu(struct cpu_info *);
void (* const ipifunc[X86_NIPI])(struct cpu_info *) =
{
x86_ipi_halt, /* X86_IPI_HALT */
x86_ipi_ast, /* X86_IPI_AST */
x86_ipi_generic, /* X86_IPI_GENERIC */
x86_ipi_synch_fpu, /* X86_IPI_SYNCH_FPU */
x86_ipi_reload_mtrr, /* X86_IPI_MTRR */
NULL, /* X86_IPI_GDT */
x86_ipi_xcall, /* X86_IPI_XCALL */
acpi_cpu_sleep, /* X86_IPI_ACPI_CPU_SLEEP */
x86_ipi_kpreempt /* X86_IPI_KPREEMPT */
};
/*
* x86 IPI interface.
*/
int
x86_send_ipi(struct cpu_info *ci, int ipimask)
{
uint32_t o, n;
int ret = 0;
/* Don't send IPI to CPU which isn't (yet) running. */
if (__predict_false((ci->ci_flags & CPUF_RUNNING) == 0))
return ENOENT;
/* Set in new IPI bit, and capture previous state. */
for (o = 0;; o = n) {
n = atomic_cas_32(&ci->ci_ipis, o, o | ipimask);
if (__predict_true(o == n)) {
break;
}
}
/* If no IPI already pending, send one. */
if (o == 0) {
ret = x86_ipi(LAPIC_IPI_VECTOR, ci->ci_cpuid, LAPIC_DLMODE_FIXED);
if (ret != 0) {
printf("ipi of %x from %s to %s failed\n",
ipimask,
device_xname(curcpu()->ci_dev),
device_xname(ci->ci_dev));
}
}
return ret;
}
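/*
 * Post the IPI bits on every running CPU except the caller, then issue
 * a single broadcast IPI if at least one target was found.
 */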
void
x86_broadcast_ipi(int ipimask)
{
struct cpu_info *ci, *self = curcpu();
int count = 0;
CPU_INFO_ITERATOR cii;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci == self)
continue;
if ((ci->ci_flags & CPUF_RUNNING) == 0)
continue;
atomic_or_32(&ci->ci_ipis, ipimask);
count++;
}
if (!count)
return;
x86_ipi(LAPIC_IPI_VECTOR, LAPIC_DEST_ALLEXCL, LAPIC_DLMODE_FIXED);
}
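/*
 * IPI dispatch: atomically fetch and clear the pending bits for this
 * CPU, then run the handler for each bit that was set.
 */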
void
x86_ipi_handler(void)
{
struct cpu_info *ci = curcpu();
uint32_t pending;
int bit;
pending = atomic_swap_32(&ci->ci_ipis, 0);
KDASSERT((pending >> X86_NIPI) == 0);
while ((bit = ffs(pending)) != 0) {
bit--;
pending &= ~(1 << bit);
ci->ci_ipi_events[bit].ev_count++;
(*ipifunc[bit])(ci);
}
}
/*
* Common x86 IPI handlers.
*/
static void
x86_ipi_halt(struct cpu_info *ci)
{
x86_disable_intr();
atomic_and_32(&ci->ci_flags, ~CPUF_RUNNING);
for (;;) {
x86_hlt();
}
}
static void
x86_ipi_synch_fpu(struct cpu_info *ci)
{
panic("%s: impossible", __func__);
}
#ifdef MTRR
static void
x86_ipi_reload_mtrr(struct cpu_info *ci)
{
if (mtrr_funcs != NULL) {
/*
* mtrr_reload_cpu() is a macro in mtrr.h which picks
* the appropriate function to use.
*/
mtrr_reload_cpu(ci);
}
}
#endif
static void
x86_ipi_kpreempt(struct cpu_info *ci)
{
softint_trigger(1 << SIR_PREEMPT);
}
static void
x86_ipi_ast(struct cpu_info *ci)
{
aston(ci->ci_onproc);
}
/*
* MD support for xcall(9) interface.
*/
static void
x86_ipi_xcall(struct cpu_info *ci)
{
xc_ipi_handler();
}
static void
x86_ipi_generic(struct cpu_info *ci)
{
ipi_cpu_handler();
}
void
xc_send_ipi(struct cpu_info *ci)
{
KASSERT(kpreempt_disabled());
KASSERT(curcpu() != ci);
if (ci) {
/* Unicast: remote CPU. */
x86_send_ipi(ci, X86_IPI_XCALL);
} else {
/* Broadcast: all, but local CPU (caller will handle it). */
x86_broadcast_ipi(X86_IPI_XCALL);
}
}
void
cpu_ipi(struct cpu_info *ci)
{
KASSERT(kpreempt_disabled());
KASSERT(curcpu() != ci);
if (ci) {
/* Unicast: remote CPU. */
x86_send_ipi(ci, X86_IPI_GENERIC);
} else {
/* Broadcast: all, but local CPU (caller will handle it). */
x86_broadcast_ipi(X86_IPI_GENERIC);
}
}
#else
int
x86_send_ipi(struct cpu_info *ci, int ipimask)
{
return 0;
}
void
x86_broadcast_ipi(int ipimask)
{
}
void
cpu_ipi(struct cpu_info *ci)
{
}
#endif
/* $NetBSD: subr_percpu.c,v 1.25 2020/05/11 21:37:31 riastradh Exp $ */
/*-
* Copyright (c)2007,2008 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* per-cpu storage.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.25 2020/05/11 21:37:31 riastradh Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/rwlock.h>
#include <sys/vmem.h>
#include <sys/xcall.h>
#define PERCPU_QUANTUM_SIZE (ALIGNBYTES + 1)
#define PERCPU_QCACHE_MAX 0
#define PERCPU_IMPORT_SIZE 2048
struct percpu {
unsigned pc_offset;
size_t pc_size;
percpu_callback_t pc_ctor;
percpu_callback_t pc_dtor;
void *pc_cookie;
LIST_ENTRY(percpu) pc_list;
};
static krwlock_t percpu_swap_lock __cacheline_aligned;
static vmem_t * percpu_offset_arena __read_mostly;
static struct {
kmutex_t lock;
unsigned int nextoff;
LIST_HEAD(, percpu) ctor_list;
struct lwp *busy;
kcondvar_t cv;
} percpu_allocation __cacheline_aligned;
static percpu_cpu_t *
cpu_percpu(struct cpu_info *ci)
{
return &ci->ci_data.cpu_percpu;
}
static unsigned int
percpu_offset(percpu_t *pc)
{
const unsigned int off = pc->pc_offset;
KASSERT(off < percpu_allocation.nextoff);
return off;
}
/*
* percpu_cpu_swap: crosscall handler for percpu_cpu_enlarge
*/
__noubsan
static void
percpu_cpu_swap(void *p1, void *p2)
{
struct cpu_info * const ci = p1;
percpu_cpu_t * const newpcc = p2;
percpu_cpu_t * const pcc = cpu_percpu(ci);
KASSERT(ci == curcpu() || !mp_online);
/*
* swap *pcc and *newpcc unless someone has beaten us to it.
*/
rw_enter(&percpu_swap_lock, RW_WRITER);
if (newpcc->pcc_size > pcc->pcc_size) {
percpu_cpu_t tmp;
int s;
tmp = *pcc;
/*
* block interrupts so that we don't lose their modifications.
*/
s = splhigh();
/*
* copy data to new storage.
*/
memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size);
/*
* this assignment needs to be atomic for percpu_getptr_remote.
*/
pcc->pcc_data = newpcc->pcc_data;
splx(s);
pcc->pcc_size = newpcc->pcc_size;
*newpcc = tmp;
}
rw_exit(&percpu_swap_lock);
}
/*
* percpu_cpu_enlarge: ensure that the percpu_cpu_t of each cpu has enough space
*/
static void
percpu_cpu_enlarge(size_t size)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
percpu_cpu_t pcc;
pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */
pcc.pcc_size = size;
if (!mp_online) {
percpu_cpu_swap(ci, &pcc);
} else {
uint64_t where;
where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci);
xc_wait(where);
}
KASSERT(pcc.pcc_size <= size);
if (pcc.pcc_data != NULL) {
kmem_free(pcc.pcc_data, pcc.pcc_size);
}
}
}
/*
* percpu_backend_alloc: vmem import callback for percpu_offset_arena
*/
static int
percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize,
vm_flag_t vmflags, vmem_addr_t *addrp)
{
unsigned int offset;
unsigned int nextoff;
ASSERT_SLEEPABLE();
KASSERT(dummy == NULL);
if ((vmflags & VM_NOSLEEP) != 0)
return ENOMEM;
size = roundup(size, PERCPU_IMPORT_SIZE);
mutex_enter(&percpu_allocation.lock);
offset = percpu_allocation.nextoff;
percpu_allocation.nextoff = nextoff = percpu_allocation.nextoff + size;
mutex_exit(&percpu_allocation.lock);
percpu_cpu_enlarge(nextoff);
*resultsize = size;
*addrp = (vmem_addr_t)offset;
return 0;
}
static void
percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci)
{
size_t sz = (uintptr_t)vp2;
memset(vp, 0, sz);
}
/*
* percpu_zero: initialize percpu storage with zero.
*/
static void
percpu_zero(percpu_t *pc, size_t sz)
{
percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz);
}
/*
* percpu_init: subsystem initialization
*/
void
percpu_init(void)
{
ASSERT_SLEEPABLE();
rw_init(&percpu_swap_lock);
mutex_init(&percpu_allocation.lock, MUTEX_DEFAULT, IPL_NONE);
percpu_allocation.nextoff = PERCPU_QUANTUM_SIZE;
LIST_INIT(&percpu_allocation.ctor_list);
percpu_allocation.busy = NULL;
cv_init(&percpu_allocation.cv, "percpu");
percpu_offset_arena = vmem_xcreate("percpu", 0, 0, PERCPU_QUANTUM_SIZE,
percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP,
IPL_NONE);
}
/*
* percpu_init_cpu: cpu initialization
*
* => should be called before the cpu appears on the list for CPU_INFO_FOREACH.
* => may be called for static CPUs afterward (typically just primary CPU)
*/
void
percpu_init_cpu(struct cpu_info *ci)
{
percpu_cpu_t * const pcc = cpu_percpu(ci);
struct percpu *pc;
size_t size = percpu_allocation.nextoff; /* XXX racy */
ASSERT_SLEEPABLE();
/*
* For the primary CPU, prior percpu_create may have already
* triggered allocation, so there's nothing more for us to do
* here.
*/
if (pcc->pcc_size)
return;
KASSERT(pcc->pcc_data == NULL);
/*
* Otherwise, allocate storage and, while the constructor list
* is locked, run constructors for all percpus on this CPU.
*/
pcc->pcc_size = size;
if (size) {
pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP);
mutex_enter(&percpu_allocation.lock);
while (percpu_allocation.busy)
cv_wait(&percpu_allocation.cv,
&percpu_allocation.lock);
percpu_allocation.busy = curlwp;
LIST_FOREACH(pc, &percpu_allocation.ctor_list, pc_list) {
KASSERT(pc->pc_ctor);
mutex_exit(&percpu_allocation.lock);
(*pc->pc_ctor)((char *)pcc->pcc_data + pc->pc_offset,
pc->pc_cookie, ci);
mutex_enter(&percpu_allocation.lock);
}
KASSERT(percpu_allocation.busy == curlwp);
percpu_allocation.busy = NULL;
cv_broadcast(&percpu_allocation.cv);
mutex_exit(&percpu_allocation.lock);
}
}
/*
* percpu_alloc: allocate percpu storage
*
* => called in thread context.
* => considered as an expensive and rare operation.
* => allocated storage is initialized with zeros.
*/
percpu_t *
percpu_alloc(size_t size)
{
return percpu_create(size, NULL, NULL, NULL);
}
/*
* percpu_create: allocate percpu storage and associate ctor/dtor with it
*
* => called in thread context.
* => considered as an expensive and rare operation.
* => allocated storage is initialized by ctor, or zeros if ctor is null
* => percpu_free will call dtor first, if dtor is nonnull
* => ctor or dtor may sleep, even on allocation
*/
percpu_t *
percpu_create(size_t size, percpu_callback_t ctor, percpu_callback_t dtor,
void *cookie)
{
vmem_addr_t offset;
percpu_t *pc;
ASSERT_SLEEPABLE();
(void)vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT,
&offset);
pc = kmem_alloc(sizeof(*pc), KM_SLEEP);
pc->pc_offset = offset;
pc->pc_size = size;
pc->pc_ctor = ctor;
pc->pc_dtor = dtor;
pc->pc_cookie = cookie;
if (ctor) {
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
void *buf;
/*
* Wait until nobody is using the list of percpus with
* constructors.
*/
mutex_enter(&percpu_allocation.lock);
while (percpu_allocation.busy)
cv_wait(&percpu_allocation.cv,
&percpu_allocation.lock);
percpu_allocation.busy = curlwp;
mutex_exit(&percpu_allocation.lock);
/*
* Run the constructor for all CPUs. We use a
* temporary buffer so that we need not hold the
* percpu_swap_lock while running the constructor.
*/
buf = kmem_alloc(size, KM_SLEEP);
for (CPU_INFO_FOREACH(cii, ci)) {
memset(buf, 0, size);
(*ctor)(buf, cookie, ci);
percpu_traverse_enter();
memcpy(percpu_getptr_remote(pc, ci), buf, size);
percpu_traverse_exit();
}
explicit_memset(buf, 0, size);
kmem_free(buf, size);
/*
* Insert the percpu into the list of percpus with
* constructors. We are now done using the list, so it
* is safe for concurrent percpu_create or concurrent
* percpu_init_cpu to run.
*/
mutex_enter(&percpu_allocation.lock);
KASSERT(percpu_allocation.busy == curlwp);
percpu_allocation.busy = NULL;
cv_broadcast(&percpu_allocation.cv);
LIST_INSERT_HEAD(&percpu_allocation.ctor_list, pc, pc_list);
mutex_exit(&percpu_allocation.lock);
} else {
percpu_zero(pc, size);
}
return pc;
}
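/*
 * Illustrative usage sketch of percpu_create and friends (the counter
 * type and callback name below are examples only, not part of this
 * file):
 *
 * static void
 * counter_init(void *p, void *cookie, struct cpu_info *ci)
 * {
 * *(uint64_t *)p = 0;
 * }
 *
 * percpu_t *pc = percpu_create(sizeof(uint64_t), counter_init,
 * NULL, NULL);
 * uint64_t *cnt = percpu_getref(pc);
 * (*cnt)++;
 * percpu_putref(pc);
 * ...
 * percpu_free(pc, sizeof(uint64_t));
 */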
/*
* percpu_free: free percpu storage
*
* => called in thread context.
* => considered as an expensive and rare operation.
*/
void
percpu_free(percpu_t *pc, size_t size)
{
ASSERT_SLEEPABLE();
KASSERT(size == pc->pc_size);
/*
* If there's a constructor, take the percpu off the list of
* percpus with constructors, but first wait until nobody is
* using the list.
*/
if (pc->pc_ctor) {
mutex_enter(&percpu_allocation.lock);
while (percpu_allocation.busy)
cv_wait(&percpu_allocation.cv,
&percpu_allocation.lock);
LIST_REMOVE(pc, pc_list);
mutex_exit(&percpu_allocation.lock);
}
/* If there's a destructor, run it now for all CPUs. */
if (pc->pc_dtor) {
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
void *buf;
buf = kmem_alloc(size, KM_SLEEP);
for (CPU_INFO_FOREACH(cii, ci)) {
percpu_traverse_enter();
memcpy(buf, percpu_getptr_remote(pc, ci), size);
explicit_memset(percpu_getptr_remote(pc, ci), 0, size);
percpu_traverse_exit();
(*pc->pc_dtor)(buf, pc->pc_cookie, ci);
}
explicit_memset(buf, 0, size);
kmem_free(buf, size);
}
vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size);
kmem_free(pc, sizeof(*pc));
}
/*
* percpu_getref:
*
* => safe to be used in either thread or interrupt context
* => disables preemption; must be bracketed with a percpu_putref()
*/
void *
percpu_getref(percpu_t *pc)
{
kpreempt_disable();
return percpu_getptr_remote(pc, curcpu());
}
/*
* percpu_putref:
*
* => drops the preemption-disabled count after caller is done with per-cpu
* data
*/
void
percpu_putref(percpu_t *pc)
{
kpreempt_enable();
}
/*
* percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote:
* helpers to access remote cpu's percpu data.
*
* => called in thread context.
* => percpu_traverse_enter can block low-priority xcalls.
* => typical usage would be:
*
* sum = 0;
* percpu_traverse_enter();
* for (CPU_INFO_FOREACH(cii, ci)) {
* unsigned int *p = percpu_getptr_remote(pc, ci);
* sum += *p;
* }
* percpu_traverse_exit();
*/
void
percpu_traverse_enter(void)
{
ASSERT_SLEEPABLE();
rw_enter(&percpu_swap_lock, RW_READER);
}
void
percpu_traverse_exit(void)
{
rw_exit(&percpu_swap_lock);
}
void *
percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci)
{
return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)];
}
/*
* percpu_foreach: call the specified callback function for each cpus.
*
* => must be called from thread context.
* => callback executes on **current** CPU (or, really, arbitrary CPU,
* in case of preemption)
* => caller should not rely on the cpu iteration order.
* => the callback function should be minimal because it is executed while
* holding a global lock, which can block low-priority xcalls.
* e.g. it's illegal for a callback function to sleep for memory allocation.
*/
void
percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
percpu_traverse_enter();
for (CPU_INFO_FOREACH(cii, ci)) {
(*cb)(percpu_getptr_remote(pc, ci), arg, ci);
}
percpu_traverse_exit();
}
struct percpu_xcall_ctx {
percpu_callback_t ctx_cb;
void *ctx_arg;
};
static void
percpu_xcfunc(void * const v1, void * const v2)
{
percpu_t * const pc = v1;
struct percpu_xcall_ctx * const ctx = v2;
(*ctx->ctx_cb)(percpu_getref(pc), ctx->ctx_arg, curcpu());
percpu_putref(pc);
}
/*
* percpu_foreach_xcall: call the specified callback function for each
* cpu. This version uses an xcall to run the callback on each cpu.
*
* => must be called from thread context.
* => callback executes on **remote** CPU in soft-interrupt context
* (at the specified soft interrupt priority).
* => caller should not rely on the cpu iteration order.
* => the callback function should be minimal because it may be
* executed in soft-interrupt context. e.g. it's illegal for
* a callback function to sleep for memory allocation.
*/
void
percpu_foreach_xcall(percpu_t *pc, u_int xcflags, percpu_callback_t cb,
void *arg)
{
struct percpu_xcall_ctx ctx = {
.ctx_cb = cb,
.ctx_arg = arg,
};
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
xc_wait(xc_unicast(xcflags, percpu_xcfunc, pc, &ctx, ci));
}
}
/* $NetBSD: subr_time.c,v 1.38 2023/07/08 20:02:10 riastradh Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
* @(#)kern_time.c 8.4 (Berkeley) 5/26/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_time.c,v 1.38 2023/07/08 20:02:10 riastradh Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/lwp.h>
#include <sys/timex.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/intr.h>
/*
* Compute number of hz until specified time. Used to compute second
* argument to callout_reset() from an absolute time.
*/
int
tvhzto(const struct timeval *tvp)
{
struct timeval now, tv;
tv = *tvp; /* Don't modify original tvp. */
getmicrotime(&now);
timersub(&tv, &now, &tv);
return tvtohz(&tv);
}
/*
* Compute number of ticks in the specified amount of time.
*/
int
tvtohz(const struct timeval *tv)
{
unsigned long ticks;
long sec, usec;
/*
* If the number of usecs in the whole seconds part of the time
* difference fits in a long, then the total number of usecs will
* fit in an unsigned long. Compute the total and convert it to
* ticks, rounding up and adding 1 to allow for the current tick
* to expire. Rounding also depends on unsigned long arithmetic
* to avoid overflow.
*
* Otherwise, if the number of ticks in the whole seconds part of
* the time difference fits in a long, then convert the parts to
* ticks separately and add, using similar rounding methods and
* overflow avoidance. This method would work in the previous
* case, but it is slightly slower and assumes that hz is integral.
*
* Otherwise, round the time difference down to the maximum
* representable value.
*
* If ints are 32-bit, then the maximum value for any timeout in
* 10ms ticks is 248 days.
*/
sec = tv->tv_sec;
usec = tv->tv_usec;
KASSERT(usec >= 0);
KASSERT(usec < 1000000);
/* catch overflows in conversion time_t->int */
if (tv->tv_sec > INT_MAX)
return INT_MAX;
if (tv->tv_sec < 0)
return 0;
if (sec < 0 || (sec == 0 && usec == 0)) {
/*
* Would expire now or in the past. Return 0 ticks.
* This is different from the legacy tvhzto() interface,
* and callers need to check for it.
*/
ticks = 0;
} else if (sec <= (LONG_MAX / 1000000))
ticks = (((sec * 1000000) + (unsigned long)usec + (tick - 1))
/ tick) + 1;
else if (sec <= (LONG_MAX / hz))
ticks = (sec * hz) +
(((unsigned long)usec + (tick - 1)) / tick) + 1;
else
ticks = LONG_MAX;
if (ticks > INT_MAX)
ticks = INT_MAX;
return ((int)ticks);
}
int
tshzto(const struct timespec *tsp)
{
struct timespec now, ts;
ts = *tsp; /* Don't modify original tsp. */
getnanotime(&now);
timespecsub(&ts, &now, &ts);
return tstohz(&ts);
}
int
tshztoup(const struct timespec *tsp)
{
struct timespec now, ts;
ts = *tsp; /* Don't modify original tsp. */
getnanouptime(&now);
timespecsub(&ts, &now, &ts);
return tstohz(&ts);
}
/*
* Compute number of ticks in the specified amount of time.
*/
int
tstohz(const struct timespec *ts)
{
struct timeval tv;
/*
* usec has great enough resolution for hz, so convert to a
* timeval and use tvtohz() above.
*/
TIMESPEC_TO_TIMEVAL(&tv, ts);
return tvtohz(&tv);
}
/*
* Check that a proposed value to load into the .it_value or
* .it_interval part of an interval timer is acceptable, and
* fix it to have at least minimal value (i.e. if it is less
* than the resolution of the clock, round it up.). We don't
* timeout the 0,0 value because this means to disable the
* timer or the interval.
*/
int
itimerfix(struct timeval *tv)
{
if (tv->tv_usec < 0 || tv->tv_usec >= 1000000)
return EINVAL;
if (tv->tv_sec < 0)
return ETIMEDOUT;
if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick)
tv->tv_usec = tick;
return 0;
}
int
itimespecfix(struct timespec *ts)
{
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
return EINVAL;
if (ts->tv_sec < 0)
return ETIMEDOUT;
if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000)
ts->tv_nsec = tick * 1000;
return 0;
}
int
inittimeleft(struct timespec *ts, struct timespec *sleepts)
{
if (itimespecfix(ts)) {
return -1;
}
KASSERT(ts->tv_sec >= 0);
getnanouptime(sleepts);
return 0;
}
int
gettimeleft(struct timespec *ts, struct timespec *sleepts)
{
struct timespec now, sleptts;
KASSERT(ts->tv_sec >= 0);
/*
* Reduce ts by elapsed time based on monotonic time scale.
*/
getnanouptime(&now);
KASSERT(timespeccmp(sleepts, &now, <=));
timespecsub(&now, sleepts, &sleptts);
*sleepts = now;
if (timespeccmp(ts, &sleptts, <=)) { /* timed out */
timespecclear(ts);
return 0;
}
timespecsub(ts, &sleptts, ts);
return tstohz(ts);
}
void
clock_timeleft(clockid_t clockid, struct timespec *ts, struct timespec *sleepts)
{
struct timespec sleptts;
clock_gettime1(clockid, &sleptts);
timespecadd(ts, sleepts, ts);
timespecsub(ts, &sleptts, ts);
*sleepts = sleptts;
}
int
clock_gettime1(clockid_t clock_id, struct timespec *ts)
{
int error;
struct proc *p;
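/*
 * Per-process and per-thread CPU-time clock ids carry the target pid
 * or lwp id in the bits below the type flags; CPUCLOCK_ID_MASK strips
 * the flags to recover it (0 means the caller itself).
 */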
#define CPUCLOCK_ID_MASK (~(CLOCK_THREAD_CPUTIME_ID|CLOCK_PROCESS_CPUTIME_ID))
if (clock_id & CLOCK_PROCESS_CPUTIME_ID) {
pid_t pid = clock_id & CPUCLOCK_ID_MASK;
struct timeval cputime;
mutex_enter(&proc_lock);
p = pid == 0 ? curproc : proc_find(pid);
if (p == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
mutex_enter(p->p_lock);
calcru(p, /*usertime*/NULL, /*systime*/NULL, /*intrtime*/NULL,
&cputime);
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
// XXX: Perhaps create a special kauth type
error = kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_PTRACE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
if (error)
return error;
TIMEVAL_TO_TIMESPEC(&cputime, ts);
return 0;
} else if (clock_id & CLOCK_THREAD_CPUTIME_ID) {
struct lwp *l;
lwpid_t lid = clock_id & CPUCLOCK_ID_MASK;
struct bintime tm = {0, 0};
p = curproc;
mutex_enter(p->p_lock);
l = lid == 0 ? curlwp : lwp_find(p, lid);
if (l == NULL) {
mutex_exit(p->p_lock);
return ESRCH;
}
addrulwp(l, &tm);
mutex_exit(p->p_lock);
bintime2timespec(&tm, ts);
return 0;
}
switch (clock_id) {
case CLOCK_REALTIME:
nanotime(ts);
break;
case CLOCK_MONOTONIC:
nanouptime(ts);
break;
default:
return EINVAL;
}
return 0;
}
/*
* Calculate delta and convert from struct timespec to the ticks.
*/
int
ts2timo(clockid_t clock_id, int flags, struct timespec *ts,
int *timo, struct timespec *start)
{
int error;
struct timespec tsd;
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000L)
return EINVAL;
if ((flags & TIMER_ABSTIME) != 0 || start != NULL) {
error = clock_gettime1(clock_id, &tsd);
if (error != 0)
return error;
if (start != NULL)
*start = tsd;
}
if ((flags & TIMER_ABSTIME) != 0) {
if (!timespecsubok(ts, &tsd))
return EINVAL;
timespecsub(ts, &tsd, ts);
}
error = itimespecfix(ts);
if (error != 0)
return error;
if (ts->tv_sec == 0 && ts->tv_nsec == 0)
return ETIMEDOUT;
*timo = tstohz(ts);
KASSERT(*timo > 0);
return 0;
}
bool
timespecaddok(const struct timespec *tsp, const struct timespec *usp)
{
enum { TIME_MIN = __type_min(time_t), TIME_MAX = __type_max(time_t) };
time_t a = tsp->tv_sec;
time_t b = usp->tv_sec;
bool carry;
/*
* Caller is responsible for guaranteeing valid timespec
* inputs. Any user-controlled inputs must be validated or
* adjusted.
*/
KASSERT(tsp->tv_nsec >= 0);
KASSERT(usp->tv_nsec >= 0);
KASSERT(tsp->tv_nsec < 1000000000L);
KASSERT(usp->tv_nsec < 1000000000L);
CTASSERT(1000000000L <= __type_max(long) - 1000000000L);
/*
* Fail if a + b + carry overflows TIME_MAX, or if a + b
* overflows TIME_MIN because timespecadd adds the carry after
* computing a + b.
*
* Break it into two mutually exclusive and exhaustive cases:
* I. a >= 0
* II. a < 0
*/
carry = (tsp->tv_nsec + usp->tv_nsec >= 1000000000L);
if (a >= 0) {
/*
* Case I: a >= 0. If b < 0, then b + 1 <= 0, so
*
* a + b + 1 <= a + 0 <= TIME_MAX,
*
* and
*
* a + b >= 0 + b = b >= TIME_MIN,
*
* so this can't overflow.
*
* If b >= 0, then a + b + carry >= a + b >= 0, so
* negative results and thus results below TIME_MIN are
* impossible; we need only avoid
*
* a + b + carry > TIME_MAX,
*
* which we will do by rejecting if
*
* b > TIME_MAX - a - carry,
*
* which in turn is incidentally always false if b < 0
* so we don't need extra logic to discriminate on the
* b >= 0 and b < 0 cases.
*
* Since 0 <= a <= TIME_MAX, we know
*
* 0 <= TIME_MAX - a <= TIME_MAX,
*
* and hence
*
* -1 <= TIME_MAX - a - 1 < TIME_MAX.
*
* So we can compute TIME_MAX - a - carry (i.e., either
* TIME_MAX - a or TIME_MAX - a - 1) safely without
* overflow.
*/
if (b > TIME_MAX - a - carry)
return false;
} else {
/*
* Case II: a < 0. If b >= 0, then since a + 1 <= 0,
* we have
*
* a + b + 1 <= b <= TIME_MAX,
*
* and
*
* a + b >= a >= TIME_MIN,
*
* so this can't overflow.
*
* If b < 0, then the intermediate a + b is negative
* and the outcome a + b + 1 is nonpositive, so we need
* only avoid
*
* a + b < TIME_MIN,
*
* which we will do by rejecting if
*
* a < TIME_MIN - b.
*
* (Reminder: The carry is added afterward in
* timespecadd, so to avoid overflow it is not enough
* to merely reject a + b + carry < TIME_MIN.)
*
* It is safe to compute the difference TIME_MIN - b
* because b is negative, so the result lies in
* (TIME_MIN, 0].
*/
if (b < 0 && a < TIME_MIN - b)
return false;
}
return true;
}
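/*
 * Return true if tsp - usp can be computed by timespecsub() without
 * overflowing time_t, false otherwise.
 */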
bool
timespecsubok(const struct timespec *tsp, const struct timespec *usp)
{
enum { TIME_MIN = __type_min(time_t), TIME_MAX = __type_max(time_t) };
time_t a = tsp->tv_sec, b = usp->tv_sec;
bool borrow;
/*
* Caller is responsible for guaranteeing valid timespec
* inputs. Any user-controlled inputs must be validated or
* adjusted.
*/
	KASSERT(tsp->tv_nsec >= 0);
	KASSERT(usp->tv_nsec >= 0);
	KASSERT(tsp->tv_nsec < 1000000000L);
	KASSERT(usp->tv_nsec < 1000000000L);
CTASSERT(1000000000L <= __type_max(long) - 1000000000L);
/*
* Fail if a - b - borrow overflows TIME_MIN, or if a - b
* overflows TIME_MAX because timespecsub subtracts the borrow
* after computing a - b.
*
* Break it into two mutually exclusive and exhaustive cases:
* I. a < 0
* II. a >= 0
*/
borrow = (tsp->tv_nsec - usp->tv_nsec < 0);
if (a < 0) {
/*
* Case I: a < 0. If b < 0, then -b - 1 >= 0, so
*
* a - b - 1 >= a + 0 >= TIME_MIN,
*
* and, since a <= -1, provided that TIME_MIN <=
* -TIME_MAX - 1 so that TIME_MAX <= -TIME_MIN - 1 (in
* fact, equality holds, under the assumption of
* two's-complement arithmetic),
*
* a - b <= -1 - b = -b - 1 <= TIME_MAX,
*
* so this can't overflow.
*/
CTASSERT(TIME_MIN <= -TIME_MAX - 1);
/*
* If b >= 0, then a - b - borrow <= a - b < 0, so
* positive results and thus results above TIME_MAX are
* impossible; we need only avoid
*
* a - b - borrow < TIME_MIN,
*
* which we will do by rejecting if
*
* a < TIME_MIN + b + borrow.
*
* The right-hand side is safe to evaluate for any
* values of b and borrow as long as TIME_MIN +
* TIME_MAX + 1 <= TIME_MAX, i.e., TIME_MIN <= -1.
* (Note: If time_t were unsigned, this would fail!)
*
* Note: Unlike Case I in timespecaddok, this criterion
* does not work for b < 0, nor can the roles of a and
* b in the inequality be reversed (e.g., -b < TIME_MIN
* - a + borrow) without extra cases like checking for
* b = TIME_MIN.
*/
CTASSERT(TIME_MIN < -1);
if (b >= 0 && a < TIME_MIN + b + borrow)
return false;
} else {
/*
* Case II: a >= 0. If b >= 0, then
*
* a - b <= a <= TIME_MAX,
*
* and, provided TIME_MIN <= -TIME_MAX - 1 (in fact,
* equality holds, under the assumption of
* two's-complement arithmetic)
*
* a - b - 1 >= -b - 1 >= -TIME_MAX - 1 >= TIME_MIN,
*
* so this can't overflow.
*/
CTASSERT(TIME_MIN <= -TIME_MAX - 1);
/*
* If b < 0, then a - b >= a >= 0, so negative results
* and thus results below TIME_MIN are impossible; we
* need only avoid
*
* a - b > TIME_MAX,
*
* which we will do by rejecting if
*
* a > TIME_MAX + b.
*
* (Reminder: The borrow is subtracted afterward in
* timespecsub, so to avoid overflow it is not enough
* to merely reject a - b - borrow > TIME_MAX.)
*
* It is safe to compute the sum TIME_MAX + b because b
* is negative, so the result lies in [0, TIME_MAX).
*/
if (b < 0 && a > TIME_MAX + b)
return false;
}
return true;
}
/* $NetBSD: ufs_extattr.c,v 1.55 2024/02/10 18:43:53 andvar Exp $ */
/*-
* Copyright (c) 1999-2002 Robert N. M. Watson
* Copyright (c) 2002-2003 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed by Robert Watson for the TrustedBSD Project.
*
* This software was developed for the FreeBSD Project in part by Network
* Associates Laboratories, the Security Research Division of Network
* Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
* as part of the DARPA CHATS research program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
/*
* Support for file system extended attributes on the UFS1 file system.
*
* Extended attributes are defined in the form name=value, where name is
* a nul-terminated string in the style of a file name, and value is a
* binary blob of zero or more bytes. The UFS1 extended attribute service
* layers support for extended attributes onto a backing file, in the style
* of the quota implementation, meaning that it requires no underlying format
* changes to the file system. This design choice trades performance for
* simplicity, usability, and easy deployment.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_extattr.c,v 1.55 2024/02/10 18:43:53 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_ffs.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/kmem.h>
#include <sys/fcntl.h>
#include <sys/lwp.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/lock.h>
#include <sys/dirent.h>
#include <sys/extattr.h>
#include <sys/sysctl.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_extern.h>
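/*
 * ufs_extattr_sync: perform attribute writes synchronously (IO_SYNC).
 * ufs_extattr_autocreate: per-attribute maximum data size, in bytes,
 * used when a backing file is autocreated on first set.
 */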
int ufs_extattr_sync = 1;
int ufs_extattr_autocreate = 1024;
static int ufs_extattr_valid_attrname(int attrnamespace,
const char *attrname);
static int ufs_extattr_enable_with_open(struct ufsmount *ump,
struct vnode *vp, int attrnamespace, const char *attrname,
struct lwp *l);
static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
const char *attrname, struct vnode *backing_vnode,
struct lwp *l);
static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
const char *attrname, struct lwp *l);
static int ufs_extattr_get(struct vnode *vp, int attrnamespace,
const char *name, struct uio *uio, size_t *size,
kauth_cred_t cred, struct lwp *l);
static int ufs_extattr_list(struct vnode *vp, int attrnamespace,
struct uio *uio, size_t *size, int flag,
kauth_cred_t cred, struct lwp *l);
static int ufs_extattr_set(struct vnode *vp, int attrnamespace,
const char *name, struct uio *uio, kauth_cred_t cred,
struct lwp *l);
static int ufs_extattr_rm(struct vnode *vp, int attrnamespace,
const char *name, kauth_cred_t cred, struct lwp *l);
static struct ufs_extattr_list_entry *ufs_extattr_find_attr(struct ufsmount *,
int, const char *);
static int ufs_extattr_get_header(struct vnode *,
struct ufs_extattr_list_entry *,
struct ufs_extattr_header *, off_t *);
/*
* Per-FS attribute lock protecting attribute operations.
* XXX Right now there is a lot of lock contention due to having a single
* lock per-FS; really, this should be far more fine-grained.
*/
static void
ufs_extattr_uepm_lock(struct ufsmount *ump)
{
/*
* XXX This needs to be recursive for the following reasons:
* - it is taken in ufs_extattr_vnode_inactive
* - which is called from VOP_INACTIVE
* - which can be triggered by any vrele, vput, or vn_close
* - several of these can happen while it's held
*/
if (mutex_owned(&ump->um_extattr.uepm_lock)) {
ump->um_extattr.uepm_lockcnt++;
return;
}
mutex_enter(&ump->um_extattr.uepm_lock);
}
static void
ufs_extattr_uepm_unlock(struct ufsmount *ump)
{
if (ump->um_extattr.uepm_lockcnt != 0) {
KASSERT(mutex_owned(&ump->um_extattr.uepm_lock));
ump->um_extattr.uepm_lockcnt--;
return;
}
mutex_exit(&ump->um_extattr.uepm_lock);
}
/*-
* Determine whether the name passed is a valid name for an actual
* attribute.
*
* Invalid currently consists of:
* NULL pointer for attrname
* zero-length attrname (used to retrieve application attribute list)
*/
static int
ufs_extattr_valid_attrname(int attrnamespace, const char *attrname)
{
if (attrname == NULL)
return 0;
if (strlen(attrname) == 0)
return 0;
return 1;
}
/*
 * Autocreate the backing storage for an attribute.
 */
static int
ufs_extattr_autocreate_attr(struct vnode *vp, int attrnamespace,
const char *attrname, struct lwp *l, struct ufs_extattr_list_entry **uelep)
{
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
struct vnode *backing_vp;
struct pathbuf *pb;
char *path;
struct ufs_extattr_fileheader uef;
struct ufs_extattr_list_entry *uele;
int error;
path = PNBUF_GET();
/*
* We only support system and user namespace autocreation
*/
switch (attrnamespace) {
case EXTATTR_NAMESPACE_SYSTEM:
(void)snprintf(path, PATH_MAX, "%s/%s/%s/%s",
mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR,
UFS_EXTATTR_SUBDIR_SYSTEM, attrname);
break;
case EXTATTR_NAMESPACE_USER:
(void)snprintf(path, PATH_MAX, "%s/%s/%s/%s",
mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR,
UFS_EXTATTR_SUBDIR_USER, attrname);
break;
default:
PNBUF_PUT(path);
*uelep = NULL;
return EINVAL;
break;
}
/*
* Release extended attribute mount lock, otherwise
* we can deadlock with another thread that would lock
* vp after we unlock it below, and call
* ufs_extattr_uepm_lock(ump), for instance
* in ufs_getextattr().
*/
ufs_extattr_uepm_unlock(ump);
/*
* XXX unlock/lock should only be done when setting extattr
* on backing store or one of its parent directory
* including root, but we always do it for now.
*/
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
VOP_UNLOCK(vp);
pb = pathbuf_create(path);
/*
* Since we do not hold ufs_extattr_uepm_lock anymore,
* another thread may race with us for backend creation,
* but only one can succeed here thanks to O_EXCL.
*
* backing_vp is the backing store.
*/
error = vn_open(NULL, pb, 0, O_CREAT|O_EXCL|O_RDWR, 0600,
&backing_vp, NULL, NULL);
/*
* Reacquire the lock on the vnode
*/
KASSERT(VOP_ISLOCKED(vp) == 0);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
ufs_extattr_uepm_lock(ump);
if (error != 0) {
pathbuf_destroy(pb);
PNBUF_PUT(path);
*uelep = NULL;
return error;
}
KASSERT(backing_vp != NULL);
KASSERT(VOP_ISLOCKED(backing_vp) == LK_EXCLUSIVE);
pathbuf_destroy(pb);
PNBUF_PUT(path);
uef.uef_magic = UFS_EXTATTR_MAGIC;
uef.uef_version = UFS_EXTATTR_VERSION;
uef.uef_size = ufs_extattr_autocreate;
error = vn_rdwr(UIO_WRITE, backing_vp, &uef, sizeof(uef), 0,
UIO_SYSSPACE, IO_NODELOCKED|IO_APPEND,
l->l_cred, NULL, l);
VOP_UNLOCK(backing_vp);
if (error != 0) {
printf("%s: write uef header failed for `%s' (%d)\n",
__func__, attrname, error);
vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
*uelep = NULL;
return error;
}
/*
* Now enable attribute.
*/
	error = ufs_extattr_enable(ump, attrnamespace, attrname, backing_vp, l);
KASSERT(VOP_ISLOCKED(backing_vp) == 0);
if (error != 0) {
printf("%s: enable `%s' failed (%d)\n",
__func__, attrname, error);
vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
*uelep = NULL;
return error;
}
uele = ufs_extattr_find_attr(ump, attrnamespace, attrname);
if (uele == NULL) {
printf("%s: attribute `%s' created but not found!\n",
__func__, attrname);
vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
*uelep = NULL;
return ESRCH; /* really internal error */
}
printf("%s: EA backing store autocreated for %s\n",
mp->mnt_stat.f_mntonname, attrname);
*uelep = uele;
return 0;
}
/*
* Locate an attribute given a name and mountpoint.
* Must be holding uepm lock for the mount point.
*/
static struct ufs_extattr_list_entry *
ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace,
const char *attrname)
{
struct ufs_extattr_list_entry *search_attribute;
for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list);
search_attribute != NULL;
search_attribute = LIST_NEXT(search_attribute, uele_entries)) {
if (!(strncmp(attrname, search_attribute->uele_attrname,
UFS_EXTATTR_MAXEXTATTRNAME)) &&
(attrnamespace == search_attribute->uele_attrnamespace)) {
return search_attribute;
}
}
	return NULL;
}
/*
* Initialize per-FS structures supporting extended attributes. Do not
* start extended attributes yet.
*/
void
ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm)
{
uepm->uepm_flags = 0;
uepm->uepm_lockcnt = 0;
LIST_INIT(&uepm->uepm_list);
mutex_init(&uepm->uepm_lock, MUTEX_DEFAULT, IPL_NONE);
uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED;
}
/*
* Destroy per-FS structures supporting extended attributes. Assumes
* that EAs have already been stopped, and will panic if not.
*/
void
ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm)
{
if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
panic("ufs_extattr_uepm_destroy: not initialized");
if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED))
panic("ufs_extattr_uepm_destroy: called while still started");
/*
* It's not clear that either order for the next three lines is
* ideal, and it should never be a problem if this is only called
* during unmount, and with vfs_busy().
*/
uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED;
uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED;
mutex_destroy(&uepm->uepm_lock);
}
/*
* Start extended attribute support on an FS.
*/
int
ufs_extattr_start(struct mount *mp, struct lwp *l)
{
struct ufsmount *ump;
int error = 0;
ump = VFSTOUFS(mp);
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
ufs_extattr_uepm_init(&ump->um_extattr);
ufs_extattr_uepm_lock(ump);
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) {
error = EOPNOTSUPP;
goto unlock;
}
if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) {
error = EBUSY;
goto unlock;
}
ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED;
ump->um_extattr.uepm_ucred = l->l_cred;
kauth_cred_hold(ump->um_extattr.uepm_ucred);
unlock:
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Helper routine: given a locked parent directory and filename, return
* the locked vnode of the inode associated with the name. Will not
* follow symlinks, may return any type of vnode. Lock on parent will
* be released even in the event of a failure. In the event that the
* target is the parent (i.e., "."), there will be two references and
* one lock, requiring the caller to possibly special-case.
*/
static int
ufs_extattr_lookup(struct vnode *start_dvp, int lockparent,
const char *dirname,
struct vnode **vp, struct lwp *l)
{
struct vop_lookup_v2_args vargs;
struct componentname cnp;
struct vnode *target_vp;
char *pnbuf;
int error;
KASSERT(VOP_ISLOCKED(start_dvp) == LK_EXCLUSIVE);
pnbuf = PNBUF_GET();
memset(&cnp, 0, sizeof(cnp));
cnp.cn_nameiop = LOOKUP;
cnp.cn_flags = ISLASTCN | lockparent;
cnp.cn_cred = l->l_cred;
cnp.cn_nameptr = pnbuf;
error = copystr(dirname, pnbuf, MAXPATHLEN, &cnp.cn_namelen);
if (error) {
if (lockparent == 0) {
VOP_UNLOCK(start_dvp);
}
PNBUF_PUT(pnbuf);
printf("%s: copystr failed (%d)\n", __func__, error);
return error;
}
cnp.cn_namelen--; /* trim nul termination */
vargs.a_desc = NULL;
vargs.a_dvp = start_dvp;
vargs.a_vpp = &target_vp;
vargs.a_cnp = &cnp;
error = ufs_lookup(&vargs);
PNBUF_PUT(pnbuf);
if (error) {
if (lockparent == 0) {
VOP_UNLOCK(start_dvp);
}
return error;
}
#if 0
if (target_vp == start_dvp)
panic("%s: target_vp == start_dvp", __func__);
#endif
if (target_vp != start_dvp) {
error = vn_lock(target_vp, LK_EXCLUSIVE);
if (lockparent == 0)
VOP_UNLOCK(start_dvp);
if (error) {
vrele(target_vp);
return error;
}
}
KASSERT(VOP_ISLOCKED(target_vp) == LK_EXCLUSIVE);
*vp = target_vp;
return 0;
}
/*
* Enable an EA using the passed filesystem, backing vnode, attribute name,
* namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp
* to be locked when passed in. The vnode will be returned unlocked,
* regardless of success/failure of the function. As a result, the caller
* will always need to vrele(), but not vput().
*/
static int
ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp,
int attrnamespace, const char *attrname, struct lwp *l)
{
int error;
error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred);
if (error) {
printf("%s: VOP_OPEN(): failed (%d)\n", __func__, error);
VOP_UNLOCK(vp);
return error;
}
mutex_enter(vp->v_interlock);
vp->v_writecount++;
mutex_exit(vp->v_interlock);
vref(vp);
VOP_UNLOCK(vp);
error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, l);
if (error != 0)
vn_close(vp, FREAD|FWRITE, l->l_cred);
return error;
}
/*
* Given a locked directory vnode, iterate over the names in the directory
* and use ufs_extattr_lookup() to retrieve locked vnodes of potential
* attribute files. Then invoke ufs_extattr_enable_with_open() on each
* to attempt to start the attribute. Leaves the directory locked on
* exit.
*/
static int
ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp,
int attrnamespace, struct lwp *l)
{
struct vop_readdir_args vargs;
struct statvfs *sbp = &ump->um_mountp->mnt_stat;
struct dirent *dp, *edp;
struct vnode *attr_vp;
struct uio auio;
struct iovec aiov;
char *dirbuf;
int error, eofflag = 0;
if (dvp->v_type != VDIR)
return ENOTDIR;
dirbuf = kmem_alloc(UFS_DIRBLKSIZ, KM_SLEEP);
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_offset = 0;
UIO_SETUP_SYSSPACE(&auio);
vargs.a_desc = NULL;
vargs.a_vp = dvp;
vargs.a_uio = &auio;
vargs.a_cred = l->l_cred;
vargs.a_eofflag = &eofflag;
vargs.a_ncookies = NULL;
vargs.a_cookies = NULL;
while (!eofflag) {
auio.uio_resid = UFS_DIRBLKSIZ;
aiov.iov_base = dirbuf;
aiov.iov_len = UFS_DIRBLKSIZ;
error = ufs_readdir(&vargs);
		if (error) {
			printf("%s: ufs_readdir (%d)\n", __func__, error);
			kmem_free(dirbuf, UFS_DIRBLKSIZ);
			return error;
		}
/*
* XXXRW: While in UFS, we always get UFS_DIRBLKSIZ returns from
* the directory code on success, on other file systems this
* may not be the case. For portability, we should check the
* read length on return from ufs_readdir().
*/
edp = (struct dirent *)&dirbuf[UFS_DIRBLKSIZ];
for (dp = (struct dirent *)dirbuf; dp < edp; ) {
if (dp->d_reclen == 0)
break;
/* Skip "." and ".." */
if (dp->d_name[0] == '.' &&
(dp->d_name[1] == '\0' ||
(dp->d_name[1] == '.' && dp->d_name[2] == '\0')))
goto next;
error = ufs_extattr_lookup(dvp, LOCKPARENT,
dp->d_name, &attr_vp, l);
if (error == ENOENT) {
goto next; /* keep silent */
} else if (error) {
printf("%s: lookup `%s' (%d)\n", __func__,
dp->d_name, error);
} else if (attr_vp == dvp) {
vrele(attr_vp);
} else if (attr_vp->v_type != VREG) {
vput(attr_vp);
} else {
error = ufs_extattr_enable_with_open(ump,
attr_vp, attrnamespace, dp->d_name, l);
vrele(attr_vp);
if (error) {
printf("%s: enable `%s' (%d)\n",
__func__, dp->d_name, error);
} else if (bootverbose) {
printf("%s: EA %s loaded\n",
sbp->f_mntonname, dp->d_name);
}
}
next:
dp = (struct dirent *) ((char *)dp + dp->d_reclen);
if (dp >= edp)
break;
}
}
kmem_free(dirbuf, UFS_DIRBLKSIZ);
return 0;
}
static int
ufs_extattr_subdir(struct lwp *l, struct mount *mp, struct vnode *attr_dvp,
const char *subdir, int namespace)
{
int error;
struct vnode *attr_sub;
error = ufs_extattr_lookup(attr_dvp, LOCKPARENT, subdir, &attr_sub, l);
KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
if (error) {
printf("%s: Can't find `%s/%s/%s' (%d)\n",
__func__, mp->mnt_stat.f_mntonname,
UFS_EXTATTR_FSROOTSUBDIR, subdir, error);
return error;
}
KASSERT(VOP_ISLOCKED(attr_sub) == LK_EXCLUSIVE);
error = ufs_extattr_iterate_directory(VFSTOUFS(mp),
attr_sub, namespace, l);
if (error) {
printf("%s: ufs_extattr_iterate_directory `%s/%s/%s' (%d)\n",
__func__, mp->mnt_stat.f_mntonname,
UFS_EXTATTR_FSROOTSUBDIR, subdir, error);
}
KASSERT(VOP_ISLOCKED(attr_sub) == LK_EXCLUSIVE);
vput(attr_sub);
return error;
}
/*
* Auto-start of extended attributes, to be executed (optionally) at
* mount-time.
*/
int
ufs_extattr_autostart(struct mount *mp, struct lwp *l)
{
struct vnode *rvp, *attr_dvp;
int error;
/*
* Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root?
* If so, automatically start EA's.
*/
error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp);
if (error) {
printf("%s: VFS_ROOT() (%d)\n", __func__, error);
return error;
}
KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE);
error = ufs_extattr_lookup(rvp, 0,
UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, l);
if (error) {
/* rvp ref'd but now unlocked */
KASSERT(VOP_ISLOCKED(rvp) == 0);
vrele(rvp);
printf("%s: lookup `%s/%s' (%d)\n", __func__,
mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR, error);
return error;
}
if (rvp == attr_dvp) {
/* Should never happen. */
KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE);
vrele(attr_dvp);
vput(rvp);
printf("%s: `/' == `%s/%s' (%d)\n", __func__,
mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR, EINVAL);
return EINVAL;
}
KASSERT(VOP_ISLOCKED(rvp) == 0);
vrele(rvp);
KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
if (attr_dvp->v_type != VDIR) {
printf("%s: `%s/%s' is not a directory\n",
__func__, mp->mnt_stat.f_mntonname,
UFS_EXTATTR_FSROOTSUBDIR);
goto return_vput_attr_dvp;
}
error = ufs_extattr_start(mp, l);
if (error) {
printf("%s: ufs_extattr_start failed (%d)\n", __func__,
error);
goto return_vput_attr_dvp;
}
/*
* Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM,
* UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory,
* and start with appropriate type. Failures in either don't
* result in an over-all failure. attr_dvp is left locked to
* be cleaned up on exit.
*/
error = ufs_extattr_subdir(l, mp, attr_dvp, UFS_EXTATTR_SUBDIR_SYSTEM,
EXTATTR_NAMESPACE_SYSTEM);
error = ufs_extattr_subdir(l, mp, attr_dvp, UFS_EXTATTR_SUBDIR_USER,
EXTATTR_NAMESPACE_USER);
/* Mask startup failures in sub-directories. */
error = 0;
return_vput_attr_dvp:
KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
vput(attr_dvp);
return error;
}
/*
* Stop extended attribute support on an FS.
*/
void
ufs_extattr_stop(struct mount *mp, struct lwp *l)
{
struct ufs_extattr_list_entry *uele;
struct ufsmount *ump = VFSTOUFS(mp);
ufs_extattr_uepm_lock(ump);
/*
* If we haven't been started, no big deal. Just short-circuit
* the processing work.
*/
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
goto unlock;
}
while (LIST_FIRST(&ump->um_extattr.uepm_list) != NULL) {
uele = LIST_FIRST(&ump->um_extattr.uepm_list);
ufs_extattr_disable(ump, uele->uele_attrnamespace,
uele->uele_attrname, l);
}
ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED;
kauth_cred_free(ump->um_extattr.uepm_ucred);
ump->um_extattr.uepm_ucred = NULL;
unlock:
ufs_extattr_uepm_unlock(ump);
}
/*
* Enable a named attribute on the specified filesystem; provide an
* unlocked backing vnode to hold the attribute data.
*/
static int
ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
const char *attrname, struct vnode *backing_vnode, struct lwp *l)
{
struct ufs_extattr_list_entry *attribute;
struct iovec aiov;
struct uio auio;
int error = 0;
if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
return EINVAL;
if (backing_vnode->v_type != VREG)
return EINVAL;
attribute = kmem_zalloc(sizeof(*attribute), KM_SLEEP);
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
error = EOPNOTSUPP;
goto free_exit;
}
if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) {
error = EEXIST;
goto free_exit;
}
strncpy(attribute->uele_attrname, attrname,
UFS_EXTATTR_MAXEXTATTRNAME);
attribute->uele_attrnamespace = attrnamespace;
memset(&attribute->uele_fileheader, 0,
sizeof(struct ufs_extattr_fileheader));
attribute->uele_backing_vnode = backing_vnode;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
aiov.iov_base = (void *) &attribute->uele_fileheader;
aiov.iov_len = sizeof(struct ufs_extattr_fileheader);
auio.uio_resid = sizeof(struct ufs_extattr_fileheader);
auio.uio_offset = (off_t) 0;
auio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&auio);
vn_lock(backing_vnode, LK_SHARED | LK_RETRY);
error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED,
ump->um_extattr.uepm_ucred);
if (error)
goto unlock_free_exit;
if (auio.uio_resid != 0) {
printf("%s: malformed attribute header\n", __func__);
error = EINVAL;
goto unlock_free_exit;
}
/*
* Try to determine the byte order of the attribute file.
*/
if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) {
attribute->uele_flags |= UELE_F_NEEDSWAP;
attribute->uele_fileheader.uef_magic =
ufs_rw32(attribute->uele_fileheader.uef_magic,
UELE_NEEDSWAP(attribute));
if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) {
printf("%s: invalid attribute header magic\n",
__func__);
error = EINVAL;
goto unlock_free_exit;
}
}
attribute->uele_fileheader.uef_version =
ufs_rw32(attribute->uele_fileheader.uef_version,
UELE_NEEDSWAP(attribute));
attribute->uele_fileheader.uef_size =
ufs_rw32(attribute->uele_fileheader.uef_size,
UELE_NEEDSWAP(attribute));
if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) {
printf("%s: incorrect attribute header version %d != %d\n",
__func__, attribute->uele_fileheader.uef_version,
UFS_EXTATTR_VERSION);
error = EINVAL;
goto unlock_free_exit;
}
LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute, uele_entries);
VOP_UNLOCK(backing_vnode);
return 0;
unlock_free_exit:
VOP_UNLOCK(backing_vnode);
free_exit:
kmem_free(attribute, sizeof(*attribute));
return error;
}
/*
* Disable extended attribute support on an FS.
*/
static int
ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
const char *attrname, struct lwp *l)
{
struct ufs_extattr_list_entry *uele;
int error = 0;
if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
return EINVAL;
uele = ufs_extattr_find_attr(ump, attrnamespace, attrname);
if (!uele)
return ENODATA;
LIST_REMOVE(uele, uele_entries);
error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE, l->l_cred);
kmem_free(uele, sizeof(*uele));
return error;
}
/*
* VFS call to manage extended attributes in UFS. If filename_vp is
* non-NULL, it must be passed in locked, and regardless of errors in
* processing, will be unlocked.
*/
int
ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
int attrnamespace, const char *attrname)
{
struct lwp *l = curlwp;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
/*
* Only privileged processes can configure extended attributes.
*/
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_EXTATTR,
0, mp, NULL, NULL);
if (error) {
if (filename_vp != NULL)
VOP_UNLOCK(filename_vp);
return error;
}
switch(cmd) {
case UFS_EXTATTR_CMD_START:
case UFS_EXTATTR_CMD_STOP:
case UFS_EXTATTR_CMD_ENABLE:
case UFS_EXTATTR_CMD_DISABLE:
if (filename_vp != NULL) {
VOP_UNLOCK(filename_vp);
return EINVAL;
}
if (attrname != NULL)
return EINVAL;
break;
default:
return EINVAL;
}
switch(cmd) {
case UFS_EXTATTR_CMD_START:
error = ufs_extattr_autostart(mp, l);
return error;
case UFS_EXTATTR_CMD_STOP:
ufs_extattr_stop(mp, l);
return 0;
case UFS_EXTATTR_CMD_ENABLE:
/*
* ufs_extattr_enable_with_open() will always unlock the
* vnode, regardless of failure.
*/
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_enable_with_open(ump, filename_vp,
attrnamespace, attrname, l);
ufs_extattr_uepm_unlock(ump);
return error;
case UFS_EXTATTR_CMD_DISABLE:
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_disable(ump, attrnamespace, attrname, l);
ufs_extattr_uepm_unlock(ump);
return error;
default:
return EINVAL;
}
}
/*
* Read extended attribute header for a given vnode and attribute.
* Backing vnode should be locked and unlocked by caller.
*/
static int
ufs_extattr_get_header(struct vnode *vp, struct ufs_extattr_list_entry *uele,
struct ufs_extattr_header *ueh, off_t *bap)
{
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
struct inode *ip = VTOI(vp);
off_t base_offset;
struct iovec aiov;
struct uio aio;
int error;
/*
* Find base offset of header in file based on file header size, and
* data header size + maximum data size, indexed by inode number.
*/
base_offset = sizeof(struct ufs_extattr_fileheader) +
ip->i_number * (sizeof(struct ufs_extattr_header) +
uele->uele_fileheader.uef_size);
/*
* Read in the data header to see if the data is defined, and if so
* how much.
*/
memset(ueh, 0, sizeof(struct ufs_extattr_header));
aiov.iov_base = ueh;
aiov.iov_len = sizeof(struct ufs_extattr_header);
aio.uio_iov = &aiov;
aio.uio_iovcnt = 1;
aio.uio_rw = UIO_READ;
aio.uio_offset = base_offset;
aio.uio_resid = sizeof(struct ufs_extattr_header);
UIO_SETUP_SYSSPACE(&aio);
error = VOP_READ(uele->uele_backing_vnode, &aio,
IO_NODELOCKED, ump->um_extattr.uepm_ucred);
if (error)
return error;
/*
* Attribute headers are kept in file system byte order.
* XXX What about the blob of data?
*/
ueh->ueh_flags = ufs_rw32(ueh->ueh_flags, UELE_NEEDSWAP(uele));
ueh->ueh_len = ufs_rw32(ueh->ueh_len, UELE_NEEDSWAP(uele));
ueh->ueh_i_gen = ufs_rw32(ueh->ueh_i_gen, UELE_NEEDSWAP(uele));
/* Defined? */
if ((ueh->ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0)
return ENODATA;
/* Valid for the current inode generation? */
if (ueh->ueh_i_gen != ip->i_gen) {
/*
* The inode itself has a different generation number
* than the uele data. For now, the best solution
* is to coerce this to undefined, and let it get cleaned
* up by the next write or extattrctl clean.
*/
printf("%s: %s: inode gen inconsistency (%u, %jd)\n",
__func__, mp->mnt_stat.f_mntonname, ueh->ueh_i_gen,
(intmax_t)ip->i_gen);
return ENODATA;
}
/* Local size consistency check. */
if (ueh->ueh_len > uele->uele_fileheader.uef_size)
return ENXIO;
/* Return base offset */
if (bap != NULL)
*bap = base_offset;
return 0;
}
/*
* Vnode operation to retrieve a named extended attribute.
*/
int
ufs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
OUT size_t *a_size;
IN kauth_cred_t a_cred;
};
*/
{
struct mount *mp = ap->a_vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return EOPNOTSUPP;
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name,
ap->a_uio, ap->a_size, ap->a_cred, curlwp);
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Real work associated with retrieving a named attribute--assumes that
* the attribute lock has already been grabbed.
*/
static int
ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name,
struct uio *uio, size_t *size, kauth_cred_t cred, struct lwp *l)
{
struct ufs_extattr_list_entry *attribute;
struct ufs_extattr_header ueh;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
off_t base_offset;
size_t len, old_len;
int error = 0;
if (strlen(name) == 0)
return EINVAL;
error = extattr_check_cred(vp, attrnamespace, cred, VREAD);
if (error)
return error;
attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
if (!attribute)
return ENODATA;
/*
* Allow only offsets of zero to encourage the read/replace
* extended attribute semantic. Otherwise we can't guarantee
* atomicity, as we don't provide locks for extended attributes.
*/
if (uio != NULL && uio->uio_offset != 0)
return ENXIO;
/*
* Don't need to get a lock on the backing file if the getattr is
* being applied to the backing file, as the lock is already held.
*/
if (attribute->uele_backing_vnode != vp)
vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY);
error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset);
if (error)
goto vopunlock_exit;
/* Return full data size if caller requested it. */
if (size != NULL)
*size = ueh.ueh_len;
/* Return data if the caller requested it. */
if (uio != NULL) {
/* Allow for offset into the attribute data. */
uio->uio_offset = base_offset + sizeof(struct
ufs_extattr_header);
/*
* Figure out maximum to transfer -- use buffer size and
* local data limit.
*/
len = MIN(uio->uio_resid, ueh.ueh_len);
old_len = uio->uio_resid;
uio->uio_resid = len;
error = VOP_READ(attribute->uele_backing_vnode, uio,
IO_NODELOCKED, ump->um_extattr.uepm_ucred);
if (error)
goto vopunlock_exit;
uio->uio_resid = old_len - (len - uio->uio_resid);
}
vopunlock_exit:
if (uio != NULL)
uio->uio_offset = 0;
if (attribute->uele_backing_vnode != vp)
VOP_UNLOCK(attribute->uele_backing_vnode);
return error;
}
/*
* Vnode operation to list extended attribute for a vnode
*/
int
ufs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
INOUT struct uio *a_uio;
OUT size_t *a_size;
IN int flag;
IN kauth_cred_t a_cred;
struct proc *a_p;
};
*/
{
struct mount *mp = ap->a_vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return EOPNOTSUPP;
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_list(ap->a_vp, ap->a_attrnamespace,
ap->a_uio, ap->a_size, ap->a_flag, ap->a_cred, curlwp);
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Real work associated with retrieving list of attributes--assumes that
* the attribute lock has already been grabbed.
*/
static int
ufs_extattr_list(struct vnode *vp, int attrnamespace,
struct uio *uio, size_t *size, int flag,
kauth_cred_t cred, struct lwp *l)
{
struct ufs_extattr_list_entry *uele;
struct ufs_extattr_header ueh;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
size_t listsize = 0;
int error = 0;
/*
* XXX: We can move this inside the loop and iterate on individual
* attributes.
*/
error = extattr_check_cred(vp, attrnamespace, cred, VREAD);
if (error)
return error;
LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) {
unsigned char attrnamelen;
if (uele->uele_attrnamespace != attrnamespace)
continue;
error = ufs_extattr_get_header(vp, uele, &ueh, NULL);
if (error == ENODATA)
continue;
if (error != 0)
return error;
/*
* Don't need to get a lock on the backing file if
* the listattr is being applied to the backing file,
* as the lock is already held.
*/
if (uele->uele_backing_vnode != vp)
vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY);
/*
* +1 for trailing NUL (listxattr flavor)
* or leading name length (extattr_list_file flavor)
*/
attrnamelen = strlen(uele->uele_attrname);
listsize += attrnamelen + 1;
/* Return data if the caller requested it. */
if (uio != NULL) {
/*
* We support two flavors. Either NUL-terminated
* strings (a la listxattr), or non NUL-terminated,
* one byte length prefixed strings (for
* extattr_list_file). EXTATTR_LIST_LENPREFIX switches
* that second behavior.
*/
if (flag & EXTATTR_LIST_LENPREFIX) {
uint8_t len = (uint8_t)attrnamelen;
/* Copy leading name length */
error = uiomove(&len, sizeof(len), uio);
if (error != 0)
break;
} else {
/* Include trailing NULL */
attrnamelen++;
}
error = uiomove(uele->uele_attrname,
(size_t)attrnamelen, uio);
if (error != 0)
break;
}
if (uele->uele_backing_vnode != vp)
VOP_UNLOCK(uele->uele_backing_vnode);
if (error != 0)
return error;
}
if (uio != NULL)
uio->uio_offset = 0;
/* Return full data size if caller requested it. */
if (size != NULL)
*size = listsize;
return 0;
}
/*
* Vnode operation to remove a named attribute.
*/
int
ufs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
IN kauth_cred_t a_cred;
};
*/
{
struct mount *mp = ap->a_vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return EOPNOTSUPP;
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name,
ap->a_cred, curlwp);
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Vnode operation to set a named attribute.
*/
int
ufs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
IN kauth_cred_t a_cred;
};
*/
{
struct mount *mp = ap->a_vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return EOPNOTSUPP;
ufs_extattr_uepm_lock(ump);
/*
* XXX: No longer a supported way to delete extended attributes.
*/
if (ap->a_uio == NULL) {
ufs_extattr_uepm_unlock(ump);
return EINVAL;
}
error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name,
ap->a_uio, ap->a_cred, curlwp);
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Real work associated with setting a vnode's extended attributes;
* assumes that the attribute lock has already been grabbed.
*/
static int
ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name,
struct uio *uio, kauth_cred_t cred, struct lwp *l)
{
struct ufs_extattr_list_entry *attribute;
struct ufs_extattr_header ueh;
struct iovec local_aiov;
struct uio local_aio;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
struct inode *ip = VTOI(vp);
off_t base_offset;
int error = 0, ioflag;
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
if (!ufs_extattr_valid_attrname(attrnamespace, name))
return EINVAL;
error = extattr_check_cred(vp, attrnamespace, cred, VWRITE);
if (error)
return error;
attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
if (!attribute) {
error = ufs_extattr_autocreate_attr(vp, attrnamespace,
name, l, &attribute);
if (error == EEXIST) {
/* Another thread raced us for backend creation */
error = 0;
attribute =
ufs_extattr_find_attr(ump, attrnamespace, name);
}
if (error || !attribute)
return ENODATA;
}
/*
* Early rejection of invalid offsets/length.
* Reject: any offset but 0 (replace)
* Any size greater than attribute size limit
*/
if (uio->uio_offset != 0 ||
uio->uio_resid > attribute->uele_fileheader.uef_size)
return ENXIO;
/*
* Find base offset of header in file based on file header size, and
* data header size + maximum data size, indexed by inode number.
*/
base_offset = sizeof(struct ufs_extattr_fileheader) +
ip->i_number * (sizeof(struct ufs_extattr_header) +
attribute->uele_fileheader.uef_size);
/*
* Write out a data header for the data.
*/
ueh.ueh_len = ufs_rw32((uint32_t) uio->uio_resid,
UELE_NEEDSWAP(attribute));
ueh.ueh_flags = ufs_rw32(UFS_EXTATTR_ATTR_FLAG_INUSE,
UELE_NEEDSWAP(attribute));
ueh.ueh_i_gen = ufs_rw32(ip->i_gen, UELE_NEEDSWAP(attribute));
local_aiov.iov_base = &ueh;
local_aiov.iov_len = sizeof(struct ufs_extattr_header);
local_aio.uio_iov = &local_aiov;
local_aio.uio_iovcnt = 1;
local_aio.uio_rw = UIO_WRITE;
local_aio.uio_offset = base_offset;
local_aio.uio_resid = sizeof(struct ufs_extattr_header);
UIO_SETUP_SYSSPACE(&local_aio);
/*
* Don't need to get a lock on the backing file if the setattr is
* being applied to the backing file, as the lock is already held.
*/
if (attribute->uele_backing_vnode != vp)
vn_lock(attribute->uele_backing_vnode,
LK_EXCLUSIVE | LK_RETRY);
ioflag = IO_NODELOCKED;
if (ufs_extattr_sync)
ioflag |= IO_SYNC;
error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag,
ump->um_extattr.uepm_ucred);
if (error)
goto vopunlock_exit;
if (local_aio.uio_resid != 0) {
error = ENXIO;
goto vopunlock_exit;
}
/*
* Write out user data.
* XXX NOT ATOMIC WITH RESPECT TO THE HEADER.
*/
uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header);
ioflag = IO_NODELOCKED;
if (ufs_extattr_sync)
ioflag |= IO_SYNC;
error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag,
ump->um_extattr.uepm_ucred);
vopunlock_exit:
uio->uio_offset = 0;
if (attribute->uele_backing_vnode != vp)
VOP_UNLOCK(attribute->uele_backing_vnode);
return error;
}
/*
* Real work associated with removing an extended attribute from a vnode.
* Assumes the attribute lock has already been grabbed.
*/
static int
ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name,
kauth_cred_t cred, struct lwp *l)
{
struct ufs_extattr_list_entry *attribute;
struct ufs_extattr_header ueh;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
struct iovec local_aiov;
struct uio local_aio;
off_t base_offset;
int error = 0, ioflag;
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
if (!ufs_extattr_valid_attrname(attrnamespace, name))
return EINVAL;
error = extattr_check_cred(vp, attrnamespace, cred, VWRITE);
if (error)
return error;
attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
if (!attribute)
return ENODATA;
/*
* Don't need to get a lock on the backing file if the getattr is
* being applied to the backing file, as the lock is already held.
*/
if (attribute->uele_backing_vnode != vp)
vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY);
error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset);
if (error)
goto vopunlock_exit;
/* Flag it as not in use. */
ueh.ueh_flags = 0; /* No need to byte swap 0 */
ueh.ueh_len = 0; /* ...ditto... */
local_aiov.iov_base = &ueh;
local_aiov.iov_len = sizeof(struct ufs_extattr_header);
local_aio.uio_iov = &local_aiov;
local_aio.uio_iovcnt = 1;
local_aio.uio_rw = UIO_WRITE;
local_aio.uio_offset = base_offset;
local_aio.uio_resid = sizeof(struct ufs_extattr_header);
UIO_SETUP_SYSSPACE(&local_aio);
ioflag = IO_NODELOCKED;
if (ufs_extattr_sync)
ioflag |= IO_SYNC;
error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag,
ump->um_extattr.uepm_ucred);
if (error)
goto vopunlock_exit;
if (local_aio.uio_resid != 0)
error = ENXIO;
vopunlock_exit:
VOP_UNLOCK(attribute->uele_backing_vnode);
return error;
}
/*
* Called by UFS when an inode is no longer active and should have its
* attributes stripped.
*/
void
ufs_extattr_vnode_inactive(struct vnode *vp, struct lwp *l)
{
struct ufs_extattr_list_entry *uele;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
	/*
	 * If the per-mount EA state is not yet initialized, we cannot take
	 * the lock.  We should not have any active vnodes on the fs if it
	 * is not yet initialized but is going to be, so these checks can
	 * go unlocked.
	 */
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
return;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return;
	ufs_extattr_uepm_lock(ump);
	LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries)
ufs_extattr_rm(vp, uele->uele_attrnamespace,
uele->uele_attrname, lwp0.l_cred, l);
ufs_extattr_uepm_unlock(ump);
}
void
ufs_extattr_init(void)
{
}
void
ufs_extattr_done(void)
{
}
/* $NetBSD: clockctl.c,v 1.39 2022/03/28 12:33:20 riastradh Exp $ */
/*-
* Copyright (c) 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Emmanuel Dreyfus.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: clockctl.c,v 1.39 2022/03/28 12:33:20 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/device.h>
#include <sys/time.h>
#include <sys/conf.h>
#include <sys/timex.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/compat_stub.h>
#include <sys/clockctl.h>
#include <compat/sys/clockctl.h>
#include <compat/sys/time_types.h>
kmutex_t clockctl_mtx;
int clockctl_refcnt;
#include "ioconf.h"
dev_type_ioctl(clockctlioctl);
const struct cdevsw clockctl_cdevsw = {
.d_open = clockctlopen,
.d_close = clockctlclose,
.d_read = noread,
.d_write = nowrite,
.d_ioctl = clockctlioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER,
};
static kauth_listener_t clockctl_listener;
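/*
 * kauth(9) listener: allow KAUTH_REQ_SYSTEM_TIME_SYSTEM requests that
 * arrive through the clockctl device (arg3 != NULL), since access to
 * the device node is already governed by its file permissions; defer
 * all other requests to other listeners.
 */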
static int
clockctl_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
enum kauth_system_req req;
bool device_context;
result = KAUTH_RESULT_DEFER;
req = (enum kauth_system_req)(uintptr_t)arg0;
if ((action != KAUTH_SYSTEM_TIME) ||
(req != KAUTH_REQ_SYSTEM_TIME_SYSTEM))
return result;
device_context = arg3 != NULL;
/* Device is controlled by permissions, so allow. */
if (device_context)
result = KAUTH_RESULT_ALLOW;
return result;
}
/*ARGSUSED*/
void
clockctlattach(int num)
{
/*
* Don't initialize the listener here - it will get handled as part
* of module initialization.
*/
#if 0
clockctl_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
clockctl_listener_cb, NULL);
#endif
}
/*
* Maintain a refcount for each open/close, so we know when it is
* safe to call devsw_detach()
*/
int
clockctlopen(dev_t dev, int flag, int mode, struct lwp *l)
{
mutex_enter(&clockctl_mtx);
clockctl_refcnt++;
mutex_exit(&clockctl_mtx);
return 0;
}
int
clockctlclose(dev_t dev, int flag, int mode, struct lwp *l)
{
mutex_enter(&clockctl_mtx);
clockctl_refcnt--;
mutex_exit(&clockctl_mtx);
return 0;
}
MODULE(MODULE_CLASS_DRIVER, clockctl, NULL);
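/*
 * Module control: on initialization, register the kauth listener and
 * (when built as a module) attach the device switch; refuse to unload
 * while the device is still open.
 */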
int
clockctl_modcmd(modcmd_t cmd, void *data)
{
int error;
#ifdef _MODULE
int bmajor, cmajor;
#endif
error = 0;
switch (cmd) {
case MODULE_CMD_INIT:
mutex_init(&clockctl_mtx, MUTEX_DEFAULT, IPL_NONE);
clockctl_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
clockctl_listener_cb, NULL);
#ifdef _MODULE
bmajor = cmajor = -1;
error = devsw_attach("clockctl", NULL, &bmajor,
&clockctl_cdevsw, &cmajor);
if (error != 0)
kauth_unlisten_scope(clockctl_listener);
#endif
break;
case MODULE_CMD_FINI:
mutex_enter(&clockctl_mtx);
if (clockctl_refcnt != 0) {
mutex_exit(&clockctl_mtx);
return EBUSY;
}
#ifdef _MODULE
devsw_detach(NULL, &clockctl_cdevsw);
#endif
mutex_exit(&clockctl_mtx);
kauth_unlisten_scope(clockctl_listener);
mutex_destroy(&clockctl_mtx);
break;
default:
error = ENOTTY;
break;
}
return error;
}
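/*
 * clockctl ioctl handler: dispatch each command to the same kernel
 * back end as the corresponding system call (settimeofday1(),
 * adjtime1(), clock_settime1(), and the NTP adjtime hooks); unknown
 * commands are passed to the clockctl_ioctl_50 compat hook.
 */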
int
clockctlioctl(
dev_t dev,
u_long cmd,
void *data,
int flags,
struct lwp *l)
{
int error = 0;
switch (cmd) {
case CLOCKCTL_SETTIMEOFDAY: {
struct clockctl_settimeofday *args = data;
error = settimeofday1(args->tv, true, args->tzp, l, false);
break;
}
case CLOCKCTL_ADJTIME: {
struct timeval atv, oldatv;
struct clockctl_adjtime *args = data;
if (args->delta) {
error = copyin(args->delta, &atv, sizeof(atv));
if (error)
return (error);
}
adjtime1(args->delta ? &atv : NULL,
args->olddelta ? &oldatv : NULL, l->l_proc);
		if (args->olddelta)
			error = copyout(&oldatv, args->olddelta,
			    sizeof(oldatv));
break;
}
case CLOCKCTL_CLOCK_SETTIME: {
struct clockctl_clock_settime *args = data;
struct timespec ts;
error = copyin(args->tp, &ts, sizeof ts);
if (error)
return (error);
error = clock_settime1(l->l_proc, args->clock_id, &ts, false);
break;
}
case CLOCKCTL_NTP_ADJTIME: {
struct clockctl_ntp_adjtime *args = data;
struct timex ntv;
if (vec_ntp_timestatus == NULL) {
error = ENOTTY;
break;
}
error = copyin(args->tp, &ntv, sizeof(ntv));
if (error)
return (error);
(*vec_ntp_adjtime1)(&ntv);
error = copyout(&ntv, args->tp, sizeof(ntv));
		if (error == 0)
			args->retval = (*vec_ntp_timestatus)();
break;
}
default:
MODULE_HOOK_CALL(clockctl_ioctl_50_hook,
(dev, cmd, data, flags, l), enosys(), error);
if (error == ENOSYS)
error = ENOTTY;
}
return (error);
}
/* $NetBSD: lfs_vfsops.c,v 1.382 2022/03/19 13:53:33 hannken Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Konrad E. Schroder <perseant@hhhh.org>.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1989, 1991, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)lfs_vfsops.c 8.20 (Berkeley) 6/10/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.382 2022/03/19 13:53:33 hannken Exp $");
#if defined(_KERNEL_OPT)
#include "opt_lfs.h"
#include "opt_quota.h"
#include "opt_uvmhist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kthread.h>
#include <sys/buf.h>
#include <sys/device.h>
#include <sys/file.h>
#include <sys/disklabel.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/syscallvar.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/lfs/ulfs_quotacommon.h>
#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/ulfsmount.h>
#include <ufs/lfs/ulfs_bswap.h>
#include <ufs/lfs/ulfs_extern.h>
#ifdef UVMHIST
#include <uvm/uvm.h>
#endif
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
#include <uvm/uvm_page.h>
#include <uvm/uvm_stat.h>
#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_accessors.h>
#include <ufs/lfs/lfs_kernel.h>
#include <ufs/lfs/lfs_extern.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
MODULE(MODULE_CLASS_VFS, lfs, NULL);
static int lfs_gop_write(struct vnode *, struct vm_page **, int, int);
static int lfs_mountfs(struct vnode *, struct mount *, struct lwp *);
static int lfs_flushfiles(struct mount *, int);
extern const struct vnodeopv_desc lfs_vnodeop_opv_desc;
extern const struct vnodeopv_desc lfs_specop_opv_desc;
extern const struct vnodeopv_desc lfs_fifoop_opv_desc;
struct lwp * lfs_writer_daemon = NULL;
kcondvar_t lfs_writerd_cv;
int lfs_do_flush = 0;
#ifdef LFS_KERNEL_RFW
int lfs_do_rfw = 0;
#endif
const struct vnodeopv_desc * const lfs_vnodeopv_descs[] = {
&lfs_vnodeop_opv_desc,
&lfs_specop_opv_desc,
&lfs_fifoop_opv_desc,
NULL,
};
struct vfsops lfs_vfsops = {
.vfs_name = MOUNT_LFS,
.vfs_min_mount_data = sizeof (struct ulfs_args),
.vfs_mount = lfs_mount,
.vfs_start = ulfs_start,
.vfs_unmount = lfs_unmount,
.vfs_root = ulfs_root,
.vfs_quotactl = ulfs_quotactl,
.vfs_statvfs = lfs_statvfs,
.vfs_sync = lfs_sync,
.vfs_vget = lfs_vget,
.vfs_loadvnode = lfs_loadvnode,
.vfs_newvnode = lfs_newvnode,
.vfs_fhtovp = lfs_fhtovp,
.vfs_vptofh = lfs_vptofh,
.vfs_init = lfs_init,
.vfs_reinit = lfs_reinit,
.vfs_done = lfs_done,
.vfs_mountroot = lfs_mountroot,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = lfs_extattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = lfs_vnodeopv_descs
};
const struct genfs_ops lfs_genfsops = {
.gop_size = lfs_gop_size,
.gop_alloc = ulfs_gop_alloc,
.gop_write = lfs_gop_write,
.gop_markupdate = ulfs_gop_markupdate,
.gop_putrange = genfs_gop_putrange,
};
struct shortlong {
const char *sname;
const char *lname;
};
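/*
 * Sysctl helper for the LFS "dostats" knob: when statistics collection
 * is switched off, also clear the accumulated counters.
 */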
static int
sysctl_lfs_dostats(SYSCTLFN_ARGS)
{
extern struct lfs_stats lfs_stats;
extern int lfs_dostats;
int error;
error = sysctl_lookup(SYSCTLFN_CALL(rnode));
if (error || newp == NULL)
return (error);
if (lfs_dostats == 0)
memset(&lfs_stats, 0, sizeof(lfs_stats));
return (0);
}
SYSCTL_SETUP(lfs_sysctl_setup, "lfs sysctl")
{
int i;
extern int lfs_writeindir, lfs_dostats, lfs_clean_vnhead,
lfs_fs_pagetrip, lfs_ignore_lazy_sync;
#ifdef DEBUG
extern int lfs_debug_log_subsys[DLOG_MAX];
struct shortlong dlog_names[DLOG_MAX] = { /* Must match lfs.h ! */
{ "rollforward", "Debug roll-forward code" },
{ "alloc", "Debug inode allocation and free list" },
{ "avail", "Debug space-available-now accounting" },
{ "flush", "Debug flush triggers" },
{ "lockedlist", "Debug locked list accounting" },
{ "vnode_verbose", "Verbose per-vnode-written debugging" },
{ "vnode", "Debug vnode use during segment write" },
{ "segment", "Debug segment writing" },
{ "seguse", "Debug segment used-bytes accounting" },
{ "cleaner", "Debug cleaning routines" },
{ "mount", "Debug mount/unmount routines" },
{ "pagecache", "Debug UBC interactions" },
{ "dirop", "Debug directory-operation accounting" },
{ "malloc", "Debug private malloc accounting" },
};
#endif /* DEBUG */
struct shortlong stat_names[] = { /* Must match lfs.h! */
{ "segsused", "Number of new segments allocated" },
{ "psegwrites", "Number of partial-segment writes" },
{ "psyncwrites", "Number of synchronous partial-segment"
" writes" },
{ "pcleanwrites", "Number of partial-segment writes by the"
" cleaner" },
{ "blocktot", "Number of blocks written" },
{ "cleanblocks", "Number of blocks written by the cleaner" },
{ "ncheckpoints", "Number of checkpoints made" },
{ "nwrites", "Number of whole writes" },
{ "nsync_writes", "Number of synchronous writes" },
{ "wait_exceeded", "Number of times writer waited for"
" cleaner" },
{ "write_exceeded", "Number of times writer invoked flush" },
{ "flush_invoked", "Number of times flush was invoked" },
{ "vflush_invoked", "Number of time vflush was called" },
{ "clean_inlocked", "Number of vnodes skipped for being dead" },
{ "clean_vnlocked", "Number of vnodes skipped for vget failure" },
{ "segs_reclaimed", "Number of segments reclaimed" },
};
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "lfs",
SYSCTL_DESCR("Log-structured file system"),
NULL, 0, NULL, 0,
CTL_VFS, 5, CTL_EOL);
/*
* XXX the "5" above could be dynamic, thereby eliminating one
* more instance of the "number to vfs" mapping problem, but
* "5" is the order as taken from sys/mount.h
*/
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "flushindir", NULL,
NULL, 0, &lfs_writeindir, 0,
CTL_VFS, 5, LFS_WRITEINDIR, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "clean_vnhead", NULL,
NULL, 0, &lfs_clean_vnhead, 0,
CTL_VFS, 5, LFS_CLEAN_VNHEAD, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "dostats",
SYSCTL_DESCR("Maintain statistics on LFS operations"),
sysctl_lfs_dostats, 0, &lfs_dostats, 0,
CTL_VFS, 5, LFS_DOSTATS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "pagetrip",
SYSCTL_DESCR("How many dirty pages in fs triggers"
" a flush"),
NULL, 0, &lfs_fs_pagetrip, 0,
CTL_VFS, 5, LFS_FS_PAGETRIP, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "ignore_lazy_sync",
SYSCTL_DESCR("Lazy Sync is ignored entirely"),
NULL, 0, &lfs_ignore_lazy_sync, 0,
CTL_VFS, 5, LFS_IGNORE_LAZY_SYNC, CTL_EOL);
#ifdef LFS_KERNEL_RFW
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "rfw",
SYSCTL_DESCR("Use in-kernel roll-forward on mount"),
NULL, 0, &lfs_do_rfw, 0,
CTL_VFS, 5, LFS_DO_RFW, CTL_EOL);
#endif
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "stats",
SYSCTL_DESCR("Debugging options"),
NULL, 0, NULL, 0,
CTL_VFS, 5, LFS_STATS, CTL_EOL);
for (i = 0; i < sizeof(struct lfs_stats) / sizeof(u_int); i++) {
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_INT, stat_names[i].sname,
SYSCTL_DESCR(stat_names[i].lname),
NULL, 0, &(((u_int *)&lfs_stats.segsused)[i]),
0, CTL_VFS, 5, LFS_STATS, i, CTL_EOL);
}
#ifdef DEBUG
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "debug",
SYSCTL_DESCR("Debugging options"),
NULL, 0, NULL, 0,
CTL_VFS, 5, LFS_DEBUGLOG, CTL_EOL);
for (i = 0; i < DLOG_MAX; i++) {
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, dlog_names[i].sname,
SYSCTL_DESCR(dlog_names[i].lname),
NULL, 0, &(lfs_debug_log_subsys[i]), 0,
CTL_VFS, 5, LFS_DEBUGLOG, i, CTL_EOL);
}
#endif
}
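/*
 * Illustrative sketch only (not compiled and not part of this module):
 * the nodes created above appear under vfs.lfs.*, so a hypothetical
 * userland program could toggle statistics collection and read one of
 * the integer knobs with sysctlbyname(3), roughly as follows.
 */
#if 0
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
int dostats = 1, flushindir;
size_t len = sizeof(flushindir);

/* Enable statistics collection, i.e. "sysctl -w vfs.lfs.dostats=1". */
if (sysctlbyname("vfs.lfs.dostats", NULL, NULL,
    &dostats, sizeof(dostats)) == -1)
err(1, "set vfs.lfs.dostats");

/* Read back the "flushindir" knob created above. */
if (sysctlbyname("vfs.lfs.flushindir", &flushindir, &len,
    NULL, 0) == -1)
err(1, "get vfs.lfs.flushindir");
printf("vfs.lfs.flushindir = %d\n", flushindir);
return 0;
}
#endif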
/* Old cleaner syscall interface; see VOP_FCNTL(). */
static const struct syscall_package lfs_syscalls[] = {
{ SYS_lfs_bmapv, 0, (sy_call_t *)sys_lfs_bmapv },
{ SYS_lfs_markv, 0, (sy_call_t *)sys_lfs_markv },
{ SYS___lfs_segwait50, 0, (sy_call_t *)sys___lfs_segwait50 },
{ SYS_lfs_segclean, 0, (sy_call_t *)sys_lfs_segclean },
{ 0, 0, NULL },
};
static int
lfs_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = syscall_establish(NULL, lfs_syscalls);
if (error)
return error;
error = vfs_attach(&lfs_vfsops);
if (error != 0) {
syscall_disestablish(NULL, lfs_syscalls);
break;
}
cv_init(&lfs_allclean_wakeup, "segment");
break;
case MODULE_CMD_FINI:
error = vfs_detach(&lfs_vfsops);
if (error != 0)
break;
syscall_disestablish(NULL, lfs_syscalls);
cv_destroy(&lfs_allclean_wakeup);
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/*
* XXX Same structure as FFS inodes? Should we share a common pool?
*/
struct pool lfs_inode_pool;
struct pool lfs_dinode_pool;
struct pool lfs_inoext_pool;
struct pool lfs_lbnentry_pool;
/*
* The writer daemon. UVM keeps track of how many dirty pages we are holding
* in lfs_subsys_pages; the daemon flushes the filesystem when this value
* crosses the (user-defined) threshold LFS_MAX_PAGES.
*/
static void
lfs_writerd(void *arg)
{
mount_iterator_t *iter;
struct mount *mp;
struct lfs *fs;
struct vfsops *vfs = NULL;
int fsflags;
int lfsc;
int wrote_something = 0;
mutex_enter(&lfs_lock);
KASSERTMSG(lfs_writer_daemon == NULL, "more than one LFS writer daemon");
lfs_writer_daemon = curlwp;
mutex_exit(&lfs_lock);
/* Take an extra reference to the LFS vfsops. */
vfs = vfs_getopsbyname(MOUNT_LFS);
mutex_enter(&lfs_lock);
for (;;) {
KASSERT(mutex_owned(&lfs_lock));
if (wrote_something == 0)
cv_timedwait(&lfs_writerd_cv, &lfs_lock, hz/10 + 1);
KASSERT(mutex_owned(&lfs_lock));
wrote_something = 0;
/*
* If global state wants a flush, flush everything.
*/
if (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS ||
locked_queue_bytes > LFS_MAX_BYTES ||
lfs_subsys_pages > LFS_MAX_PAGES) {
if (lfs_do_flush) {
DLOG((DLOG_FLUSH, "lfs_writerd: lfs_do_flush\n"));
}
if (locked_queue_count > LFS_MAX_BUFS) {
DLOG((DLOG_FLUSH, "lfs_writerd: lqc = %d, max %d\n",
locked_queue_count, LFS_MAX_BUFS));
}
if (locked_queue_bytes > LFS_MAX_BYTES) {
DLOG((DLOG_FLUSH, "lfs_writerd: lqb = %ld, max %ld\n",
locked_queue_bytes, LFS_MAX_BYTES));
}
if (lfs_subsys_pages > LFS_MAX_PAGES) {
DLOG((DLOG_FLUSH, "lfs_writerd: lssp = %d, max %d\n",
lfs_subsys_pages, LFS_MAX_PAGES));
}
lfs_flush(NULL, SEGM_WRITERD, 0);
lfs_do_flush = 0;
KASSERT(mutex_owned(&lfs_lock));
continue;
}
KASSERT(mutex_owned(&lfs_lock));
mutex_exit(&lfs_lock);
/*
* Look through the list of LFSs to see if any of them
* have requested pageouts.
*/
mountlist_iterator_init(&iter);
lfsc = 0;
while ((mp = mountlist_iterator_next(iter)) != NULL) {
KASSERT(!mutex_owned(&lfs_lock));
if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
sizeof(mp->mnt_stat.f_fstypename)) == 0) {
++lfsc;
fs = VFSTOULFS(mp)->um_lfs;
daddr_t ooffset = 0;
fsflags = SEGM_SINGLE;
mutex_enter(&lfs_lock);
ooffset = lfs_sb_getoffset(fs);
if (lfs_sb_getnextseg(fs) < lfs_sb_getcurseg(fs) && fs->lfs_nowrap) {
/* Don't try to write if we're suspended */
mutex_exit(&lfs_lock);
continue;
}
if (LFS_STARVED_FOR_SEGS(fs)) {
mutex_exit(&lfs_lock);
DLOG((DLOG_FLUSH, "lfs_writerd: need cleaning before writing possible\n"));
lfs_wakeup_cleaner(fs);
continue;
}
if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
lfs_dirvcount > LFS_MAX_DIROP) &&
fs->lfs_dirops == 0) {
fsflags &= ~SEGM_SINGLE;
fsflags |= SEGM_CKP;
DLOG((DLOG_FLUSH, "lfs_writerd: checkpoint\n"));
lfs_flush_fs(fs, fsflags);
} else if (fs->lfs_pdflush) {
DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n"));
lfs_flush_fs(fs, fsflags);
} else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) {
DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n"));
mutex_exit(&lfs_lock);
lfs_writer_enter(fs, "wrdirop");
lfs_flush_pchain(fs);
lfs_writer_leave(fs);
mutex_enter(&lfs_lock);
}
if (lfs_sb_getoffset(fs) != ooffset)
++wrote_something;
mutex_exit(&lfs_lock);
}
KASSERT(!mutex_owned(&lfs_lock));
}
if (lfsc == 0) {
mutex_enter(&lfs_lock);
lfs_writer_daemon = NULL;
mutex_exit(&lfs_lock);
mountlist_iterator_destroy(iter);
break;
}
mountlist_iterator_destroy(iter);
mutex_enter(&lfs_lock);
}
KASSERT(!mutex_owned(&lfs_lock));
/* Give up our extra reference so the module can be unloaded. */
mutex_enter(&vfs_list_lock);
if (vfs != NULL)
vfs->vfs_refcount--;
mutex_exit(&vfs_list_lock);
/* Done! */
kthread_exit(0);
}
/*
* Initialize the filesystem, most work done by ulfs_init.
*/
void
lfs_init(void)
{
/*
* XXX: should we use separate pools for 32-bit and 64-bit
* dinodes?
*/
malloc_type_attach(M_SEGMENT);
pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0,
"lfsinopl", &pool_allocator_nointr, IPL_NONE);
pool_init(&lfs_dinode_pool, sizeof(union lfs_dinode), 0, 0, 0,
"lfsdinopl", &pool_allocator_nointr, IPL_NONE);
pool_init(&lfs_inoext_pool, sizeof(struct lfs_inode_ext), 8, 0, 0,
"lfsinoextpl", &pool_allocator_nointr, IPL_NONE);
pool_init(&lfs_lbnentry_pool, sizeof(struct lbnentry), 0, 0, 0,
"lfslbnpool", &pool_allocator_nointr, IPL_NONE);
ulfs_init();
#ifdef DEBUG
memset(lfs_log, 0, sizeof(lfs_log));
#endif
mutex_init(&lfs_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&lfs_writerd_cv, "lfswrite");
cv_init(&locked_queue_cv, "lfsbuf");
cv_init(&lfs_writing_cv, "lfsflush");
}
void
lfs_reinit(void)
{
ulfs_reinit();
}
void
lfs_done(void)
{
ulfs_done();
mutex_destroy(&lfs_lock);
cv_destroy(&lfs_writerd_cv);
cv_destroy(&locked_queue_cv);
cv_destroy(&lfs_writing_cv);
pool_destroy(&lfs_inode_pool);
pool_destroy(&lfs_dinode_pool);
pool_destroy(&lfs_inoext_pool);
pool_destroy(&lfs_lbnentry_pool);
malloc_type_detach(M_SEGMENT);
}
/*
* Called by main() when ulfs is going to be mounted as root.
*/
int
lfs_mountroot(void)
{
extern struct vnode *rootvp;
struct lfs *fs = NULL; /* LFS */
struct mount *mp;
struct lwp *l = curlwp;
struct ulfsmount *ump;
int error;
if (device_class(root_device) != DV_DISK)
return (ENODEV);
if (rootdev == NODEV)
return (ENODEV);
if ((error = vfs_rootmountalloc(MOUNT_LFS, "root_device", &mp))) {
vrele(rootvp);
return (error);
}
if ((error = lfs_mountfs(rootvp, mp, l))) {
vfs_unbusy(mp);
vfs_rele(mp);
return (error);
}
mountlist_append(mp);
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
lfs_sb_setfsmnt(fs, mp->mnt_stat.f_mntonname);
(void)lfs_statvfs(mp, &mp->mnt_stat);
vfs_unbusy(mp);
setrootfstime((time_t)lfs_sb_gettstamp(VFSTOULFS(mp)->um_lfs));
return (0);
}
/*
* VFS Operations.
*
* mount system call
*/
int
lfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
struct vnode *devvp;
struct ulfs_args *args = data;
struct ulfsmount *ump = NULL;
struct lfs *fs = NULL; /* LFS */
int error = 0, update;
mode_t accessmode;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args)
return EINVAL;
if (mp->mnt_flag & MNT_GETARGS) {
ump = VFSTOULFS(mp);
if (ump == NULL)
return EIO;
args->fspec = NULL;
*data_len = sizeof *args;
return 0;
}
update = mp->mnt_flag & MNT_UPDATE;
/* Check arguments */
if (args->fspec != NULL) {
/*
* Look up the name and verify that it's sane.
*/
error = namei_simple_user(args->fspec,
NSM_FOLLOW_NOEMULROOT, &devvp);
if (error != 0)
return (error);
if (!update) {
/*
* Be sure this is a valid block device
*/
if (devvp->v_type != VBLK)
error = ENOTBLK;
else if (bdevsw_lookup(devvp->v_rdev) == NULL)
error = ENXIO;
} else {
/*
* Be sure we're still naming the same device
* used for our initial mount
*
* XXX dholland 20151010: if namei gives us a
* different vnode for the same device,
* wouldn't it be better to use it going
* forward rather than ignore it in favor of
* the old one?
*/
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
if (devvp != fs->lfs_devvp) {
if (devvp->v_rdev != fs->lfs_devvp->v_rdev)
error = EINVAL;
else {
vrele(devvp);
devvp = fs->lfs_devvp;
vref(devvp);
}
}
}
} else {
if (!update) {
/* New mounts must have a filename for the device */
return (EINVAL);
} else {
/* Use the extant mount */
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
devvp = fs->lfs_devvp;
vref(devvp);
}
}
/*
* If mount by non-root, then verify that user has necessary
* permissions on the device.
*/
if (error == 0) {
accessmode = VREAD;
if (update ?
(mp->mnt_iflag & IMNT_WANTRDWR) != 0 :
(mp->mnt_flag & MNT_RDONLY) == 0)
accessmode |= VWRITE;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
KAUTH_REQ_SYSTEM_MOUNT_DEVICE, mp, devvp,
KAUTH_ARG(accessmode));
VOP_UNLOCK(devvp);
}
if (error) {
vrele(devvp);
return (error);
}
if (!update) {
int flags;
if (mp->mnt_flag & MNT_RDONLY)
flags = FREAD;
else
flags = FREAD|FWRITE;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_OPEN(devvp, flags, FSCRED);
VOP_UNLOCK(devvp);
if (error)
goto fail;
error = lfs_mountfs(devvp, mp, l); /* LFS */
if (error) {
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
(void)VOP_CLOSE(devvp, flags, NOCRED);
VOP_UNLOCK(devvp);
goto fail;
}
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
} else {
/*
* Update the mount.
*/
/*
* The initial mount got a reference on this
* device, so drop the one obtained via
* namei(), above.
*/
vrele(devvp);
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
if (!fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDONLY)) {
/*
* Changing from read/write to read-only.
*/
int flags = WRITECLOSE;
if (mp->mnt_flag & MNT_FORCE)
flags |= FORCECLOSE;
error = lfs_flushfiles(mp, flags);
if (error)
return error;
fs->lfs_ronly = 1;
} else if (fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) {
/*
* Changing from read-only to read/write.
* Note in the superblocks that we're writing.
*/
/* XXX: quotas should have been on even if readonly */
if (fs->lfs_use_quota2) {
#ifdef LFS_QUOTA2
error = lfs_quota2_mount(mp);
#else
uprintf("%s: no kernel support for this "
"filesystem's quotas\n",
mp->mnt_stat.f_mntonname);
if (mp->mnt_flag & MNT_FORCE) {
uprintf("%s: mounting anyway; "
"fsck afterwards\n",
mp->mnt_stat.f_mntonname);
} else {
error = EINVAL;
}
#endif
if (error) {
return error;
}
}
fs->lfs_ronly = 0;
if (lfs_sb_getpflags(fs) & LFS_PF_CLEAN) {
lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) & ~LFS_PF_CLEAN);
lfs_writesuper(fs, lfs_sb_getsboff(fs, 0));
lfs_writesuper(fs, lfs_sb_getsboff(fs, 1));
}
}
if (args->fspec == NULL)
return 0;
}
error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
if (error == 0)
lfs_sb_setfsmnt(fs, mp->mnt_stat.f_mntonname);
return error;
fail:
vrele(devvp);
return (error);
}
/*
* Helper for mountfs. Note that the fs pointer may be a dummy one
* pointing into a superblock buffer. (Which is gross; see below.)
*/
static int
lfs_checkmagic(struct lfs *fs)
{
switch (fs->lfs_dlfs_u.u_32.dlfs_magic) {
case LFS_MAGIC:
fs->lfs_is64 = false;
fs->lfs_dobyteswap = false;
break;
case LFS64_MAGIC:
fs->lfs_is64 = true;
fs->lfs_dobyteswap = false;
break;
#ifdef LFS_EI
case LFS_MAGIC_SWAPPED:
fs->lfs_is64 = false;
fs->lfs_dobyteswap = true;
break;
case LFS64_MAGIC_SWAPPED:
fs->lfs_is64 = true;
fs->lfs_dobyteswap = true;
break;
#endif
default:
/* XXX needs translation */
return EINVAL;
}
return 0;
}
/*
* Common code for mount and mountroot
* LFS specific
*/
int
lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
{
struct lfs *primarysb, *altsb, *thesb;
struct buf *primarybuf, *altbuf;
struct lfs *fs;
struct ulfsmount *ump;
struct vnode *vp;
dev_t dev;
int error, i, ronly, fsbsize;
kauth_cred_t cred;
CLEANERINFO *cip;
SEGUSE *sup;
daddr_t sb_addr;
ino_t *orphan;
size_t norphan;
cred = l ? l->l_cred : NOCRED;
/* The superblock is supposed to be 512 bytes. */
__CTASSERT(sizeof(struct dlfs) == DEV_BSIZE);
/*
* Flush out any old buffers remaining from a previous use.
*/
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0);
VOP_UNLOCK(devvp);
if (error)
return (error);
ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
/* Don't free random space on error. */
primarybuf = NULL;
altbuf = NULL;
ump = NULL;
sb_addr = LFS_LABELPAD / DEV_BSIZE;
while (1) {
/*
* Read in the superblock.
*
* Note that because LFS_SBPAD is substantially larger
* (8K) than the actual on-disk superblock (512 bytes)
* the buffer contains enough space to be used as a
* whole struct lfs (in-memory superblock) - we do this
* only so we can set and use the is64 and dobyteswap
* members. XXX this is gross and the logic here should
* be reworked.
*/
error = bread(devvp, sb_addr, LFS_SBPAD, 0, &primarybuf);
if (error)
goto out;
primarysb = (struct lfs *)primarybuf->b_data;
/* Check the basics. */
error = lfs_checkmagic(primarysb);
if (error) {
DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock wrong magic\n"));
goto out;
}
if (lfs_sb_getbsize(primarysb) > MAXBSIZE ||
lfs_sb_getversion(primarysb) > LFS_VERSION ||
lfs_sb_getbsize(primarysb) < sizeof(struct dlfs)) {
DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock sanity failed\n"));
/* XXX needs translation */
error = EINVAL;
goto out;
}
if (lfs_sb_getinodefmt(primarysb) > LFS_MAXINODEFMT) {
DLOG((DLOG_MOUNT, "lfs_mountfs: unknown inode format %d\n",
lfs_sb_getinodefmt(primarysb)));
error = EINVAL;
goto out;
}
if (lfs_sb_getversion(primarysb) == 1)
fsbsize = DEV_BSIZE;
else {
fsbsize = 1 << lfs_sb_getffshift(primarysb);
/*
* Could be, if the frag size is large enough, that we
* don't have the "real" primary superblock. If that's
* the case, get the real one, and try again.
*/
if (sb_addr != (lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT))) {
DLOG((DLOG_MOUNT, "lfs_mountfs: sb daddr"
" 0x%llx is not right, trying 0x%llx\n",
(long long)sb_addr,
(long long)(lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT))));
sb_addr = lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT);
brelse(primarybuf, BC_INVAL);
continue;
}
}
break;
}
/*
* Check the second superblock to see which is newer; then mount
* using the older of the two. This is necessary to ensure that
* the filesystem is valid if it was not unmounted cleanly.
*/
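/*
 * Worked example (illustrative numbers, not taken from any on-disk
 * data): for a v2 filesystem, if the primary superblock carries
 * serial 42 and the alternate carries serial 40, the comparison below
 * selects the alternate (serial 40).  The newer serial may describe a
 * checkpoint whose data never completely reached the disk, so the
 * older superblock is the conservative choice.
 */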
if (lfs_sb_getsboff(primarysb, 1) &&
lfs_sb_getsboff(primarysb, 1) - LFS_LABELPAD / fsbsize > LFS_SBPAD / fsbsize)
{
error = bread(devvp, lfs_sb_getsboff(primarysb, 1) * (fsbsize / DEV_BSIZE),
LFS_SBPAD, 0, &altbuf);
if (error)
goto out;
altsb = (struct lfs *)altbuf->b_data;
/*
* Note: this used to do the sanity check only if the
* timestamp/serial comparison required use of altsb;
* this way is less tolerant, but if altsb is corrupted
* enough that the magic number, version, and blocksize
* are bogus, why would the timestamp or serial fields
* mean anything either? If this kind of thing happens,
* you need to fsck anyway.
*/
error = lfs_checkmagic(altsb);
if (error)
goto out;
/* Check the basics. */
if (lfs_sb_getbsize(altsb) > MAXBSIZE ||
lfs_sb_getversion(altsb) > LFS_VERSION ||
lfs_sb_getbsize(altsb) < sizeof(struct dlfs)) {
DLOG((DLOG_MOUNT, "lfs_mountfs: alt superblock"
" sanity failed\n"));
error = EINVAL; /* XXX needs translation */
goto out;
}
if (lfs_sb_getversion(primarysb) == 1) {
/* 1s resolution comparison */
if (lfs_sb_gettstamp(altsb) < lfs_sb_gettstamp(primarysb))
thesb = altsb;
else
thesb = primarysb;
} else {
/* monotonic infinite-resolution comparison */
if (lfs_sb_getserial(altsb) < lfs_sb_getserial(primarysb))
thesb = altsb;
else
thesb = primarysb;
}
} else {
DLOG((DLOG_MOUNT, "lfs_mountfs: invalid alt superblock location"
" daddr=0x%x\n", lfs_sb_getsboff(primarysb, 1)));
error = EINVAL;
goto out;
}
/*
* Allocate the mount structure, copy the superblock into it.
* Note that the 32-bit and 64-bit superblocks are the same size.
*/
fs = kmem_zalloc(sizeof(struct lfs), KM_SLEEP);
memcpy(&fs->lfs_dlfs_u.u_32, &thesb->lfs_dlfs_u.u_32,
sizeof(struct dlfs));
fs->lfs_is64 = thesb->lfs_is64;
fs->lfs_dobyteswap = thesb->lfs_dobyteswap;
fs->lfs_hasolddirfmt = false; /* set for real below */
/* Compatibility */
if (lfs_sb_getversion(fs) < 2) {
lfs_sb_setsumsize(fs, LFS_V1_SUMMARY_SIZE);
lfs_sb_setibsize(fs, lfs_sb_getbsize(fs));
lfs_sb_sets0addr(fs, lfs_sb_getsboff(fs, 0));
lfs_sb_settstamp(fs, lfs_sb_getotstamp(fs));
lfs_sb_setfsbtodb(fs, 0);
}
if (lfs_sb_getresvseg(fs) == 0)
lfs_sb_setresvseg(fs, MIN(lfs_sb_getminfreeseg(fs) - 1,
MAX(MIN_RESV_SEGS, lfs_sb_getminfreeseg(fs) / 2 + 1)));
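/*
 * Worked example (hypothetical numbers): with minfreeseg = 20 and
 * MIN_RESV_SEGS no larger than 11, the default above becomes
 * MIN(19, MAX(MIN_RESV_SEGS, 11)) = 11, i.e. roughly half of the
 * minfree segments are reserved, but never all of them.
 */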
/*
* If we aren't going to be able to write meaningfully to this
* filesystem, and were not mounted readonly, bomb out now.
*/
if (lfs_fsbtob(fs, LFS_NRESERVE(fs)) > LFS_MAX_BYTES && !ronly) {
DLOG((DLOG_MOUNT, "lfs_mount: to mount this filesystem read/write,"
" we need BUFPAGES >= %lld\n",
(long long)((bufmem_hiwater / bufmem_lowater) *
LFS_INVERSE_MAX_BYTES(
lfs_fsbtob(fs, LFS_NRESERVE(fs))) >> PAGE_SHIFT)));
kmem_free(fs, sizeof(struct lfs));
error = EFBIG; /* XXX needs translation */
goto out;
}
/* Before rolling forward, lock so vget will sleep for other procs */
if (l != NULL) {
fs->lfs_flags = LFS_NOTYET;
fs->lfs_rfpid = l->l_proc->p_pid;
}
ump = kmem_zalloc(sizeof(*ump), KM_SLEEP);
ump->um_lfs = fs;
ump->um_fstype = fs->lfs_is64 ? ULFS2 : ULFS1;
/* ump->um_cleaner_thread = NULL; */
brelse(primarybuf, BC_INVAL);
brelse(altbuf, BC_INVAL);
primarybuf = NULL;
altbuf = NULL;
/* Set up the I/O information */
fs->lfs_devbsize = DEV_BSIZE;
fs->lfs_iocount = 0;
fs->lfs_diropwait = 0;
fs->lfs_activesb = 0;
lfs_sb_setuinodes(fs, 0);
fs->lfs_ravail = 0;
fs->lfs_favail = 0;
fs->lfs_sbactive = 0;
/* Set up the ifile and lock aflags */
fs->lfs_doifile = 0;
fs->lfs_writer = 0;
fs->lfs_dirops = 0;
fs->lfs_nadirop = 0;
fs->lfs_seglock = 0;
fs->lfs_pdflush = 0;
fs->lfs_sleepers = 0;
fs->lfs_pages = 0;
rw_init(&fs->lfs_fraglock);
rw_init(&fs->lfs_iflock);
cv_init(&fs->lfs_sleeperscv, "lfs_slp");
cv_init(&fs->lfs_diropscv, "lfs_dirop");
cv_init(&fs->lfs_stopcv, "lfsstop");
cv_init(&fs->lfs_nextsegsleep, "segment");
/* Set the file system readonly/modify bits. */
fs->lfs_ronly = ronly;
if (ronly == 0)
fs->lfs_fmod = 1;
/* Device we're using */
dev = devvp->v_rdev;
fs->lfs_dev = dev;
fs->lfs_devvp = devvp;
/* ulfs-level information */
fs->um_flags = 0;
fs->um_bptrtodb = lfs_sb_getffshift(fs) - DEV_BSHIFT;
fs->um_seqinc = lfs_sb_getfrag(fs);
fs->um_nindir = lfs_sb_getnindir(fs);
fs->um_lognindir = ffs(lfs_sb_getnindir(fs)) - 1;
fs->um_maxsymlinklen = lfs_sb_getmaxsymlinklen(fs);
fs->um_dirblksiz = LFS_DIRBLKSIZ;
fs->um_maxfilesize = lfs_sb_getmaxfilesize(fs);
/* quota stuff */
/* XXX: these need to come from the on-disk superblock to be used */
fs->lfs_use_quota2 = 0;
fs->lfs_quota_magic = 0;
fs->lfs_quota_flags = 0;
fs->lfs_quotaino[0] = 0;
fs->lfs_quotaino[1] = 0;
/* Initialize the mount structure. */
mp->mnt_data = ump;
mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev;
mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_LFS);
mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
mp->mnt_stat.f_namemax = LFS_MAXNAMLEN;
mp->mnt_stat.f_iosize = lfs_sb_getbsize(fs);
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_iflag |= IMNT_SHRLOOKUP;
mp->mnt_fs_bshift = lfs_sb_getbshift(fs);
mp->mnt_iflag |= IMNT_CAN_RWTORO;
if (fs->um_maxsymlinklen > 0)
mp->mnt_iflag |= IMNT_DTYPE;
else
fs->lfs_hasolddirfmt = true;
ump->um_mountp = mp;
for (i = 0; i < ULFS_MAXQUOTAS; i++)
ump->um_quotas[i] = NULLVP;
spec_node_setmountedfs(devvp, mp);
/* Set up reserved memory for pageout */
lfs_setup_resblks(fs);
/* Set up vdirop tailq */
TAILQ_INIT(&fs->lfs_dchainhd);
/* and paging tailq */
TAILQ_INIT(&fs->lfs_pchainhd);
/* and delayed segment accounting for truncation list */
LIST_INIT(&fs->lfs_segdhd);
/*
* We use the ifile vnode for almost every operation. Instead of
* retrieving it from the hash table each time we retrieve it here,
* artificially increment the reference count and keep a pointer
* to it in the incore copy of the superblock.
*/
if ((error = VFS_VGET(mp, LFS_IFILE_INUM, LK_EXCLUSIVE, &vp)) != 0) {
DLOG((DLOG_MOUNT, "lfs_mountfs: ifile vget failed, error=%d\n", error));
goto out;
}
fs->lfs_ivnode = vp;
vref(vp);
/* Set up inode bitmap, order free list, and gather orphans. */
lfs_order_freelist(fs, &orphan, &norphan);
/* Set up segment usage flags for the autocleaner. */
fs->lfs_nactive = 0;
fs->lfs_suflags = malloc(2 * sizeof(u_int32_t *),
M_SEGMENT, M_WAITOK);
fs->lfs_suflags[0] = malloc(lfs_sb_getnseg(fs) * sizeof(u_int32_t),
M_SEGMENT, M_WAITOK);
fs->lfs_suflags[1] = malloc(lfs_sb_getnseg(fs) * sizeof(u_int32_t),
M_SEGMENT, M_WAITOK);
memset(fs->lfs_suflags[1], 0, lfs_sb_getnseg(fs) * sizeof(u_int32_t));
for (i = 0; i < lfs_sb_getnseg(fs); i++) {
int changed;
struct buf *bp;
LFS_SEGENTRY(sup, fs, i, bp);
changed = 0;
if (!ronly) {
if (sup->su_nbytes == 0 &&
!(sup->su_flags & SEGUSE_EMPTY)) {
sup->su_flags |= SEGUSE_EMPTY;
++changed;
} else if (sup->su_nbytes != 0 &&
(sup->su_flags & SEGUSE_EMPTY)) {
sup->su_flags &= ~SEGUSE_EMPTY;
++changed;
}
if (sup->su_flags & (SEGUSE_ACTIVE|SEGUSE_INVAL)) {
sup->su_flags &= ~(SEGUSE_ACTIVE|SEGUSE_INVAL);
++changed;
}
}
fs->lfs_suflags[0][i] = sup->su_flags;
if (changed)
LFS_WRITESEGENTRY(sup, fs, i, bp);
else
brelse(bp, 0);
}
/* Free the orphans we discovered while ordering the freelist. */
lfs_free_orphans(fs, orphan, norphan);
/*
* XXX: if the fs has quotas, quotas should be on even if
* readonly. Otherwise you can't query the quota info!
* However, that's not how the quota2 code got written and I
* don't know if it'll behave itself if enabled while
* readonly, so for now use the same enable logic as ffs.
*
* XXX: also, if you use the -f behavior allowed here (and
* equivalently above for remount) it will corrupt the fs. It
* ought not to allow that. It should allow mounting readonly
* if there are quotas and the kernel doesn't have the quota
* code, but only readonly.
*
* XXX: and if you use the -f behavior allowed here it will
* likely crash at unmount time (or remount time) because we
* think quotas are active.
*
* Although none of this applies until there's a way to set
* lfs_use_quota2 and have quotas in the fs at all.
*/
if (!ronly && fs->lfs_use_quota2) {
#ifdef LFS_QUOTA2
error = lfs_quota2_mount(mp);
#else
uprintf("%s: no kernel support for this filesystem's quotas\n",
mp->mnt_stat.f_mntonname);
if (mp->mnt_flag & MNT_FORCE) {
uprintf("%s: mounting anyway; fsck afterwards\n",
mp->mnt_stat.f_mntonname);
} else {
error = EINVAL;
}
#endif
if (error) {
/* XXX XXX must clean up the stuff immediately above */
printf("lfs_mountfs: sorry, leaking some memory\n");
goto out;
}
}
#ifdef LFS_KERNEL_RFW
lfs_roll_forward(fs, mp, l);
#endif
/* If writing, sb is not clean; record in case of immediate crash */
if (!fs->lfs_ronly) {
lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) & ~LFS_PF_CLEAN);
lfs_writesuper(fs, lfs_sb_getsboff(fs, 0));
lfs_writesuper(fs, lfs_sb_getsboff(fs, 1));
}
/* Allow vget now that roll-forward is complete */
fs->lfs_flags &= ~(LFS_NOTYET);
wakeup(&fs->lfs_flags);
/*
* Initialize the ifile cleaner info with information from
* the superblock.
*/
{
struct buf *bp;
LFS_CLEANERINFO(cip, fs, bp);
lfs_ci_setclean(fs, cip, lfs_sb_getnclean(fs));
lfs_ci_setdirty(fs, cip, lfs_sb_getnseg(fs) - lfs_sb_getnclean(fs));
lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs));
lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs));
(void) LFS_BWRITE_LOG(bp); /* Ifile */
}
/*
* Mark the current segment as ACTIVE, since we're going to
* be writing to it.
*/
{
struct buf *bp;
LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getoffset(fs)), bp);
sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
fs->lfs_nactive++;
LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getoffset(fs)), bp); /* Ifile */
}
/* Now that roll-forward is done, unlock the Ifile */
vput(vp);
/* Start the pagedaemon-anticipating daemon */
mutex_enter(&lfs_lock);
if (lfs_writer_daemon == NULL &&
kthread_create(PRI_BIO, 0, NULL,
lfs_writerd, NULL, NULL, "lfs_writer") != 0)
panic("fork lfs_writer");
mutex_exit(&lfs_lock);
printf("WARNING: the log-structured file system is experimental\n"
"WARNING: it may cause system crashes and/or corrupt data\n");
return (0);
out:
if (primarybuf)
brelse(primarybuf, BC_INVAL);
if (altbuf)
brelse(altbuf, BC_INVAL);
if (ump) {
kmem_free(ump->um_lfs, sizeof(struct lfs));
kmem_free(ump, sizeof(*ump));
mp->mnt_data = NULL;
}
return (error);
}
/*
* unmount system call
*/
int
lfs_unmount(struct mount *mp, int mntflags)
{
struct ulfsmount *ump;
struct lfs *fs;
int error, ronly;
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
error = lfs_flushfiles(mp, mntflags & MNT_FORCE ? FORCECLOSE : 0);
if (error)
return error;
/* Finish with the Ifile, now that we're done with it */
vgone(fs->lfs_ivnode);
ronly = !fs->lfs_ronly;
if (fs->lfs_devvp->v_type != VBAD)
spec_node_setmountedfs(fs->lfs_devvp, NULL);
vn_lock(fs->lfs_devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_CLOSE(fs->lfs_devvp,
ronly ? FREAD : FREAD|FWRITE, NOCRED);
vput(fs->lfs_devvp);
/* Complain about page leakage */
if (fs->lfs_pages > 0)
printf("lfs_unmount: still claim %d pages (%d in subsystem)\n",
fs->lfs_pages, lfs_subsys_pages);
/* Free per-mount data structures */
free(fs->lfs_ino_bitmap, M_SEGMENT);
free(fs->lfs_suflags[0], M_SEGMENT);
free(fs->lfs_suflags[1], M_SEGMENT);
free(fs->lfs_suflags, M_SEGMENT);
lfs_free_resblks(fs);
cv_destroy(&fs->lfs_sleeperscv);
cv_destroy(&fs->lfs_diropscv);
cv_destroy(&fs->lfs_stopcv);
cv_destroy(&fs->lfs_nextsegsleep);
rw_destroy(&fs->lfs_fraglock);
rw_destroy(&fs->lfs_iflock);
kmem_free(fs, sizeof(struct lfs));
kmem_free(ump, sizeof(*ump));
mp->mnt_data = NULL;
mp->mnt_flag &= ~MNT_LOCAL;
return (error);
}
static int
lfs_flushfiles(struct mount *mp, int flags)
{
struct lwp *l = curlwp;
struct ulfsmount *ump;
struct lfs *fs;
struct vnode *vp;
int error;
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
/* Two checkpoints */
if (!fs->lfs_ronly) {
lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
}
/* wake up the cleaner so it can die */
/* XXX: shouldn't this be *after* the error cases below? */
lfs_wakeup_cleaner(fs);
mutex_enter(&lfs_lock);
while (fs->lfs_sleepers)
cv_wait(&fs->lfs_sleeperscv, &lfs_lock);
mutex_exit(&lfs_lock);
#ifdef LFS_EXTATTR
if (ump->um_fstype == ULFS1) {
if (ump->um_extattr.uepm_flags & ULFS_EXTATTR_UEPM_STARTED) {
ulfs_extattr_stop(mp, curlwp);
}
if (ump->um_extattr.uepm_flags & ULFS_EXTATTR_UEPM_INITIALIZED) {
ulfs_extattr_uepm_destroy(&ump->um_extattr);
mp->mnt_flag &= ~MNT_EXTATTR;
}
}
#endif
#ifdef LFS_QUOTA
if ((error = lfsquota1_umount(mp, flags)) != 0)
return (error);
#endif
#ifdef LFS_QUOTA2
if ((error = lfsquota2_umount(mp, flags)) != 0)
return (error);
#endif
if ((error = vflush(mp, fs->lfs_ivnode, flags)) != 0)
return (error);
if ((error = VFS_SYNC(mp, 1, l->l_cred)) != 0)
return (error);
vp = fs->lfs_ivnode;
mutex_enter(vp->v_interlock);
if (LIST_FIRST(&vp->v_dirtyblkhd))
panic("lfs_unmount: still dirty blocks on ifile vnode");
mutex_exit(vp->v_interlock);
/* Explicitly write the superblock, to update serial and pflags */
if (!fs->lfs_ronly) {
lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) | LFS_PF_CLEAN);
lfs_writesuper(fs, lfs_sb_getsboff(fs, 0));
lfs_writesuper(fs, lfs_sb_getsboff(fs, 1));
}
mutex_enter(&lfs_lock);
while (fs->lfs_iocount)
mtsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs_umount", 0,
&lfs_lock);
mutex_exit(&lfs_lock);
return 0;
}
/*
* Get file system statistics.
*
* NB: We don't lock to access the superblock here, because it's not
* really that important if we get it wrong.
*/
int
lfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
struct lfs *fs;
struct ulfsmount *ump;
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
sbp->f_bsize = lfs_sb_getbsize(fs);
sbp->f_frsize = lfs_sb_getfsize(fs);
sbp->f_iosize = lfs_sb_getbsize(fs);
sbp->f_blocks = LFS_EST_NONMETA(fs) - VTOI(fs->lfs_ivnode)->i_lfs_effnblks;
sbp->f_bfree = LFS_EST_BFREE(fs);
/*
* XXX this should be lfs_sb_getsize (measured in frags)
* rather than dsize (measured in diskblocks). However,
* getsize needs a format version check (for version 1 it
* needs to be blockstofrags'd) so for the moment I'm going to
* leave this... it won't fire wrongly as frags are at least
* as big as diskblocks.
*/
KASSERT(sbp->f_bfree <= lfs_sb_getdsize(fs));
#if 0
if (sbp->f_bfree < 0)
sbp->f_bfree = 0;
#endif
sbp->f_bresvd = LFS_EST_RSVD(fs);
if (sbp->f_bfree > sbp->f_bresvd)
sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd;
else
sbp->f_bavail = 0;
/* XXX: huh? - dholland 20150728 */
sbp->f_files = lfs_sb_getbfree(fs) / lfs_btofsb(fs, lfs_sb_getibsize(fs))
* LFS_INOPB(fs);
sbp->f_ffree = sbp->f_files - lfs_sb_getnfiles(fs);
sbp->f_favail = sbp->f_ffree;
sbp->f_fresvd = 0;
copy_statvfs_info(sbp, mp);
return (0);
}
/*
* Go through the disk queues to initiate sandbagged IO;
* go through the inodes to write those that have been modified;
* initiate the writing of the super block if it has been modified.
*
* Note: we are always called with the filesystem marked `MPBUSY'.
*/
int
lfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
{
int error;
struct lfs *fs;
fs = VFSTOULFS(mp)->um_lfs;
if (fs->lfs_ronly)
return 0;
/* Snapshots should not hose the syncer */
/*
* XXX Sync can block here anyway, since we don't have a very
* XXX good idea of how much data is pending. If it's more
* XXX than a segment and lfs_nextseg is close to the end of
* XXX the log, we'll likely block.
*/
mutex_enter(&lfs_lock);
if (fs->lfs_nowrap && lfs_sb_getnextseg(fs) < lfs_sb_getcurseg(fs)) {
mutex_exit(&lfs_lock);
return 0;
}
mutex_exit(&lfs_lock);
lfs_writer_enter(fs, "lfs_dirops");
/* All syncs must be checkpoints until roll-forward is implemented. */
DLOG((DLOG_FLUSH, "lfs_sync at 0x%jx\n",
(uintmax_t)lfs_sb_getoffset(fs)));
error = lfs_segwrite(mp, SEGM_CKP | (waitfor ? SEGM_SYNC : 0));
lfs_writer_leave(fs);
#ifdef LFS_QUOTA
lfs_qsync(mp);
#endif
return (error);
}
/*
* Look up an LFS dinode number to find its incore vnode. If not already
* in core, read it in from the specified device. Return the inode locked.
* Detection and handling of mount points must be done by the calling routine.
*/
int
lfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp)
{
int error;
error = vcache_get(mp, &ino, sizeof(ino), vpp);
if (error)
return error;
error = vn_lock(*vpp, lktype);
if (error) {
vrele(*vpp);
*vpp = NULL;
return error;
}
return 0;
}
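/*
 * Hypothetical usage sketch (not referenced anywhere in this file):
 * how a caller might fetch an inode by number through lfs_vget() and
 * release it again.  The inode number and the shared lock request are
 * arbitrary choices for illustration.
 */
#if 0
static int
example_vget_ino(struct mount *mp, ino_t ino)
{
struct vnode *vp;
int error;

error = lfs_vget(mp, ino, LK_SHARED, &vp);
if (error)
return error;
/* ... inspect VTOI(vp) here ... */
vput(vp); /* drops both the vnode lock and the reference */
return 0;
}
#endif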
/*
* Create a new vnode/inode pair and initialize what fields we can.
*/
static void
lfs_init_vnode(struct ulfsmount *ump, ino_t ino, struct vnode *vp)
{
struct lfs *fs = ump->um_lfs;
struct inode *ip;
union lfs_dinode *dp;
ASSERT_NO_SEGLOCK(fs);
/* Initialize the inode. */
ip = pool_get(&lfs_inode_pool, PR_WAITOK);
memset(ip, 0, sizeof(*ip));
dp = pool_get(&lfs_dinode_pool, PR_WAITOK);
memset(dp, 0, sizeof(*dp));
ip->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK);
memset(ip->inode_ext.lfs, 0, sizeof(*ip->inode_ext.lfs));
ip->i_din = dp;
ip->i_ump = ump;
ip->i_vnode = vp;
ip->i_dev = fs->lfs_dev;
lfs_dino_setinumber(fs, dp, ino);
ip->i_number = ino;
ip->i_lfs = fs;
ip->i_lfs_effnblks = 0;
SPLAY_INIT(&ip->i_lfs_lbtree);
ip->i_lfs_nbtree = 0;
LIST_INIT(&ip->i_lfs_segdhd);
vp->v_tag = VT_LFS;
vp->v_op = lfs_vnodeop_p;
vp->v_data = ip;
}
/*
* Undo lfs_init_vnode().
*/
static void
lfs_deinit_vnode(struct ulfsmount *ump, struct vnode *vp)
{
struct inode *ip = VTOI(vp);
pool_put(&lfs_inoext_pool, ip->inode_ext.lfs);
pool_put(&lfs_dinode_pool, ip->i_din);
pool_put(&lfs_inode_pool, ip);
vp->v_data = NULL;
}
/*
* Read an inode from disk and initialize this vnode / inode pair.
* Caller assures no other thread will try to load this inode.
*/
int
lfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
struct lfs *fs;
union lfs_dinode *dip;
struct inode *ip;
struct buf *bp;
IFILE *ifp;
struct ulfsmount *ump;
ino_t ino;
daddr_t daddr;
int error, retries;
struct timespec ts;
KASSERT(key_len == sizeof(ino));
memcpy(&ino, key, key_len);
memset(&ts, 0, sizeof ts); /* XXX gcc */
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
/*
* If the filesystem is not completely mounted yet, suspend
* any access requests (wait for roll-forward to complete).
*/
mutex_enter(&lfs_lock);
while ((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid != fs->lfs_rfpid)
mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_notyet", 0,
&lfs_lock);
mutex_exit(&lfs_lock);
/* Translate the inode number to a disk address. */
if (ino == LFS_IFILE_INUM)
daddr = lfs_sb_getidaddr(fs);
else {
/* XXX bounds-check this too */
LFS_IENTRY(ifp, fs, ino, bp);
daddr = lfs_if_getdaddr(fs, ifp);
if (lfs_sb_getversion(fs) > 1) {
ts.tv_sec = lfs_if_getatime_sec(fs, ifp);
ts.tv_nsec = lfs_if_getatime_nsec(fs, ifp);
}
brelse(bp, 0);
if (daddr == LFS_UNUSED_DADDR)
return (ENOENT);
}
/* Allocate/init new vnode/inode. */
lfs_init_vnode(ump, ino, vp);
ip = VTOI(vp);
/* If the cleaner supplied the inode, use it. */
if (curlwp == fs->lfs_cleaner_thread && fs->lfs_cleaner_hint != NULL &&
fs->lfs_cleaner_hint->bi_lbn == LFS_UNUSED_LBN) {
dip = fs->lfs_cleaner_hint->bi_bp;
if (fs->lfs_is64) {
error = copyin(dip, &ip->i_din->u_64,
sizeof(struct lfs64_dinode));
} else {
error = copyin(dip, &ip->i_din->u_32,
sizeof(struct lfs32_dinode));
}
if (error) {
lfs_deinit_vnode(ump, vp);
return error;
}
KASSERT(ip->i_number == ino);
goto out;
}
/* Read in the disk contents for the inode, copy into the inode. */
retries = 0;
again:
error = bread(fs->lfs_devvp, LFS_FSBTODB(fs, daddr),
(lfs_sb_getversion(fs) == 1 ? lfs_sb_getbsize(fs) : lfs_sb_getibsize(fs)),
0, &bp);
if (error) {
lfs_deinit_vnode(ump, vp);
return error;
}
dip = lfs_ifind(fs, ino, bp);
if (dip == NULL) {
/* Assume write has not completed yet; try again */
brelse(bp, BC_INVAL);
++retries;
if (retries <= LFS_IFIND_RETRIES) {
mutex_enter(&lfs_lock);
if (fs->lfs_iocount) {
DLOG((DLOG_VNODE,
"%s: dinode %d not found, retrying...\n",
__func__, ino));
(void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
"lfs ifind", 1, &lfs_lock);
} else
retries = LFS_IFIND_RETRIES;
mutex_exit(&lfs_lock);
goto again;
}
#ifdef DEBUG
/* If the seglock is held, look at the bpp to see
what is there anyway. */
mutex_enter(&lfs_lock);
if (fs->lfs_seglock > 0) {
struct buf **bpp;
union lfs_dinode *dp;
int i;
for (bpp = fs->lfs_sp->bpp;
bpp != fs->lfs_sp->cbpp; ++bpp) {
if ((*bpp)->b_vp == fs->lfs_ivnode &&
bpp != fs->lfs_sp->bpp) {
/* Inode block */
printf("%s: block 0x%" PRIx64 ": ",
__func__, (*bpp)->b_blkno);
for (i = 0; i < LFS_INOPB(fs); i++) {
dp = DINO_IN_BLOCK(fs,
(*bpp)->b_data, i);
if (lfs_dino_getinumber(fs, dp))
printf("%ju ",
(uintmax_t)lfs_dino_getinumber(fs, dp));
}
printf("\n");
}
}
}
mutex_exit(&lfs_lock);
#endif /* DEBUG */
panic("lfs_loadvnode: dinode not found");
}
lfs_copy_dinode(fs, ip->i_din, dip);
brelse(bp, 0);
out:
if (lfs_sb_getversion(fs) > 1) {
lfs_dino_setatime(fs, ip->i_din, ts.tv_sec);
lfs_dino_setatimensec(fs, ip->i_din, ts.tv_nsec);
}
lfs_vinit(mp, &vp);
*new_key = &ip->i_number;
return 0;
}
/*
* Create a new inode and initialize this vnode / inode pair.
*/
int
lfs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp,
struct vattr *vap, kauth_cred_t cred, void *extra,
size_t *key_len, const void **new_key)
{
ino_t ino;
struct inode *ip;
struct ulfsmount *ump;
struct lfs *fs;
int error, mode, gen;
KASSERT(dvp != NULL || vap->va_fileid > 0);
KASSERT(dvp != NULL && dvp->v_mount == mp);
KASSERT(vap->va_type != VNON);
*key_len = sizeof(ino);
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
mode = MAKEIMODE(vap->va_type, vap->va_mode);
/*
* Allocate fresh inode. With "dvp == NULL" take the inode number
* and version from "vap".
*/
if (dvp == NULL) {
ino = vap->va_fileid;
gen = vap->va_gen;
error = lfs_valloc_fixed(fs, ino, gen);
} else {
error = lfs_valloc(dvp, mode, cred, &ino, &gen);
}
if (error)
return error;
/* Attach inode to vnode. */
lfs_init_vnode(ump, ino, vp);
ip = VTOI(vp);
mutex_enter(&lfs_lock);
LFS_SET_UINO(ip, IN_CHANGE);
mutex_exit(&lfs_lock);
/* Note no blocks yet */
ip->i_lfs_hiblk = -1;
/* Set a new generation number for this inode. */
ip->i_gen = gen;
lfs_dino_setgen(fs, ip->i_din, gen);
memset(ip->i_lfs_fragsize, 0,
ULFS_NDADDR * sizeof(*ip->i_lfs_fragsize));
/* Set uid / gid. */
if (cred == NOCRED || cred == FSCRED) {
ip->i_gid = 0;
ip->i_uid = 0;
} else {
ip->i_gid = VTOI(dvp)->i_gid;
ip->i_uid = kauth_cred_geteuid(cred);
}
DIP_ASSIGN(ip, gid, ip->i_gid);
DIP_ASSIGN(ip, uid, ip->i_uid);
#if defined(LFS_QUOTA) || defined(LFS_QUOTA2)
error = lfs_chkiq(ip, 1, cred, 0);
if (error) {
lfs_vfree(dvp, ino, mode);
lfs_deinit_vnode(ump, vp);
return error;
}
#endif
/* Set type and finalize. */
ip->i_flags = 0;
DIP_ASSIGN(ip, flags, 0);
ip->i_mode = mode;
DIP_ASSIGN(ip, mode, mode);
if (vap->va_rdev != VNOVAL) {
/*
* Want to be able to use this to make badblock
* inodes, so don't truncate the dev number.
*/
// XXX clean this up
if (ump->um_fstype == ULFS1)
ip->i_din->u_32.di_rdev = ulfs_rw32(vap->va_rdev,
ULFS_MPNEEDSWAP(fs));
else
ip->i_din->u_64.di_rdev = ulfs_rw64(vap->va_rdev,
ULFS_MPNEEDSWAP(fs));
}
lfs_vinit(mp, &vp);
*new_key = &ip->i_number;
return 0;
}
/*
* File handle to vnode
*/
int
lfs_fhtovp(struct mount *mp, struct fid *fhp, int lktype, struct vnode **vpp)
{
struct lfid lfh;
struct lfs *fs;
if (fhp->fid_len != sizeof(struct lfid))
return EINVAL;
memcpy(&lfh, fhp, sizeof(lfh));
if (lfh.lfid_ino < LFS_IFILE_INUM)
return ESTALE;
fs = VFSTOULFS(mp)->um_lfs;
if (lfh.lfid_ident != lfs_sb_getident(fs))
return ESTALE;
if (lfh.lfid_ino >
((lfs_dino_getsize(fs, VTOI(fs->lfs_ivnode)->i_din) >> lfs_sb_getbshift(fs)) -
lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs))
return ESTALE;
return (ulfs_fhtovp(mp, &lfh.lfid_ufid, lktype, vpp));
}
/*
* Vnode pointer to File handle
*/
/* ARGSUSED */
int
lfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
{
struct inode *ip;
struct lfid lfh;
if (*fh_size < sizeof(struct lfid)) {
*fh_size = sizeof(struct lfid);
return E2BIG;
}
*fh_size = sizeof(struct lfid);
ip = VTOI(vp);
memset(&lfh, 0, sizeof(lfh));
lfh.lfid_len = sizeof(struct lfid);
lfh.lfid_ino = ip->i_number;
lfh.lfid_gen = ip->i_gen;
lfh.lfid_ident = lfs_sb_getident(ip->i_lfs);
memcpy(fhp, &lfh, sizeof(lfh));
return (0);
}
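/*
 * Hypothetical round-trip sketch (not used by this module): a handle
 * produced by lfs_vptofh() can later be resolved back to a locked,
 * referenced vnode with lfs_fhtovp(), which is how NFS export support
 * would typically use these two routines.  The local buffer and the
 * cast are illustrative only.
 */
#if 0
static int
example_fh_roundtrip(struct mount *mp, struct vnode *vp)
{
struct lfid lfh;
size_t fh_size = sizeof(lfh);
struct vnode *nvp;
int error;

error = lfs_vptofh(vp, (struct fid *)&lfh, &fh_size);
if (error)
return error;
error = lfs_fhtovp(mp, (struct fid *)&lfh, LK_SHARED, &nvp);
if (error)
return error;
vput(nvp);
return 0;
}
#endif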
/*
* ulfs_bmaparray callback function for writing.
*
* Since blocks will be written to the new segment anyway,
* we don't care about current daddr of them.
*/
static bool
lfs_issequential_hole(const struct lfs *fs,
daddr_t daddr0, daddr_t daddr1)
{
(void)fs; /* only used by KASSERT, which may be compiled out */
KASSERT(daddr0 == UNWRITTEN ||
(0 <= daddr0 && daddr0 <= LFS_MAX_DADDR(fs)));
KASSERT(daddr1 == UNWRITTEN ||
(0 <= daddr1 && daddr1 <= LFS_MAX_DADDR(fs)));
/* NOTE: all we want to know here is 'hole or not'. */
/* NOTE: UNASSIGNED is converted to 0 by ulfs_bmaparray. */
/*
* treat UNWRITTENs and all resident blocks as 'contiguous'
*/
if (daddr0 != 0 && daddr1 != 0)
return true;
/*
* both are in hole?
*/
if (daddr0 == 0 && daddr1 == 0)
return true; /* all holes are 'contiguous' for us. */
return false;
}
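/*
 * Illustrative summary of the callback above (example daddr values):
 *
 *	daddr0		daddr1		result
 *	5		6		true  (both resident)
 *	UNWRITTEN	7		true  (UNWRITTEN counts as resident)
 *	0		0		true  (both in a hole)
 *	5		0		false (resident block next to a hole)
 */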
/*
* lfs_gop_write functions exactly like genfs_gop_write, except that
* (1) it requires the seglock to be held by its caller, and sp->fip
* to be properly initialized (it will return without re-initializing
* sp->fip, and without calling lfs_writeseg).
* (2) it uses the remaining space in the segment, rather than VOP_BMAP,
* to determine how large a block it can write at once (though it does
* still use VOP_BMAP to find holes in the file);
* (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks
* (leaving lfs_writeseg to deal with the cluster blocks, so we might
* now have clusters of clusters, ick.)
*/
static int
lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
int flags)
{
int i, error, run, haveeof = 0;
int fs_bshift;
vaddr_t kva;
off_t eof, offset, startoffset = 0;
size_t bytes, iobytes, skipbytes;
bool async = (flags & PGO_SYNCIO) == 0;
daddr_t lbn, blkno;
struct vm_page *pg;
struct buf *mbp, *bp;
struct vnode *devvp = VTOI(vp)->i_devvp;
struct inode *ip = VTOI(vp);
struct lfs *fs = ip->i_lfs;
struct segment *sp = fs->lfs_sp;
SEGSUM *ssp;
UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist);
const char * failreason = NULL;
ASSERT_SEGLOCK(fs);
/* The Ifile lives in the buffer cache */
KASSERT(vp != fs->lfs_ivnode);
/*
* We don't want to fill the disk before the cleaner has a chance
* to make room for us. If we're in danger of doing that, fail
* with EAGAIN. The caller will have to notice this, unlock
* so the cleaner can run, relock and try again.
*
* We must write everything, however, if our vnode is being
* reclaimed.
*/
mutex_enter(vp->v_interlock);
if (LFS_STARVED_FOR_SEGS(fs) && vdead_check(vp, VDEAD_NOWAIT) == 0) {
mutex_exit(vp->v_interlock);
failreason = "Starved for segs and not flushing vp";
goto tryagain;
}
mutex_exit(vp->v_interlock);
/*
* Sometimes things slip past the filters in lfs_putpages,
* and the pagedaemon tries to write pages---problem is
* that the pagedaemon never acquires the segment lock.
*
* Alternatively, pages that were clean when we called
* genfs_putpages may have become dirty in the meantime. In this
* case the segment header is not properly set up for blocks
* to be added to it.
*
* Unbusy and unclean the pages, and put them on the ACTIVE
* queue under the hypothesis that they couldn't have got here
* unless they were modified *quite* recently.
*
* XXXUBC that last statement is an oversimplification of course.
*/
if (!LFS_SEGLOCK_HELD(fs)) {
failreason = "Seglock not held";
goto tryagain;
}
if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) {
failreason = "Inode with no_gop_write";
goto tryagain;
}
if ((pgs[0]->offset & lfs_sb_getbmask(fs)) != 0) {
failreason = "Bad page offset";
goto tryagain;
}
UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
(uintptr_t)vp, (uintptr_t)pgs, npages, flags);
GOP_SIZE(vp, vp->v_size, &eof, 0);
haveeof = 1;
if (vp->v_type == VREG)
fs_bshift = vp->v_mount->mnt_fs_bshift;
else
fs_bshift = DEV_BSHIFT;
error = 0;
pg = pgs[0];
startoffset = pg->offset;
KASSERT(eof >= 0);
if (startoffset >= eof) {
failreason = "Offset beyond EOF";
goto tryagain;
} else
bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
skipbytes = 0;
KASSERT(bytes != 0);
/* Swap PG_DELWRI for PG_PAGEOUT */
for (i = 0; i < npages; i++) {
if (pgs[i]->flags & PG_DELWRI) {
KASSERT(!(pgs[i]->flags & PG_PAGEOUT));
pgs[i]->flags &= ~PG_DELWRI;
pgs[i]->flags |= PG_PAGEOUT;
uvm_pageout_start(1);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
uvm_pagelock(pgs[i]);
uvm_pageunwire(pgs[i]);
uvm_pageunlock(pgs[i]);
rw_exit(vp->v_uobj.vmobjlock);
}
}
/*
* Check to make sure we're starting on a block boundary.
* We'll check later to make sure we always write entire
* blocks (or fragments).
*/
if (startoffset & lfs_sb_getbmask(fs))
printf("%" PRId64 " & %" PRIu64 " = %" PRId64 "\n",
startoffset, lfs_sb_getbmask(fs),
startoffset & lfs_sb_getbmask(fs));
KASSERT((startoffset & lfs_sb_getbmask(fs)) == 0);
if (bytes & lfs_sb_getffmask(fs)) {
printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes);
panic("lfs_gop_write: non-integer blocks");
}
/*
* We could deadlock here on pager_map with UVMPAGER_MAPIN_WAITOK.
* If we would, write what we have and try again. If we don't
* have anything to write, we'll have to sleep.
*/
ssp = (SEGSUM *)sp->segsum;
if ((kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
(lfs_ss_getnfinfo(fs, ssp) < 1 ?
UVMPAGER_MAPIN_WAITOK : 0))) == 0x0) {
DLOG((DLOG_PAGE, "lfs_gop_write: forcing write\n"));
#if 0
" with nfinfo=%d at offset 0x%jx\n",
(int)lfs_ss_getnfinfo(fs, ssp),
(uintmax_t)lfs_sb_getoffset(fs)));
#endif
lfs_updatemeta(sp);
lfs_release_finfo(fs);
(void) lfs_writeseg(fs, sp);
lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
/*
* Having given up all of the pager_map we were holding,
* we can now wait for aiodoned to reclaim it for us
* without fear of deadlock.
*/
kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
UVMPAGER_MAPIN_WAITOK);
}
mbp = getiobuf(NULL, true);
UVMHIST_LOG(ubchist, "vp %#jx mbp %#jx num now %jd bytes 0x%jx",
(uintptr_t)vp, (uintptr_t)mbp, vp->v_numoutput, bytes);
mbp->b_bufsize = npages << PAGE_SHIFT;
mbp->b_data = (void *)kva;
mbp->b_resid = mbp->b_bcount = bytes;
mbp->b_cflags |= BC_BUSY|BC_AGE;
mbp->b_iodone = uvm_aio_aiodone;
bp = NULL;
for (offset = startoffset;
bytes > 0;
offset += iobytes, bytes -= iobytes) {
lbn = offset >> fs_bshift;
error = ulfs_bmaparray(vp, lbn, &blkno, NULL, NULL, &run,
lfs_issequential_hole);
if (error) {
UVMHIST_LOG(ubchist, "ulfs_bmaparray() -> %jd",
error,0,0,0);
skipbytes += bytes;
bytes = 0;
break;
}
iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
bytes);
if (blkno == (daddr_t)-1) {
skipbytes += iobytes;
continue;
}
/*
* Discover how much we can really pack into this buffer.
*/
/* If no room in the current segment, finish it up */
if (sp->sum_bytes_left < sizeof(int32_t) ||
sp->seg_bytes_left < (1 << lfs_sb_getbshift(fs))) {
int vers;
lfs_updatemeta(sp);
vers = lfs_fi_getversion(fs, sp->fip);
lfs_release_finfo(fs);
(void) lfs_writeseg(fs, sp);
lfs_acquire_finfo(fs, ip->i_number, vers);
}
/* Check both for space in segment and space in segsum */
iobytes = MIN(iobytes, (sp->seg_bytes_left >> fs_bshift)
<< fs_bshift);
iobytes = MIN(iobytes, (sp->sum_bytes_left / sizeof(int32_t))
<< fs_bshift);
KASSERT(iobytes > 0);
/* if it's really one i/o, don't make a second buf */
if (offset == startoffset && iobytes == bytes) {
bp = mbp;
/*
* All the LFS output is done by the segwriter. It
* will increment numoutput by one for all the bufs it
* receives. However this buffer needs one extra to
* account for aiodone.
*/
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
} else {
bp = getiobuf(NULL, true);
UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
(uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes);
/*
* LFS doesn't like async I/O here, dies with
* an assert in lfs_bwrite(). Is that assert
* valid? I retained non-async behaviour when I
* converted this to use nestiobuf --pooka
*/
bp->b_flags &= ~B_ASYNC;
}
/* XXX This is silly ... is this necessary? */
mutex_enter(&bufcache_lock);
mutex_enter(vp->v_interlock);
bgetvp(vp, bp);
mutex_exit(vp->v_interlock);
mutex_exit(&bufcache_lock);
bp->b_lblkno = lfs_lblkno(fs, offset);
bp->b_private = mbp;
if (devvp->v_type == VBLK) {
bp->b_dev = devvp->v_rdev;
}
VOP_BWRITE(bp->b_vp, bp);
while (lfs_gatherblock(sp, bp, NULL))
continue;
}
nestiobuf_done(mbp, skipbytes, error);
if (skipbytes) {
UVMHIST_LOG(ubchist, "skipbytes %jd", skipbytes, 0,0,0);
}
UVMHIST_LOG(ubchist, "returning 0", 0,0,0,0);
if (!async) {
/* Start a segment write. */
UVMHIST_LOG(ubchist, "flushing", 0,0,0,0);
mutex_enter(&lfs_lock);
lfs_flush(fs, 0, 1);
mutex_exit(&lfs_lock);
}
if ((sp->seg_flags & SEGM_SINGLE) && lfs_sb_getcurseg(fs) != fs->lfs_startseg)
return EAGAIN;
return (0);
tryagain:
/*
* We can't write the pages, for whatever reason.
* Clean up after ourselves, and make the caller try again.
*/
mutex_enter(vp->v_interlock);
/* Tell why we're here, if we know */
if (failreason != NULL) {
DLOG((DLOG_PAGE, "lfs_gop_write: %s\n", failreason));
}
if (haveeof && startoffset >= eof) {
DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
" eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
pgs[0]->offset, eof, npages));
}
for (i = 0; i < npages; i++) {
pg = pgs[i];
if (pg->flags & PG_PAGEOUT)
uvm_pageout_done(1);
uvm_pagelock(pg);
if (pg->flags & PG_DELWRI) {
uvm_pageunwire(pg);
}
uvm_pageactivate(pg);
uvm_pageunlock(pg);
pg->flags &= ~(PG_DELWRI|PG_PAGEOUT|PG_RELEASED);
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
DLOG((DLOG_PAGE, "pg[%d] = %p (vp %p off %" PRIx64 ")\n", i, pg,
vp, pg->offset));
DLOG((DLOG_PAGE, "pg[%d]->flags = %x\n", i, pg->flags));
DLOG((DLOG_PAGE, "pg[%d]->pqflags = %x\n", i, pg->pqflags));
DLOG((DLOG_PAGE, "pg[%d]->uanon = %p\n", i, pg->uanon));
DLOG((DLOG_PAGE, "pg[%d]->uobject = %p\n", i, pg->uobject));
DLOG((DLOG_PAGE, "pg[%d]->wire_count = %d\n", i,
pg->wire_count));
DLOG((DLOG_PAGE, "pg[%d]->loan_count = %d\n", i,
pg->loan_count));
}
uvm_page_unbusy(pgs, npages);
mutex_exit(vp->v_interlock);
return EAGAIN;
}
/*
* finish vnode/inode initialization.
* used by lfs_vget.
*/
void
lfs_vinit(struct mount *mp, struct vnode **vpp)
{
struct vnode *vp = *vpp;
struct inode *ip = VTOI(vp);
struct ulfsmount *ump = VFSTOULFS(mp);
struct lfs *fs = ump->um_lfs;
int i;
ip->i_mode = lfs_dino_getmode(fs, ip->i_din);
ip->i_nlink = lfs_dino_getnlink(fs, ip->i_din);
ip->i_lfs_osize = ip->i_size = lfs_dino_getsize(fs, ip->i_din);
ip->i_flags = lfs_dino_getflags(fs, ip->i_din);
ip->i_gen = lfs_dino_getgen(fs, ip->i_din);
ip->i_uid = lfs_dino_getuid(fs, ip->i_din);
ip->i_gid = lfs_dino_getgid(fs, ip->i_din);
ip->i_lfs_effnblks = lfs_dino_getblocks(fs, ip->i_din);
ip->i_lfs_odnlink = lfs_dino_getnlink(fs, ip->i_din);
/*
* Initialize the vnode from the inode, check for aliases. In all
* cases re-init ip, the underlying vnode/inode may have changed.
*/
ulfs_vinit(mp, lfs_specop_p, lfs_fifoop_p, &vp);
ip = VTOI(vp);
memset(ip->i_lfs_fragsize, 0, ULFS_NDADDR * sizeof(*ip->i_lfs_fragsize));
if (vp->v_type != VLNK || ip->i_size >= ip->i_lfs->um_maxsymlinklen) {
#ifdef DEBUG
for (i = (ip->i_size + lfs_sb_getbsize(fs) - 1) >> lfs_sb_getbshift(fs);
i < ULFS_NDADDR; i++) {
if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
i == 0)
continue;
if (lfs_dino_getdb(fs, ip->i_din, i) != 0) {
lfs_dump_dinode(fs, ip->i_din);
panic("inconsistent inode (direct)");
}
}
for ( ; i < ULFS_NDADDR + ULFS_NIADDR; i++) {
if (lfs_dino_getib(fs, ip->i_din, i - ULFS_NDADDR) != 0) {
lfs_dump_dinode(fs, ip->i_din);
panic("inconsistent inode (indirect)");
}
}
#endif /* DEBUG */
for (i = 0; i < ULFS_NDADDR; i++)
if (lfs_dino_getdb(fs, ip->i_din, i) != 0)
ip->i_lfs_fragsize[i] = lfs_blksize(fs, ip, i);
}
KASSERTMSG((vp->v_type != VNON),
"lfs_vinit: ino %llu is type VNON! (ifmt=%o)\n",
(unsigned long long)ip->i_number,
(ip->i_mode & LFS_IFMT) >> 12);
/*
* Finish inode initialization now that aliasing has been resolved.
*/
ip->i_devvp = fs->lfs_devvp;
vref(ip->i_devvp);
#if defined(LFS_QUOTA) || defined(LFS_QUOTA2)
ulfsquota_init(ip);
#endif
genfs_node_init(vp, &lfs_genfsops);
uvm_vnp_setsize(vp, ip->i_size);
/* Initialize hiblk from file size */
ip->i_lfs_hiblk = lfs_lblkno(ip->i_lfs, ip->i_size + lfs_sb_getbsize(ip->i_lfs) - 1) - 1;
*vpp = vp;
}
/*
* Resize the filesystem to contain the specified number of segments.
*/
int
lfs_resize_fs(struct lfs *fs, int newnsegs)
{
SEGUSE *sup;
CLEANERINFO *cip;
struct buf *bp, *obp;
daddr_t olast, nlast, ilast, noff, start, end;
struct vnode *ivp;
struct inode *ip;
int error, badnews, inc, oldnsegs;
int sbbytes, csbbytes, gain, cgain;
int i;
/* Only support v2 and up */
if (lfs_sb_getversion(fs) < 2)
return EOPNOTSUPP;
/* If we're doing nothing, do it fast */
oldnsegs = lfs_sb_getnseg(fs);
if (newnsegs == oldnsegs)
return 0;
/* We always have to have two superblocks */
if (newnsegs <= lfs_dtosn(fs, lfs_sb_getsboff(fs, 1)))
/* XXX this error code makes little sense */
return EFBIG;
ivp = fs->lfs_ivnode;
ip = VTOI(ivp);
error = 0;
/* Take the segment lock so no one else calls lfs_newseg() */
lfs_seglock(fs, SEGM_PROT);
/*
* Make sure the segments we're going to be losing, if any,
* are in fact empty. We hold the seglock, so their status
* cannot change underneath us. Count the superblocks we lose,
* while we're at it.
*/
sbbytes = csbbytes = 0;
cgain = 0;
for (i = newnsegs; i < oldnsegs; i++) {
LFS_SEGENTRY(sup, fs, i, bp);
badnews = sup->su_nbytes || !(sup->su_flags & SEGUSE_INVAL);
if (sup->su_flags & SEGUSE_SUPERBLOCK)
sbbytes += LFS_SBPAD;
if (!(sup->su_flags & SEGUSE_DIRTY)) {
++cgain;
if (sup->su_flags & SEGUSE_SUPERBLOCK)
csbbytes += LFS_SBPAD;
}
brelse(bp, 0);
if (badnews) {
error = EBUSY;
goto out;
}
}
/* Note old and new segment table endpoints, and old ifile size */
olast = lfs_sb_getcleansz(fs) + lfs_sb_getsegtabsz(fs);
nlast = howmany(newnsegs, lfs_sb_getsepb(fs)) + lfs_sb_getcleansz(fs);
ilast = ivp->v_size >> lfs_sb_getbshift(fs);
noff = nlast - olast;
/*
* Make sure no one can use the Ifile while we change it around.
* Even after taking the iflock we need to make sure no one still
* is holding Ifile buffers, so we get each one, to drain them.
* (XXX this could be done better.)
*/
rw_enter(&fs->lfs_iflock, RW_WRITER);
for (i = 0; i < ilast; i++) {
/* XXX what to do if bread fails? */
bread(ivp, i, lfs_sb_getbsize(fs), 0, &bp);
brelse(bp, 0);
}
/* Allocate new Ifile blocks */
for (i = ilast; i < ilast + noff; i++) {
if (lfs_balloc(ivp, i * lfs_sb_getbsize(fs), lfs_sb_getbsize(fs), NOCRED, 0,
&bp) != 0)
panic("balloc extending ifile");
memset(bp->b_data, 0, lfs_sb_getbsize(fs));
VOP_BWRITE(bp->b_vp, bp);
}
/* Register new ifile size */
ip->i_size += noff * lfs_sb_getbsize(fs);
lfs_dino_setsize(fs, ip->i_din, ip->i_size);
uvm_vnp_setsize(ivp, ip->i_size);
/* Copy the inode table to its new position */
if (noff != 0) {
if (noff < 0) {
start = nlast;
end = ilast + noff;
inc = 1;
} else {
start = ilast + noff - 1;
end = nlast - 1;
inc = -1;
}
for (i = start; i != end; i += inc) {
if (bread(ivp, i, lfs_sb_getbsize(fs),
B_MODIFY, &bp) != 0)
panic("resize: bread dst blk failed");
if (bread(ivp, i - noff, lfs_sb_getbsize(fs),
0, &obp))
panic("resize: bread src blk failed");
memcpy(bp->b_data, obp->b_data, lfs_sb_getbsize(fs));
VOP_BWRITE(bp->b_vp, bp);
brelse(obp, 0);
}
}
/* If we are expanding, write the new empty SEGUSE entries */
if (newnsegs > oldnsegs) {
for (i = oldnsegs; i < newnsegs; i++) {
if ((error = bread(ivp, i / lfs_sb_getsepb(fs) +
lfs_sb_getcleansz(fs), lfs_sb_getbsize(fs),
B_MODIFY, &bp)) != 0)
panic("lfs: ifile read: %d", error);
while ((i + 1) % lfs_sb_getsepb(fs) && i < newnsegs) {
sup = &((SEGUSE *)bp->b_data)[i % lfs_sb_getsepb(fs)];
memset(sup, 0, sizeof(*sup));
i++;
}
VOP_BWRITE(bp->b_vp, bp);
}
}
/* Zero out unused superblock offsets */
for (i = 2; i < LFS_MAXNUMSB; i++)
if (lfs_dtosn(fs, lfs_sb_getsboff(fs, i)) >= newnsegs)
lfs_sb_setsboff(fs, i, 0x0);
/*
* Correct superblock entries that depend on fs size.
* The computations of these are as follows:
*
* size = lfs_segtod(fs, nseg)
* dsize = lfs_segtod(fs, nseg - minfreeseg) - lfs_btofsb(#super * LFS_SBPAD)
* bfree = dsize - lfs_btofsb(fs, bsize * nseg / 2) - blocks_actually_used
* avail = lfs_segtod(fs, nclean) - lfs_btofsb(#clean_super * LFS_SBPAD)
* + (lfs_segtod(fs, 1) - (offset - curseg))
* - lfs_segtod(fs, minfreeseg - (minfreeseg / 2))
*
* XXX - we should probably adjust minfreeseg as well.
*/
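/*
 * Illustrative sketch (not part of the original source): a worked
 * example of the adjustments below, using hypothetical numbers.
 * Suppose we grow by 10 segments (gain = 10), a segment converts to
 * 128 frags (lfs_btofsb(fs, ssize) == 128) and half a block is 4
 * frags (lfs_btofsb(fs, bsize / 2) == 4). No superblocks are lost
 * when growing, so sbbytes == csbbytes == 0, and we would add:
 *
 *	size   += 10 * 128              = 1280 frags
 *	dsize  += 10 * 128 - 0          = 1280 frags
 *	bfree  += 10 * 128 - 0 - 10 * 4 = 1240 frags
 *	nclean += 10, avail += 1280 frags
 *
 * When shrinking, gain is negative and the else branch instead
 * subtracts the clean segments (cgain) actually removed.
 */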
gain = (newnsegs - oldnsegs);
lfs_sb_setnseg(fs, newnsegs);
lfs_sb_setsegtabsz(fs, nlast - lfs_sb_getcleansz(fs));
lfs_sb_addsize(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)));
lfs_sb_adddsize(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)) - lfs_btofsb(fs, sbbytes));
lfs_sb_addbfree(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)) - lfs_btofsb(fs, sbbytes)
- gain * lfs_btofsb(fs, lfs_sb_getbsize(fs) / 2));
if (gain > 0) {
lfs_sb_addnclean(fs, gain);
lfs_sb_addavail(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)));
} else {
lfs_sb_subnclean(fs, cgain);
lfs_sb_subavail(fs, cgain * lfs_btofsb(fs, lfs_sb_getssize(fs)) -
lfs_btofsb(fs, csbbytes));
}
/* Resize segment flag cache */
fs->lfs_suflags[0] = realloc(fs->lfs_suflags[0],
lfs_sb_getnseg(fs) * sizeof(u_int32_t), M_SEGMENT, M_WAITOK);
fs->lfs_suflags[1] = realloc(fs->lfs_suflags[1],
lfs_sb_getnseg(fs) * sizeof(u_int32_t), M_SEGMENT, M_WAITOK);
for (i = oldnsegs; i < newnsegs; i++)
fs->lfs_suflags[0][i] = fs->lfs_suflags[1][i] = 0x0;
/* Truncate Ifile if necessary */
if (noff < 0)
lfs_truncate(ivp, ivp->v_size + (noff << lfs_sb_getbshift(fs)), 0,
NOCRED);
/* Update cleaner info so the cleaner can die */
/* XXX what to do if bread fails? */
bread(ivp, 0, lfs_sb_getbsize(fs), B_MODIFY, &bp);
cip = bp->b_data;
lfs_ci_setclean(fs, cip, lfs_sb_getnclean(fs));
lfs_ci_setdirty(fs, cip, lfs_sb_getnseg(fs) - lfs_sb_getnclean(fs));
VOP_BWRITE(bp->b_vp, bp);
/* Let Ifile accesses proceed */
rw_exit(&fs->lfs_iflock);
out:
lfs_segunlock(fs);
return error;
}
/*
* Extended attribute dispatch
*/
int
lfs_extattrctl(struct mount *mp, int cmd, struct vnode *vp,
int attrnamespace, const char *attrname)
{
#ifdef LFS_EXTATTR
struct ulfsmount *ump;
ump = VFSTOULFS(mp);
if (ump->um_fstype == ULFS1) {
return ulfs_extattrctl(mp, cmd, vp, attrnamespace, attrname);
}
#endif
return vfs_stdextattrctl(mp, cmd, vp, attrnamespace, attrname);
}
/* $NetBSD: subr_log.c,v 1.63 2022/10/26 23:28:30 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_log.c 8.3 (Berkeley) 2/14/95
*/
/*
* Error log buffer for kernel printf's.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_log.c,v 1.63 2022/10/26 23:28:30 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/ioctl.h>
#include <sys/msgbuf.h>
#include <sys/file.h>
#include <sys/syslog.h>
#include <sys/conf.h>
#include <sys/select.h>
#include <sys/poll.h>
#include <sys/intr.h>
#include <sys/sysctl.h>
#include <sys/ktrace.h>
static int sysctl_msgbuf(SYSCTLFN_PROTO);
static void logsoftintr(void *);
static bool log_async;
static struct selinfo log_selp; /* process waiting on select call */
static pid_t log_pgid; /* process/group for async I/O */
static kcondvar_t log_cv;
static void *log_sih;
static kmutex_t log_lock;
int log_open; /* also used in log() */
int msgbufmapped; /* is the message buffer mapped */
int msgbufenabled; /* is logging to the buffer enabled */
struct kern_msgbuf *msgbufp; /* the mapped buffer, itself. */
void
initmsgbuf(void *bf, size_t bufsize)
{
struct kern_msgbuf *mbp;
long new_bufs;
/* Sanity-check the given size. */
if (bufsize < sizeof(struct kern_msgbuf))
return;
mbp = msgbufp = (struct kern_msgbuf *)bf;
new_bufs = bufsize - offsetof(struct kern_msgbuf, msg_bufc);
if ((mbp->msg_magic != MSG_MAGIC) || (mbp->msg_bufs != new_bufs) ||
(mbp->msg_bufr < 0) || (mbp->msg_bufr >= mbp->msg_bufs) ||
(mbp->msg_bufx < 0) || (mbp->msg_bufx >= mbp->msg_bufs)) {
/*
* If the buffer magic number is wrong, has changed
* size (which shouldn't happen often), or is
* internally inconsistent, initialize it.
*/
memset(bf, 0, bufsize);
mbp->msg_magic = MSG_MAGIC;
mbp->msg_bufs = new_bufs;
}
/* mark it as ready for use. */
msgbufmapped = msgbufenabled = 1;
}
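/*
 * Illustrative sketch (not part of the original source), using
 * hypothetical numbers: if the MD code hands us bufsize = 16384
 * bytes and offsetof(struct kern_msgbuf, msg_bufc) happens to be
 * 20, then new_bufs = 16364.  A previously written buffer (for
 * example, one preserved across a warm reboot) is kept only if its
 * msg_magic, msg_bufs and both ring indices (msg_bufr, msg_bufx)
 * are consistent with that size; otherwise it is zeroed and
 * reinitialized above.
 */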
void
loginit(void)
{
mutex_init(&log_lock, MUTEX_DEFAULT, IPL_VM);
selinit(&log_selp);
cv_init(&log_cv, "klog");
log_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE,
logsoftintr, NULL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "msgbufsize",
SYSCTL_DESCR("Size of the kernel message buffer"),
sysctl_msgbuf, 0, NULL, 0,
CTL_KERN, KERN_MSGBUFSIZE, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "msgbuf",
SYSCTL_DESCR("Kernel message buffer"),
sysctl_msgbuf, 0, NULL, 0,
CTL_KERN, KERN_MSGBUF, CTL_EOL);
}
/*ARGSUSED*/
static int
logopen(dev_t dev, int flags, int mode, struct lwp *l)
{
struct kern_msgbuf *mbp = msgbufp;
int error = 0;
mutex_spin_enter(&log_lock);
if (log_open) {
error = EBUSY;
} else {
log_open = 1;
log_pgid = l->l_proc->p_pid; /* signal process only */
/*
* The message buffer is initialized during system
* configuration. If it's been clobbered, note that
* and return an error. (This allows a user to read
* the buffer via /dev/kmem, and try to figure out
* what clobbered it.)
*/
if (mbp->msg_magic != MSG_MAGIC) {
msgbufenabled = 0;
error = ENXIO;
}
}
mutex_spin_exit(&log_lock);
return error;
}
/*ARGSUSED*/
static int
logclose(dev_t dev, int flag, int mode, struct lwp *l)
{
mutex_spin_enter(&log_lock);
log_pgid = 0;
log_open = 0;
log_async = 0;
mutex_spin_exit(&log_lock);
return 0;
}
/*ARGSUSED*/
static int
logread(dev_t dev, struct uio *uio, int flag)
{
struct kern_msgbuf *mbp = msgbufp;
long l;
int error = 0;
mutex_spin_enter(&log_lock);
while (mbp->msg_bufr == mbp->msg_bufx) {
if (flag & IO_NDELAY) {
mutex_spin_exit(&log_lock);
return EWOULDBLOCK;
}
error = cv_wait_sig(&log_cv, &log_lock);
if (error) {
mutex_spin_exit(&log_lock);
return error;
}
}
while (uio->uio_resid > 0) {
l = mbp->msg_bufx - mbp->msg_bufr;
if (l < 0)
l = mbp->msg_bufs - mbp->msg_bufr;
l = uimin(l, uio->uio_resid);
if (l == 0)
break;
mutex_spin_exit(&log_lock);
error = uiomove(&mbp->msg_bufc[mbp->msg_bufr], (int)l, uio);
mutex_spin_enter(&log_lock);
if (error)
break;
mbp->msg_bufr += l;
if (mbp->msg_bufr < 0 || mbp->msg_bufr >= mbp->msg_bufs)
mbp->msg_bufr = 0;
}
mutex_spin_exit(&log_lock);
return error;
}
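/*
 * Illustrative sketch (not part of the original source): the ring
 * arithmetic above, with hypothetical indices and assuming the
 * caller asked for at least 20 bytes.  With msg_bufs = 100,
 * msg_bufr = 90 and msg_bufx = 10, the first pass computes
 * l = 10 - 90 < 0, so it reads the 10 characters from offset 90 to
 * the end of the buffer; msg_bufr then wraps to 0 and the second
 * pass reads the remaining 10 characters up to msg_bufx.
 */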
/*ARGSUSED*/
static int
logpoll(dev_t dev, int events, struct lwp *l)
{
int revents = 0;
if (events & (POLLIN | POLLRDNORM)) {
mutex_spin_enter(&log_lock);
if (msgbufp->msg_bufr != msgbufp->msg_bufx)
revents |= events & (POLLIN | POLLRDNORM);
else
selrecord(l, &log_selp);
mutex_spin_exit(&log_lock);
}
return revents;
}
static void
filt_logrdetach(struct knote *kn)
{
mutex_spin_enter(&log_lock);
selremove_knote(&log_selp, kn);
mutex_spin_exit(&log_lock);
}
static int
filt_logread(struct knote *kn, long hint)
{
int rv;
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_enter(&log_lock);
if (msgbufp->msg_bufr == msgbufp->msg_bufx) {
rv = 0;
} else if (msgbufp->msg_bufr < msgbufp->msg_bufx) {
kn->kn_data = msgbufp->msg_bufx - msgbufp->msg_bufr;
rv = 1;
} else {
kn->kn_data = (msgbufp->msg_bufs - msgbufp->msg_bufr) +
msgbufp->msg_bufx;
rv = 1;
}
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_exit(&log_lock);
return rv;
}
static const struct filterops logread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_logrdetach,
.f_event = filt_logread,
};
static int
logkqfilter(dev_t dev, struct knote *kn)
{
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &logread_filtops;
mutex_spin_enter(&log_lock);
selrecord_knote(&log_selp, kn);
mutex_spin_exit(&log_lock);
break;
default:
return (EINVAL);
}
return (0);
}
void
logwakeup(void)
{
if (!cold && log_open) {
mutex_spin_enter(&log_lock);
selnotify(&log_selp, 0, NOTE_SUBMIT);
if (log_async)
softint_schedule(log_sih);
cv_broadcast(&log_cv);
mutex_spin_exit(&log_lock);
}
}
static void
logsoftintr(void *cookie)
{
pid_t pid;
if ((pid = log_pgid) != 0)
fownsignal(pid, SIGIO, 0, 0, NULL);
}
/*ARGSUSED*/
static int
logioctl(dev_t dev, u_long com, void *data, int flag, struct lwp *lwp)
{
long l;
switch (com) {
/* return number of characters immediately available */
case FIONREAD:
mutex_spin_enter(&log_lock);
l = msgbufp->msg_bufx - msgbufp->msg_bufr;
if (l < 0)
l += msgbufp->msg_bufs;
mutex_spin_exit(&log_lock);
*(int *)data = l;
break;
case FIONBIO:
break;
case FIOASYNC:
/* No locking needed, 'thread private'. */
log_async = (*((int *)data) != 0);
break;
case TIOCSPGRP:
case FIOSETOWN:
return fsetown(&log_pgid, com, data);
case TIOCGPGRP:
case FIOGETOWN:
return fgetown(log_pgid, com, data);
default:
return (EPASSTHROUGH);
}
return (0);
}
static void
logskip(struct kern_msgbuf *mbp)
{
/*
* Move the read pointer forward to the next line in the
* buffer. Note that the buffer is a ring buffer, so we
* must reset msg_bufr to 0 when it exceeds msg_bufs.
*
* To avoid looping forever, give up if we cannot find a
* newline within mbp->msg_bufs characters (the maximum
* size of the buffer).
*/
for (int i = 0; i < mbp->msg_bufs; i++) {
char c0 = mbp->msg_bufc[mbp->msg_bufr];
if (++mbp->msg_bufr >= mbp->msg_bufs)
mbp->msg_bufr = 0;
if (c0 == '\n')
break;
}
}
static void
logaddchar(struct kern_msgbuf *mbp, int c)
{
mbp->msg_bufc[mbp->msg_bufx++] = c;
if (mbp->msg_bufx < 0 || mbp->msg_bufx >= mbp->msg_bufs)
mbp->msg_bufx = 0;
/* If the buffer is full, keep the most recent data. */
if (mbp->msg_bufr == mbp->msg_bufx)
logskip(mbp);
}
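/*
 * Illustrative sketch (not part of the original source): with a
 * hypothetical 8-character buffer containing "ab\ncd\nef" and the
 * write pointer catching up to the read pointer after a wrap,
 * logskip() advances msg_bufr past the next '\n', discarding the
 * oldest line ("ab\n") so that new characters overwrite whole lines
 * rather than leaving a partial one at the read pointer.
 */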
void
logputchar(int c)
{
struct kern_msgbuf *mbp;
if (!cold)
mutex_spin_enter(&log_lock);
if (!msgbufenabled)
goto out;
mbp = msgbufp;
if (mbp->msg_magic != MSG_MAGIC) {
/*
* Arguably should panic or somehow notify the
* user... but how? Panic may be too drastic,
* and would obliterate the message being kicked
* out (maybe a panic itself), and printf
* would invoke us recursively. Silently punt
* for now. If syslog is running, it should
* notice.
*/
msgbufenabled = 0;
goto out;
}
logaddchar(mbp, c);
out:
if (!cold)
mutex_spin_exit(&log_lock);
}
/*
* sysctl helper routine for kern.msgbufsize and kern.msgbuf. For the
* former it merely checks that the message buffer is set up. For the latter,
* it also copies out the data if necessary.
*/
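/*
 * Illustrative sketch (not part of the original source), with
 * hypothetical indices: for msg_bufs = 100 and msg_bufx = 30, the
 * loop below first copies bytes 30..99 (the oldest data, from the
 * write pointer to the end of the ring), then wraps and copies
 * bytes 0..29, so userland sees the buffer contents in
 * chronological order, clamped to the caller's *oldlenp.
 */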
static int
sysctl_msgbuf(SYSCTLFN_ARGS)
{
char *where = oldp;
size_t len, maxlen;
long beg, end;
int error;
if (!logenabled(msgbufp)) {
msgbufenabled = 0;
return (ENXIO);
}
switch (rnode->sysctl_num) {
case KERN_MSGBUFSIZE: {
struct sysctlnode node = *rnode;
int msg_bufs = (int)msgbufp->msg_bufs;
node.sysctl_data = &msg_bufs;
return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
case KERN_MSGBUF:
break;
default:
return (EOPNOTSUPP);
}
if (newp != NULL)
return (EPERM);
if (oldp == NULL) {
/* always return full buffer size */
*oldlenp = msgbufp->msg_bufs;
return (0);
}
sysctl_unlock();
/*
* First, copy from the write pointer to the end of
* message buffer.
*/
error = 0;
mutex_spin_enter(&log_lock);
maxlen = MIN(msgbufp->msg_bufs, *oldlenp);
beg = msgbufp->msg_bufx;
end = msgbufp->msg_bufs;
mutex_spin_exit(&log_lock);
while (maxlen > 0) {
len = MIN(end - beg, maxlen);
if (len == 0)
break;
/* XXX unlocked, but hardly matters. */
error = copyout(&msgbufp->msg_bufc[beg], where, len);
ktrmibio(-1, UIO_READ, where, len, error);
if (error)
break;
where += len;
maxlen -= len;
/*
* ... then, copy from the beginning of message buffer to
* the write pointer.
*/
beg = 0;
end = msgbufp->msg_bufx;
}
sysctl_relock();
return (error);
}
const struct cdevsw log_cdevsw = {
.d_open = logopen,
.d_close = logclose,
.d_read = logread,
.d_write = nowrite,
.d_ioctl = logioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = logpoll,
.d_mmap = nommap,
.d_kqfilter = logkqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
/* $NetBSD: kern_exec.c,v 1.521 2023/10/08 12:38:58 ad Exp $ */
/*-
* Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (C) 1993, 1994, 1996 Christopher G. Demetriou
* Copyright (C) 1992 Wolfgang Solfrank.
* Copyright (C) 1992 TooLs GmbH.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by TooLs GmbH.
* 4. The name of TooLs GmbH may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.521 2023/10/08 12:38:58 ad Exp $");
#include "opt_exec.h"
#include "opt_execfmt.h"
#include "opt_ktrace.h"
#include "opt_modular.h"
#include "opt_syscall_debug.h"
#include "veriexec.h"
#include "opt_pax.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/mount.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/acct.h>
#include <sys/atomic.h>
#include <sys/exec.h>
#include <sys/futex.h>
#include <sys/ktrace.h>
#include <sys/uidinfo.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/ras.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/kauth.h>
#include <sys/lwpctl.h>
#include <sys/pax.h>
#include <sys/cpu.h>
#include <sys/module.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <sys/vfs_syscalls.h>
#if NVERIEXEC > 0
#include <sys/verified_exec.h>
#endif /* NVERIEXEC > 0 */
#include <sys/sdt.h>
#include <sys/spawn.h>
#include <sys/prot.h>
#include <sys/cprng.h>
#include <uvm/uvm_extern.h>
#include <machine/reg.h>
#include <compat/common/compat_util.h>
#ifndef MD_TOPDOWN_INIT
#ifdef __USE_TOPDOWN_VM
#define MD_TOPDOWN_INIT(epp) (epp)->ep_flags |= EXEC_TOPDOWN_VM
#else
#define MD_TOPDOWN_INIT(epp)
#endif
#endif
struct execve_data;
extern int user_va0_disable;
static size_t calcargs(struct execve_data * restrict, const size_t);
static size_t calcstack(struct execve_data * restrict, const size_t);
static int copyoutargs(struct execve_data * restrict, struct lwp *,
char * const);
static int copyoutpsstrs(struct execve_data * restrict, struct proc *);
static int copyinargs(struct execve_data * restrict, char * const *,
char * const *, execve_fetch_element_t, char **);
static int copyinargstrs(struct execve_data * restrict, char * const *,
execve_fetch_element_t, char **, size_t *, void (*)(const void *, size_t));
static int exec_sigcode_map(struct proc *, const struct emul *);
#if defined(DEBUG) && !defined(DEBUG_EXEC)
#define DEBUG_EXEC
#endif
#ifdef DEBUG_EXEC
#define DPRINTF(a) printf a
#define COPYPRINTF(s, a, b) printf("%s, %d: copyout%s @%p %zu\n", __func__, \
__LINE__, (s), (a), (b))
static void dump_vmcmds(const struct exec_package * const, size_t, int);
#define DUMPVMCMDS(p, x, e) do { dump_vmcmds((p), (x), (e)); } while (0)
#else
#define DPRINTF(a)
#define COPYPRINTF(s, a, b)
#define DUMPVMCMDS(p, x, e) do {} while (0)
#endif /* DEBUG_EXEC */
/*
* DTrace SDT provider definitions
*/
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE1(proc, kernel, , exec, "char *");
SDT_PROBE_DEFINE1(proc, kernel, , exec__success, "char *");
SDT_PROBE_DEFINE1(proc, kernel, , exec__failure, "int");
/*
* Exec function switch:
*
* Note that each makecmds function is responsible for loading the
* exec package with the necessary functions for any exec-type-specific
* handling.
*
* Functions for specific exec types should be defined in their own
* header file.
*/
static const struct execsw **execsw = NULL;
static int nexecs;
u_int exec_maxhdrsz; /* must not be static - used by netbsd32 */
/* list of dynamically loaded execsw entries */
static LIST_HEAD(execlist_head, exec_entry) ex_head =
LIST_HEAD_INITIALIZER(ex_head);
struct exec_entry {
LIST_ENTRY(exec_entry) ex_list;
SLIST_ENTRY(exec_entry) ex_slist;
const struct execsw *ex_sw;
};
#ifndef __HAVE_SYSCALL_INTERN
void syscall(void);
#endif
/* NetBSD autoloadable syscalls */
#ifdef MODULAR
#include <kern/syscalls_autoload.c>
#endif
/* NetBSD emul struct */
struct emul emul_netbsd = {
.e_name = "netbsd",
#ifdef EMUL_NATIVEROOT
.e_path = EMUL_NATIVEROOT,
#else
.e_path = NULL,
#endif
#ifndef __HAVE_MINIMAL_EMUL
.e_flags = EMUL_HAS_SYS___syscall,
.e_errno = NULL,
.e_nosys = SYS_syscall,
.e_nsysent = SYS_NSYSENT,
#endif
#ifdef MODULAR
.e_sc_autoload = netbsd_syscalls_autoload,
#endif
.e_sysent = sysent,
.e_nomodbits = sysent_nomodbits,
#ifdef SYSCALL_DEBUG
.e_syscallnames = syscallnames,
#else
.e_syscallnames = NULL,
#endif
.e_sendsig = sendsig,
.e_trapsignal = trapsignal,
.e_sigcode = NULL,
.e_esigcode = NULL,
.e_sigobject = NULL,
.e_setregs = setregs,
.e_proc_exec = NULL,
.e_proc_fork = NULL,
.e_proc_exit = NULL,
.e_lwp_fork = NULL,
.e_lwp_exit = NULL,
#ifdef __HAVE_SYSCALL_INTERN
.e_syscall_intern = syscall_intern,
#else
.e_syscall = syscall,
#endif
.e_sysctlovly = NULL,
.e_vm_default_addr = uvm_default_mapaddr,
.e_usertrap = NULL,
.e_ucsize = sizeof(ucontext_t),
.e_startlwp = startlwp
};
/*
* Exec lock. Used to control access to execsw[] structures.
* This must not be static so that netbsd32 can access it, too.
*/
krwlock_t exec_lock __cacheline_aligned;
/*
* Data used between a loadvm and execve part of an "exec" operation
*/
struct execve_data {
struct exec_package ed_pack;
struct pathbuf *ed_pathbuf;
struct vattr ed_attr;
struct ps_strings ed_arginfo;
char *ed_argp;
const char *ed_pathstring;
char *ed_resolvedname;
size_t ed_ps_strings_sz;
int ed_szsigcode;
size_t ed_argslen;
long ed_argc;
long ed_envc;
};
/*
* data passed from parent lwp to child during a posix_spawn()
*/
struct spawn_exec_data {
struct execve_data sed_exec;
struct posix_spawn_file_actions
*sed_actions;
struct posix_spawnattr *sed_attrs;
struct proc *sed_parent;
kcondvar_t sed_cv_child_ready;
kmutex_t sed_mtx_child;
int sed_error;
volatile uint32_t sed_refcnt;
};
static struct vm_map *exec_map;
static struct pool exec_pool;
static void *
exec_pool_alloc(struct pool *pp, int flags)
{
return (void *)uvm_km_alloc(exec_map, NCARGS, 0,
UVM_KMF_PAGEABLE | UVM_KMF_WAITVA);
}
static void
exec_pool_free(struct pool *pp, void *addr)
{
uvm_km_free(exec_map, (vaddr_t)addr, NCARGS, UVM_KMF_PAGEABLE);
}
static struct pool_allocator exec_palloc = {
.pa_alloc = exec_pool_alloc,
.pa_free = exec_pool_free,
.pa_pagesz = NCARGS
};
static void
exec_path_free(struct execve_data *data)
{
pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
pathbuf_destroy(data->ed_pathbuf);
if (data->ed_resolvedname)
PNBUF_PUT(data->ed_resolvedname);
}
static int
exec_resolvename(struct lwp *l, struct exec_package *epp, struct vnode *vp,
char **rpath)
{
int error;
char *p;
KASSERT(rpath != NULL);
*rpath = PNBUF_GET();
error = vnode_to_path(*rpath, MAXPATHLEN, vp, l, l->l_proc);
if (error) {
DPRINTF(("%s: can't resolve name for %s, error %d\n",
__func__, epp->ep_kname, error));
PNBUF_PUT(*rpath);
*rpath = NULL;
return error;
}
epp->ep_resolvedname = *rpath;
if ((p = strrchr(*rpath, '/')) != NULL)
epp->ep_kname = p + 1;
return 0;
}
/*
* check exec:
* given an "executable" described in the exec package's namei info,
* see what we can do with it.
*
* ON ENTRY:
* exec package with appropriate namei info
* lwp pointer of exec'ing lwp
* NO SELF-LOCKED VNODES
*
* ON EXIT:
* error: nothing held, etc. exec header still allocated.
* ok: filled exec package, executable's vnode (unlocked).
*
* EXEC SWITCH ENTRY:
* Locked vnode to check, exec package, proc.
*
* EXEC SWITCH EXIT:
* ok: return 0, filled exec package, executable's vnode (unlocked).
* error: destructive:
* everything deallocated except the exec header.
* non-destructive:
* error code, executable's vnode (unlocked),
* exec header unmodified.
*/
int
/*ARGSUSED*/
check_exec(struct lwp *l, struct exec_package *epp, struct pathbuf *pb,
char **rpath)
{
int error, i;
struct vnode *vp;
size_t resid;
if (epp->ep_resolvedname) {
struct nameidata nd;
// grab the absolute pathbuf here before namei() trashes it.
pathbuf_copystring(pb, epp->ep_resolvedname, PATH_MAX);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
/* first get the vnode */
if ((error = namei(&nd)) != 0)
return error;
epp->ep_vp = vp = nd.ni_vp;
#ifdef DIAGNOSTIC
/* paranoia (take this out once namei stuff stabilizes) */
memset(nd.ni_pnbuf, '~', PATH_MAX);
#endif
} else {
struct file *fp;
if ((error = fd_getvnode(epp->ep_xfd, &fp)) != 0)
return error;
epp->ep_vp = vp = fp->f_vnode;
vref(vp);
fd_putfile(epp->ep_xfd);
if ((error = exec_resolvename(l, epp, vp, rpath)) != 0)
return error;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}
/* check access and type */
if (vp->v_type != VREG) {
error = EACCES;
goto bad1;
}
if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0)
goto bad1;
/* get attributes */
/* XXX VOP_GETATTR is the only thing that needs LK_EXCLUSIVE here */
if ((error = VOP_GETATTR(vp, epp->ep_vap, l->l_cred)) != 0)
goto bad1;
/* Check mount point */
if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
error = EACCES;
goto bad1;
}
if (vp->v_mount->mnt_flag & MNT_NOSUID)
epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID);
/* try to open it */
if ((error = VOP_OPEN(vp, FREAD, l->l_cred)) != 0)
goto bad1;
/* now we have the file, get the exec header */
error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0,
UIO_SYSSPACE, IO_NODELOCKED, l->l_cred, &resid, NULL);
if (error)
goto bad1;
/* unlock vp, since we need it unlocked from here on out. */
VOP_UNLOCK(vp);
#if NVERIEXEC > 0
error = veriexec_verify(l, vp,
epp->ep_resolvedname ? epp->ep_resolvedname : epp->ep_kname,
epp->ep_flags & EXEC_INDIR ? VERIEXEC_INDIRECT : VERIEXEC_DIRECT,
NULL);
if (error)
goto bad2;
#endif /* NVERIEXEC > 0 */
#ifdef PAX_SEGVGUARD
error = pax_segvguard(l, vp, epp->ep_resolvedname, false);
if (error)
goto bad2;
#endif /* PAX_SEGVGUARD */
epp->ep_hdrvalid = epp->ep_hdrlen - resid;
/*
* Set up default address space limits. Can be overridden
* by individual exec packages.
*/
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS;
/*
* set up the vmcmds for creation of the process
* address space
*/
error = ENOEXEC;
for (i = 0; i < nexecs; i++) {
int newerror;
epp->ep_esch = execsw[i];
newerror = (*execsw[i]->es_makecmds)(l, epp);
if (!newerror) {
/* Seems ok: check that entry point is not too high */
if (epp->ep_entry >= epp->ep_vm_maxaddr) {
#ifdef DIAGNOSTIC
printf("%s: rejecting %p due to "
"too high entry address (>= %p)\n",
__func__, (void *)epp->ep_entry,
(void *)epp->ep_vm_maxaddr);
#endif
error = ENOEXEC;
break;
}
/* Seems ok: check that entry point is not too low */
if (epp->ep_entry < epp->ep_vm_minaddr) {
#ifdef DIAGNOSTIC
printf("%s: rejecting %p due to "
"too low entry address (< %p)\n",
__func__, (void *)epp->ep_entry,
(void *)epp->ep_vm_minaddr);
#endif
error = ENOEXEC;
break;
}
/* check limits */
#ifdef DIAGNOSTIC
#define LMSG "%s: rejecting due to %s limit (%ju > %ju)\n"
#endif
#ifdef MAXTSIZ
if (epp->ep_tsize > MAXTSIZ) {
#ifdef DIAGNOSTIC
printf(LMSG, __func__, "text",
(uintmax_t)epp->ep_tsize,
(uintmax_t)MAXTSIZ);
#endif
error = ENOMEM;
break;
}
#endif
vsize_t dlimit =
(vsize_t)l->l_proc->p_rlimit[RLIMIT_DATA].rlim_cur;
if (epp->ep_dsize > dlimit) {
#ifdef DIAGNOSTIC
printf(LMSG, __func__, "data",
(uintmax_t)epp->ep_dsize,
(uintmax_t)dlimit);
#endif
error = ENOMEM;
break;
}
return 0;
}
/*
* Reset all the fields that may have been modified by the
* loader.
*/
KASSERT(epp->ep_emul_arg == NULL);
if (epp->ep_emul_root != NULL) {
vrele(epp->ep_emul_root);
epp->ep_emul_root = NULL;
}
if (epp->ep_interp != NULL) {
vrele(epp->ep_interp);
epp->ep_interp = NULL;
}
epp->ep_pax_flags = 0;
/* make sure the first "interesting" error code is saved. */
if (error == ENOEXEC)
error = newerror;
if (epp->ep_flags & EXEC_DESTR)
/* Error from "#!" code, tidied up by recursive call */
return error;
}
/* not found, error */
/*
* free any vmspace-creation commands,
* and release their references
*/
kill_vmcmds(&epp->ep_vmcmds);
#if NVERIEXEC > 0 || defined(PAX_SEGVGUARD)
bad2:
#endif
/*
* close and release the vnode, restore the old one, free the
* pathname buf, and punt.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(vp, FREAD, l->l_cred);
vput(vp);
return error;
bad1:
/*
* free the namei pathname buffer, and put the vnode
* (which we don't yet have open).
*/
vput(vp); /* was still locked */
return error;
}
#ifdef __MACHINE_STACK_GROWS_UP
#define STACK_PTHREADSPACE NBPG
#else
#define STACK_PTHREADSPACE 0
#endif
static int
execve_fetch_element(char * const *array, size_t index, char **value)
{
return copyin(array + index, value, sizeof(*value));
}
/*
* exec system call
*/
int
sys_execve(struct lwp *l, const struct sys_execve_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
syscallarg(char * const *) argp;
syscallarg(char * const *) envp;
} */
return execve1(l, true, SCARG(uap, path), -1, SCARG(uap, argp),
SCARG(uap, envp), execve_fetch_element);
}
int
sys_fexecve(struct lwp *l, const struct sys_fexecve_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(char * const *) argp;
syscallarg(char * const *) envp;
} */
return execve1(l, false, NULL, SCARG(uap, fd), SCARG(uap, argp),
SCARG(uap, envp), execve_fetch_element);
}
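/*
 * Illustrative sketch (not part of the original source): from
 * userland, both entry points end up in execve1(), e.g.
 *
 *	char *argv[] = { "ls", "-l", NULL };
 *	char *envp[] = { "PATH=/bin:/usr/bin", NULL };
 *	execve("/bin/ls", argv, envp);	-> sys_execve, has_path = true
 *	fexecve(fd, argv, envp);	-> sys_fexecve, name resolved
 *					   later from the vnode
 *
 * The argv/envp values above are hypothetical; execve_fetch_element
 * copies each pointer in from user space one element at a time.
 */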
/*
* Load modules to try and execute an image that we do not understand.
* If no execsw entries are present, we load those likely to be needed
* in order to run native images only. Otherwise, we autoload all
* possible modules that could let us run the binary. XXX lame
*/
static void
exec_autoload(void)
{
#ifdef MODULAR
static const char * const native[] = {
"exec_elf32",
"exec_elf64",
"exec_script",
NULL
};
static const char * const compat[] = {
"exec_elf32",
"exec_elf64",
"exec_script",
"exec_aout",
"exec_coff",
"exec_ecoff",
"compat_aoutm68k",
"compat_netbsd32",
#if 0
"compat_linux",
"compat_linux32",
#endif
"compat_sunos",
"compat_sunos32",
"compat_ultrix",
NULL
};
char const * const *list;
int i;
list = nexecs == 0 ? native : compat;
for (i = 0; list[i] != NULL; i++) {
if (module_autoload(list[i], MODULE_CLASS_EXEC) != 0) {
continue;
}
yield();
}
#endif
}
/*
* Copy the user or kernel supplied upath to the allocated pathbuffer pbp
* making it absolute in the process, by prepending the current working
* directory if it is not. If offs is supplied it will contain the offset
* where the original supplied copy of upath starts.
*/
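/*
 * Illustrative sketch (not part of the original source), with a
 * hypothetical working directory: for upath "obj/prog" and a
 * current directory of "/usr/src", the code below builds
 * "/usr/src/obj/prog" in the pathbuf and, if offs is supplied,
 * sets *offs to 9 so that the pathbuf string plus *offs still
 * points at the caller's original "obj/prog".
 */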
int
exec_makepathbuf(struct lwp *l, const char *upath, enum uio_seg seg,
struct pathbuf **pbp, size_t *offs)
{
char *path, *bp;
size_t len, tlen;
int error;
struct cwdinfo *cwdi;
path = PNBUF_GET();
if (seg == UIO_SYSSPACE) {
error = copystr(upath, path, MAXPATHLEN, &len);
} else {
error = copyinstr(upath, path, MAXPATHLEN, &len);
}
if (error)
goto err;
if (path[0] == '/') {
if (offs)
*offs = 0;
goto out;
}
len++;
if (len + 1 >= MAXPATHLEN) {
error = ENAMETOOLONG;
goto err;
}
bp = path + MAXPATHLEN - len;
memmove(bp, path, len);
*(--bp) = '/';
cwdi = l->l_proc->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
error = getcwd_common(cwdi->cwdi_cdir, NULL, &bp, path, MAXPATHLEN / 2,
GETCWD_CHECK_ACCESS, l);
rw_exit(&cwdi->cwdi_lock);
if (error)
goto err;
tlen = path + MAXPATHLEN - bp;
memmove(path, bp, tlen);
path[tlen - 1] = '\0';
if (offs) *offs = tlen - len;
out:
*pbp = pathbuf_assimilate(path);
return 0;
err:
PNBUF_PUT(path);
return error;
}
vaddr_t
exec_vm_minaddr(vaddr_t va_min)
{
/*
* Increase va_min if we don't want NULL to be mappable by the
* process.
*/
#define VM_MIN_GUARD PAGE_SIZE
if (user_va0_disable && (va_min < VM_MIN_GUARD))
return VM_MIN_GUARD;
return va_min;
}
static int
execve_loadvm(struct lwp *l, bool has_path, const char *path, int fd,
char * const *args, char * const *envs,
execve_fetch_element_t fetch_element,
struct execve_data * restrict data)
{
struct exec_package * const epp = &data->ed_pack;
int error;
struct proc *p;
char *dp;
u_int modgen;
KASSERT(data != NULL);
p = l->l_proc;
modgen = 0;
SDT_PROBE(proc, kernel, , exec, path, 0, 0, 0, 0);
/*
* Check if we have exceeded our number of processes limit.
* This is so that we handle the case where a root daemon
* forked, ran setuid to become the desired user and is trying
* to exec. The obvious place to do the reference counting check
* is setuid(), but we don't do the reference counting check there
* like other OS's do because then all the programs that use setuid()
* must be modified to check the return code of setuid() and exit().
* It is dangerous to make setuid() fail, because it fails open and
* the program will continue to run as root. If we make it succeed
* and return an error code, again we are not enforcing the limit.
* The best place to enforce the limit is here, when the process tries
* to execute a new image, because eventually the process will need
* to call exec in order to do something useful.
*/
retry:
if (p->p_flag & PK_SUGID) {
if (kauth_authorize_process(l->l_cred, KAUTH_PROCESS_RLIMIT,
p, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
&p->p_rlimit[RLIMIT_NPROC],
KAUTH_ARG(RLIMIT_NPROC)) != 0 &&
chgproccnt(kauth_cred_getuid(l->l_cred), 0) >
p->p_rlimit[RLIMIT_NPROC].rlim_cur)
return EAGAIN;
}
/*
* Drain existing references and forbid new ones. The process
* should be left alone until we're done here. This is necessary
* to avoid race conditions - e.g. in ptrace() - that might allow
* a local user to illicitly obtain elevated privileges.
*/
rw_enter(&p->p_reflock, RW_WRITER);
if (has_path) {
size_t offs;
/*
* Init the namei data to point at the user's program name.
* This is done here rather than in check_exec(), so that it's
* possible to override these settings if any of the makecmd/probe
* functions call check_exec() recursively - for example,
* see exec_script_makecmds().
*/
if ((error = exec_makepathbuf(l, path, UIO_USERSPACE,
&data->ed_pathbuf, &offs)) != 0)
goto clrflg;
data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf);
epp->ep_kname = data->ed_pathstring + offs;
data->ed_resolvedname = PNBUF_GET();
epp->ep_resolvedname = data->ed_resolvedname;
epp->ep_xfd = -1;
} else {
data->ed_pathbuf = pathbuf_assimilate(strcpy(PNBUF_GET(), "/"));
data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf);
epp->ep_kname = "*fexecve*";
data->ed_resolvedname = NULL;
epp->ep_resolvedname = NULL;
epp->ep_xfd = fd;
}
/*
* initialize the fields of the exec package.
*/
epp->ep_hdr = kmem_alloc(exec_maxhdrsz, KM_SLEEP);
epp->ep_hdrlen = exec_maxhdrsz;
epp->ep_hdrvalid = 0;
epp->ep_emul_arg = NULL;
epp->ep_emul_arg_free = NULL;
memset(&epp->ep_vmcmds, 0, sizeof(epp->ep_vmcmds));
epp->ep_vap = &data->ed_attr;
epp->ep_flags = (p->p_flag & PK_32) ? EXEC_FROM32 : 0;
MD_TOPDOWN_INIT(epp);
epp->ep_emul_root = NULL;
epp->ep_interp = NULL;
epp->ep_esch = NULL;
epp->ep_pax_flags = 0;
memset(epp->ep_machine_arch, 0, sizeof(epp->ep_machine_arch));
rw_enter(&exec_lock, RW_READER);
/* see if we can run it. */
if ((error = check_exec(l, epp, data->ed_pathbuf,
&data->ed_resolvedname)) != 0) {
if (error != ENOENT && error != EACCES && error != ENOEXEC) {
DPRINTF(("%s: check exec failed for %s, error %d\n",
__func__, epp->ep_kname, error));
}
goto freehdr;
}
/* allocate an argument buffer */
data->ed_argp = pool_get(&exec_pool, PR_WAITOK);
KASSERT(data->ed_argp != NULL);
dp = data->ed_argp;
if ((error = copyinargs(data, args, envs, fetch_element, &dp)) != 0) {
goto bad;
}
/*
* Calculate the new stack size.
*/
#ifdef __MACHINE_STACK_GROWS_UP
/*
* copyargs() fills argc/argv/envp from the lower address even on
* __MACHINE_STACK_GROWS_UP machines. Reserve a few words just below the SP
* so that _rtld() use it.
*/
#define RTLD_GAP 32
#else
#define RTLD_GAP 0
#endif
const size_t argenvstrlen = (char *)ALIGN(dp) - data->ed_argp;
data->ed_argslen = calcargs(data, argenvstrlen);
const size_t len = calcstack(data, pax_aslr_stack_gap(epp) + RTLD_GAP);
if (len > epp->ep_ssize) {
/* in effect, compare to initial limit */
DPRINTF(("%s: stack limit exceeded %zu\n", __func__, len));
error = ENOMEM;
goto bad;
}
/* adjust "active stack depth" for process VSZ */
epp->ep_ssize = len;
return 0;
bad:
/* free the vmspace-creation commands, and release their references */
kill_vmcmds(&epp->ep_vmcmds);
/* kill any opened file descriptor, if necessary */
if (epp->ep_flags & EXEC_HASFD) {
epp->ep_flags &= ~EXEC_HASFD;
fd_close(epp->ep_fd);
}
/* close and put the exec'd file */
vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred);
vput(epp->ep_vp);
pool_put(&exec_pool, data->ed_argp);
freehdr:
kmem_free(epp->ep_hdr, epp->ep_hdrlen);
if (epp->ep_emul_root != NULL)
vrele(epp->ep_emul_root);
if (epp->ep_interp != NULL)
vrele(epp->ep_interp);
rw_exit(&exec_lock);
exec_path_free(data);
clrflg:
rw_exit(&p->p_reflock);
if (modgen != module_gen && error == ENOEXEC) {
modgen = module_gen;
exec_autoload();
goto retry;
}
SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
return error;
}
static int
execve_dovmcmds(struct lwp *l, struct execve_data * restrict data)
{
struct exec_package * const epp = &data->ed_pack;
struct proc *p = l->l_proc;
struct exec_vmcmd *base_vcp;
int error = 0;
size_t i;
/* record proc's vnode, for use by procfs and others */
if (p->p_textvp)
vrele(p->p_textvp);
vref(epp->ep_vp);
p->p_textvp = epp->ep_vp;
/* create the new process's VM space by running the vmcmds */
KASSERTMSG(epp->ep_vmcmds.evs_used != 0, "%s: no vmcmds", __func__);
#ifdef TRACE_EXEC
DUMPVMCMDS(epp, 0, 0);
#endif
base_vcp = NULL;
for (i = 0; i < epp->ep_vmcmds.evs_used && !error; i++) {
struct exec_vmcmd *vcp;
vcp = &epp->ep_vmcmds.evs_cmds[i];
if (vcp->ev_flags & VMCMD_RELATIVE) {
KASSERTMSG(base_vcp != NULL,
"%s: relative vmcmd with no base", __func__);
KASSERTMSG((vcp->ev_flags & VMCMD_BASE) == 0,
"%s: illegal base & relative vmcmd", __func__);
vcp->ev_addr += base_vcp->ev_addr;
}
error = (*vcp->ev_proc)(l, vcp);
if (error)
DUMPVMCMDS(epp, i, error);
if (vcp->ev_flags & VMCMD_BASE)
base_vcp = vcp;
}
/* free the vmspace-creation commands, and release their references */
kill_vmcmds(&epp->ep_vmcmds);
vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred);
vput(epp->ep_vp);
/* if an error happened, deallocate and punt */
if (error != 0) {
DPRINTF(("%s: vmcmd %zu failed: %d\n", __func__, i - 1, error));
}
return error;
}
static void
execve_free_data(struct execve_data *data)
{
struct exec_package * const epp = &data->ed_pack;
/* free the vmspace-creation commands, and release their references */
kill_vmcmds(&epp->ep_vmcmds);
/* kill any opened file descriptor, if necessary */
if (epp->ep_flags & EXEC_HASFD) {
epp->ep_flags &= ~EXEC_HASFD;
fd_close(epp->ep_fd);
}
/* close and put the exec'd file */
vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(epp->ep_vp, FREAD, curlwp->l_cred);
vput(epp->ep_vp);
pool_put(&exec_pool, data->ed_argp);
kmem_free(epp->ep_hdr, epp->ep_hdrlen);
if (epp->ep_emul_root != NULL)
vrele(epp->ep_emul_root);
if (epp->ep_interp != NULL)
vrele(epp->ep_interp);
exec_path_free(data);
}
static void
pathexec(struct proc *p, const char *resolvedname)
{
/* set command name & other accounting info */
const char *cmdname;
if (resolvedname == NULL) {
cmdname = "*fexecve*";
resolvedname = "/";
} else {
cmdname = strrchr(resolvedname, '/') + 1;
}
KASSERTMSG(resolvedname[0] == '/', "bad resolvedname `%s'",
resolvedname);
strlcpy(p->p_comm, cmdname, sizeof(p->p_comm));
kmem_strfree(p->p_path);
p->p_path = kmem_strdupsize(resolvedname, NULL, KM_SLEEP);
}
/* XXX elsewhere */
static int
credexec(struct lwp *l, struct execve_data *data)
{
struct proc *p = l->l_proc;
struct vattr *attr = &data->ed_attr;
int error;
/*
* Deal with set[ug]id. MNT_NOSUID has already been used to disable
* s[ug]id. It's OK to check for PSL_TRACED here as we have blocked
* out additional references on the process for the moment.
*/
if ((p->p_slflag & PSL_TRACED) == 0 &&
(((attr->va_mode & S_ISUID) != 0 &&
kauth_cred_geteuid(l->l_cred) != attr->va_uid) ||
((attr->va_mode & S_ISGID) != 0 &&
kauth_cred_getegid(l->l_cred) != attr->va_gid))) {
/*
* Mark the process as SUGID before we do
* anything that might block.
*/
proc_crmod_enter();
proc_crmod_leave(NULL, NULL, true);
if (data->ed_argc == 0) {
DPRINTF((
"%s: not executing set[ug]id binary with no args\n",
__func__));
return EINVAL;
}
/* Make sure file descriptors 0..2 are in use. */
if ((error = fd_checkstd()) != 0) {
DPRINTF(("%s: fdcheckstd failed %d\n",
__func__, error));
return error;
}
/*
* Copy the credential so other references don't see our
* changes.
*/
l->l_cred = kauth_cred_copy(l->l_cred);
#ifdef KTRACE
/*
* If the persistent trace flag isn't set, turn off.
*/
if (p->p_tracep) {
mutex_enter(&ktrace_lock);
if (!(p->p_traceflag & KTRFAC_PERSISTENT))
ktrderef(p);
mutex_exit(&ktrace_lock);
}
#endif
if (attr->va_mode & S_ISUID)
kauth_cred_seteuid(l->l_cred, attr->va_uid);
if (attr->va_mode & S_ISGID)
kauth_cred_setegid(l->l_cred, attr->va_gid);
} else {
if (kauth_cred_geteuid(l->l_cred) ==
kauth_cred_getuid(l->l_cred) &&
kauth_cred_getegid(l->l_cred) ==
kauth_cred_getgid(l->l_cred))
p->p_flag &= ~PK_SUGID;
}
/*
* Copy the credential so other references don't see our changes.
* Test to see if this is necessary first, since in the common case
* we won't need a private reference.
*/
if (kauth_cred_geteuid(l->l_cred) != kauth_cred_getsvuid(l->l_cred) ||
kauth_cred_getegid(l->l_cred) != kauth_cred_getsvgid(l->l_cred)) {
l->l_cred = kauth_cred_copy(l->l_cred);
kauth_cred_setsvuid(l->l_cred, kauth_cred_geteuid(l->l_cred));
kauth_cred_setsvgid(l->l_cred, kauth_cred_getegid(l->l_cred));
}
/* Update the master credentials. */
if (l->l_cred != p->p_cred) {
kauth_cred_t ocred;
mutex_enter(p->p_lock);
ocred = p->p_cred;
p->p_cred = kauth_cred_hold(l->l_cred);
mutex_exit(p->p_lock);
kauth_cred_free(ocred);
}
return 0;
}
static void
emulexec(struct lwp *l, struct exec_package *epp)
{
struct proc *p = l->l_proc;
/* The emulation root will usually have been found when we looked
* for the elf interpreter (or similar); if not, look now. */
if (epp->ep_esch->es_emul->e_path != NULL &&
epp->ep_emul_root == NULL)
emul_find_root(l, epp);
/* Any old emulation root got removed by fdcloseexec */
rw_enter(&p->p_cwdi->cwdi_lock, RW_WRITER);
p->p_cwdi->cwdi_edir = epp->ep_emul_root;
rw_exit(&p->p_cwdi->cwdi_lock);
epp->ep_emul_root = NULL;
if (epp->ep_interp != NULL)
vrele(epp->ep_interp);
/*
* Call emulation specific exec hook. This can setup per-process
* p->p_emuldata or do any other per-process stuff an emulation needs.
*
* If we are executing a process of a different emulation than the
* original forked process, call e_proc_exit() of the old emulation
* first, then e_proc_exec() of the new emulation. If the emulation
* is the same, the exec hook code should deallocate any old
* emulation resources held previously by this process.
*/
if (p->p_emul && p->p_emul->e_proc_exit
&& p->p_emul != epp->ep_esch->es_emul)
(*p->p_emul->e_proc_exit)(p);
/*
* Call exec hook. Emulation code may NOT store reference to anything
* from &pack.
*/
if (epp->ep_esch->es_emul->e_proc_exec)
(*epp->ep_esch->es_emul->e_proc_exec)(p, epp);
/* update p_emul, the old value is no longer needed */
p->p_emul = epp->ep_esch->es_emul;
/* ...and the same for p_execsw */
p->p_execsw = epp->ep_esch;
#ifdef __HAVE_SYSCALL_INTERN
(*p->p_emul->e_syscall_intern)(p);
#endif
ktremul();
}
static int
execve_runproc(struct lwp *l, struct execve_data * restrict data,
bool no_local_exec_lock, bool is_spawn)
{
struct exec_package * const epp = &data->ed_pack;
int error = 0;
struct proc *p;
struct vmspace *vm;
/*
* In case of a posix_spawn operation, the child doing the exec
* might not hold the reader lock on exec_lock, but the parent
* will do this instead.
*/
KASSERT(no_local_exec_lock || rw_lock_held(&exec_lock));
KASSERT(!no_local_exec_lock || is_spawn);
KASSERT(data != NULL);
p = l->l_proc;
/* Get rid of other LWPs. */
if (p->p_nlwps > 1) {
mutex_enter(p->p_lock);
exit_lwps(l);
mutex_exit(p->p_lock);
}
KDASSERT(p->p_nlwps == 1);
/*
* All of the other LWPs got rid of their robust futexes
* when they exited above, but we might still have some
* to dispose of. Do that now.
*/
if (__predict_false(l->l_robust_head != 0)) {
futex_release_all_lwp(l);
/*
* Since this LWP will live on with a different
* program image, we need to clear the robust
* futex list pointer here.
*/
l->l_robust_head = 0;
}
/* Destroy any lwpctl info. */
if (p->p_lwpctl != NULL)
lwp_ctl_exit();
/* Remove POSIX timers */
ptimers_free(p, TIMERS_POSIX);
/* Set the PaX flags. */
pax_set_flags(epp, p);
/*
* Do whatever is necessary to prepare the address space
* for remapping. Note that this might replace the current
* vmspace with another!
*
* vfork(): do not touch any user space data in the new child
* until we have awoken the parent below, or it will defeat
* lazy pmap switching (on x86).
*/
if (is_spawn)
uvmspace_spawn(l, epp->ep_vm_minaddr,
epp->ep_vm_maxaddr,
epp->ep_flags & EXEC_TOPDOWN_VM);
else
uvmspace_exec(l, epp->ep_vm_minaddr,
epp->ep_vm_maxaddr,
epp->ep_flags & EXEC_TOPDOWN_VM);
vm = p->p_vmspace;
vm->vm_taddr = (void *)epp->ep_taddr;
vm->vm_tsize = btoc(epp->ep_tsize);
vm->vm_daddr = (void*)epp->ep_daddr;
vm->vm_dsize = btoc(epp->ep_dsize);
vm->vm_ssize = btoc(epp->ep_ssize);
vm->vm_issize = 0;
vm->vm_maxsaddr = (void *)epp->ep_maxsaddr;
vm->vm_minsaddr = (void *)epp->ep_minsaddr;
pax_aslr_init_vm(l, vm, epp);
cwdexec(p);
fd_closeexec(); /* handle close on exec */
if (__predict_false(ktrace_on))
fd_ktrexecfd();
execsigs(p); /* reset caught signals */
mutex_enter(p->p_lock);
l->l_ctxlink = NULL; /* reset ucontext link */
p->p_acflag &= ~AFORK;
p->p_flag |= PK_EXEC;
mutex_exit(p->p_lock);
error = credexec(l, data);
if (error)
goto exec_abort;
#if defined(__HAVE_RAS)
/*
* Remove all RASs from the address space.
*/
ras_purgeall();
#endif
/*
* Stop profiling.
*/
if ((p->p_stflag & PST_PROFIL) != 0) {
mutex_spin_enter(&p->p_stmutex);
stopprofclock(p);
mutex_spin_exit(&p->p_stmutex);
}
/*
* It's OK to test PL_PPWAIT unlocked here, as other LWPs have
* exited and exec()/exit() are the only places it will be cleared.
*
* Once the parent has been awoken, curlwp may teleport to a new CPU
* in sched_vforkexec(), and it's then OK to start messing with user
* data. See comment above.
*/
if ((p->p_lflag & PL_PPWAIT) != 0) {
bool samecpu;
lwp_t *lp;
mutex_enter(&proc_lock);
lp = p->p_vforklwp;
p->p_vforklwp = NULL;
l->l_lwpctl = NULL; /* was on loan from blocked parent */
/* Clear flags after cv_broadcast() (scheduler needs them). */
p->p_lflag &= ~PL_PPWAIT;
lp->l_vforkwaiting = false;
/* If parent is still on same CPU, teleport curlwp elsewhere. */
samecpu = (lp->l_cpu == curlwp->l_cpu);
cv_broadcast(&lp->l_waitcv);
mutex_exit(&proc_lock);
/* Give the parent its CPU back - find a new home. */
KASSERT(!is_spawn);
sched_vforkexec(l, samecpu);
}
/* Now map address space. */
error = execve_dovmcmds(l, data);
if (error != 0)
goto exec_abort;
pathexec(p, epp->ep_resolvedname);
char * const newstack = STACK_GROW(vm->vm_minsaddr, epp->ep_ssize);
error = copyoutargs(data, l, newstack);
if (error != 0)
goto exec_abort;
doexechooks(p);
/*
* Set initial SP at the top of the stack.
*
* Note that on machines where stack grows up (e.g. hppa), SP points to
* the end of arg/env strings. Userland guesses the address of argc
* via ps_strings::ps_argvstr.
*/
/* Setup new registers and do misc. setup. */
(*epp->ep_esch->es_emul->e_setregs)(l, epp, (vaddr_t)newstack);
if (epp->ep_esch->es_setregs)
(*epp->ep_esch->es_setregs)(l, epp, (vaddr_t)newstack);
/* Provide a consistent LWP private setting */
(void)lwp_setprivate(l, NULL);
/* Discard all PCU state; need to start fresh */
pcu_discard_all(l);
/* map the process's signal trampoline code */
if ((error = exec_sigcode_map(p, epp->ep_esch->es_emul)) != 0) {
DPRINTF(("%s: map sigcode failed %d\n", __func__, error));
goto exec_abort;
}
pool_put(&exec_pool, data->ed_argp);
/*
* Notify anyone who might care that we've exec'd.
*
* This is slightly racy; someone could sneak in and
* attach a knote after we've decided not to notify,
* or vice-versa, but that's not particularly bothersome.
* knote_proc_exec() will acquire p->p_lock as needed.
*/
if (!SLIST_EMPTY(&p->p_klist)) {
knote_proc_exec(p);
}
kmem_free(epp->ep_hdr, epp->ep_hdrlen);
SDT_PROBE(proc, kernel, , exec__success, epp->ep_kname, 0, 0, 0, 0);
emulexec(l, epp);
/* Allow new references from the debugger/procfs. */
rw_exit(&p->p_reflock);
if (!no_local_exec_lock)
rw_exit(&exec_lock);
mutex_enter(&proc_lock);
/* posix_spawn(3) reports a single event with implied exec(3) */
if ((p->p_slflag & PSL_TRACED) && !is_spawn) {
mutex_enter(p->p_lock);
eventswitch(TRAP_EXEC, 0, 0);
mutex_enter(&proc_lock);
}
if (p->p_sflag & PS_STOPEXEC) {
ksiginfoq_t kq;
KASSERT(l->l_blcnt == 0);
p->p_pptr->p_nstopchild++;
p->p_waited = 0;
mutex_enter(p->p_lock);
ksiginfo_queue_init(&kq);
sigclearall(p, &contsigmask, &kq);
lwp_lock(l);
l->l_stat = LSSTOP;
p->p_stat = SSTOP;
p->p_nrlwps--;
lwp_unlock(l);
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
ksiginfo_queue_drain(&kq);
} else {
mutex_exit(&proc_lock);
}
exec_path_free(data);
#ifdef TRACE_EXEC
DPRINTF(("%s finished\n", __func__));
#endif
return EJUSTRETURN;
exec_abort:
SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
rw_exit(&p->p_reflock);
if (!no_local_exec_lock)
rw_exit(&exec_lock);
exec_path_free(data);
/*
* the old process doesn't exist anymore. exit gracefully.
* get rid of the (new) address space we have created, if any, get rid
* of our namei data and vnode, and exit noting failure
*/
if (vm != NULL) {
uvm_deallocate(&vm->vm_map, VM_MIN_ADDRESS,
VM_MAXUSER_ADDRESS - VM_MIN_ADDRESS);
}
exec_free_emul_arg(epp);
pool_put(&exec_pool, data->ed_argp);
kmem_free(epp->ep_hdr, epp->ep_hdrlen);
if (epp->ep_emul_root != NULL)
vrele(epp->ep_emul_root);
if (epp->ep_interp != NULL)
vrele(epp->ep_interp);
/* Acquire the sched-state mutex (exit1() will release it). */
if (!is_spawn) {
mutex_enter(p->p_lock);
exit1(l, error, SIGABRT);
}
return error;
}
int
execve1(struct lwp *l, bool has_path, const char *path, int fd,
char * const *args, char * const *envs,
execve_fetch_element_t fetch_element)
{
struct execve_data data;
int error;
error = execve_loadvm(l, has_path, path, fd, args, envs, fetch_element,
&data);
if (error)
return error;
error = execve_runproc(l, &data, false, false);
return error;
}
static size_t
fromptrsz(const struct exec_package *epp)
{
return (epp->ep_flags & EXEC_FROM32) ? sizeof(int) : sizeof(char *);
}
static size_t
ptrsz(const struct exec_package *epp)
{
return (epp->ep_flags & EXEC_32) ? sizeof(int) : sizeof(char *);
}
static size_t
calcargs(struct execve_data * restrict data, const size_t argenvstrlen)
{
struct exec_package * const epp = &data->ed_pack;
const size_t nargenvptrs =
1 + /* long argc */
data->ed_argc + /* char *argv[] */
1 + /* \0 */
data->ed_envc + /* char *env[] */
1; /* \0 */
return (nargenvptrs * ptrsz(epp)) /* pointers */
+ argenvstrlen /* strings */
+ epp->ep_esch->es_arglen; /* auxinfo */
}
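/*
 * Illustrative sketch (not part of the original source), with
 * hypothetical counts: for a native LP64 exec with argc == 2 and
 * envc == 3, nargenvptrs = 1 + 2 + 1 + 3 + 1 = 8 slots of 8 bytes
 * each (64 bytes of pointers), to which calcargs() adds the aligned
 * length of the argument/environment strings themselves and
 * es_arglen bytes reserved for auxinfo.
 */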
static size_t
calcstack(struct execve_data * restrict data, const size_t gaplen)
{
struct exec_package * const epp = &data->ed_pack;
data->ed_szsigcode = epp->ep_esch->es_emul->e_esigcode -
epp->ep_esch->es_emul->e_sigcode;
data->ed_ps_strings_sz = (epp->ep_flags & EXEC_32) ?
sizeof(struct ps_strings32) : sizeof(struct ps_strings);
const size_t sigcode_psstr_sz =
data->ed_szsigcode + /* sigcode */
data->ed_ps_strings_sz + /* ps_strings */
STACK_PTHREADSPACE; /* pthread space */
const size_t stacklen =
data->ed_argslen +
gaplen +
sigcode_psstr_sz;
/* make the stack "safely" aligned */
return STACK_LEN_ALIGN(stacklen, STACK_ALIGNBYTES);
}
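/*
 * Illustrative sketch (not part of the original source): the total
 * reserved at the top of the new stack is therefore
 *
 *	argslen (from calcargs above)
 *	+ gaplen (ASLR stack gap plus RTLD_GAP, if any)
 *	+ signal trampoline size (e_esigcode - e_sigcode)
 *	+ sizeof ps_strings (32- or 64-bit layout)
 *	+ STACK_PTHREADSPACE
 *
 * rounded up to STACK_ALIGNBYTES; execve_loadvm() rejects the exec
 * with ENOMEM if this exceeds the initial epp->ep_ssize limit.
 */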
static int
copyoutargs(struct execve_data * restrict data, struct lwp *l,
char * const newstack)
{
struct exec_package * const epp = &data->ed_pack;
struct proc *p = l->l_proc;
int error;
memset(&data->ed_arginfo, 0, sizeof(data->ed_arginfo));
/* remember information about the process */
data->ed_arginfo.ps_nargvstr = data->ed_argc;
data->ed_arginfo.ps_nenvstr = data->ed_envc;
/*
* Allocate the stack address passed to the newly execve()'ed process.
*
* The new stack address will be set to the SP (stack pointer) register
* in setregs().
*/
char *newargs = STACK_ALLOC(
STACK_SHRINK(newstack, data->ed_argslen), data->ed_argslen);
error = (*epp->ep_esch->es_copyargs)(l, epp,
&data->ed_arginfo, &newargs, data->ed_argp);
if (error) {
DPRINTF(("%s: copyargs failed %d\n", __func__, error));
return error;
}
error = copyoutpsstrs(data, p);
if (error != 0)
return error;
return 0;
}
static int
copyoutpsstrs(struct execve_data * restrict data, struct proc *p)
{
struct exec_package * const epp = &data->ed_pack;
struct ps_strings32 arginfo32;
void *aip;
int error;
/* fill process ps_strings info */
p->p_psstrp = (vaddr_t)STACK_ALLOC(STACK_GROW(epp->ep_minsaddr,
STACK_PTHREADSPACE), data->ed_ps_strings_sz);
if (epp->ep_flags & EXEC_32) {
aip = &arginfo32;
arginfo32.ps_argvstr = (vaddr_t)data->ed_arginfo.ps_argvstr;
arginfo32.ps_nargvstr = data->ed_arginfo.ps_nargvstr;
arginfo32.ps_envstr = (vaddr_t)data->ed_arginfo.ps_envstr;
arginfo32.ps_nenvstr = data->ed_arginfo.ps_nenvstr;
} else
aip = &data->ed_arginfo;
/* copy out the process's ps_strings structure */
if ((error = copyout(aip, (void *)p->p_psstrp, data->ed_ps_strings_sz))
!= 0) {
DPRINTF(("%s: ps_strings copyout %p->%p size %zu failed\n",
__func__, aip, (void *)p->p_psstrp, data->ed_ps_strings_sz));
return error;
}
return 0;
}
static int
copyinargs(struct execve_data * restrict data, char * const *args,
char * const *envs, execve_fetch_element_t fetch_element, char **dpp)
{
struct exec_package * const epp = &data->ed_pack;
char *dp;
size_t i;
int error;
dp = *dpp;
data->ed_argc = 0;
/* copy the fake args list, if there's one, freeing it as we go */
if (epp->ep_flags & EXEC_HASARGL) {
struct exec_fakearg *fa = epp->ep_fa;
while (fa->fa_arg != NULL) {
const size_t maxlen = ARG_MAX - (dp - data->ed_argp);
size_t len;
len = strlcpy(dp, fa->fa_arg, maxlen);
/* Count NUL into len. */
if (len < maxlen)
len++;
else {
while (fa->fa_arg != NULL) {
kmem_free(fa->fa_arg, fa->fa_len);
fa++;
}
kmem_free(epp->ep_fa, epp->ep_fa_len);
epp->ep_flags &= ~EXEC_HASARGL;
return E2BIG;
}
ktrexecarg(fa->fa_arg, len - 1);
dp += len;
kmem_free(fa->fa_arg, fa->fa_len);
fa++;
data->ed_argc++;
}
kmem_free(epp->ep_fa, epp->ep_fa_len);
epp->ep_flags &= ~EXEC_HASARGL;
}
/*
* Read and count argument strings from user.
*/
if (args == NULL) {
DPRINTF(("%s: null args\n", __func__));
return EINVAL;
}
if (epp->ep_flags & EXEC_SKIPARG)
args = (const void *)((const char *)args + fromptrsz(epp));
i = 0;
error = copyinargstrs(data, args, fetch_element, &dp, &i, ktr_execarg);
if (error != 0) {
DPRINTF(("%s: copyin arg %d\n", __func__, error));
return error;
}
data->ed_argc += i;
/*
* Read and count environment strings from user.
*/
data->ed_envc = 0;
/* environment need not be there */
if (envs == NULL)
goto done;
i = 0;
error = copyinargstrs(data, envs, fetch_element, &dp, &i, ktr_execenv);
if (error != 0) {
DPRINTF(("%s: copyin env %d\n", __func__, error));
return error;
}
data->ed_envc += i;
done:
*dpp = dp;
return 0;
}
static int
copyinargstrs(struct execve_data * restrict data, char * const *strs,
execve_fetch_element_t fetch_element, char **dpp, size_t *ip,
void (*ktr)(const void *, size_t))
{
char *dp, *sp;
size_t i;
int error;
dp = *dpp;
i = 0;
while (1) {
const size_t maxlen = ARG_MAX - (dp - data->ed_argp);
size_t len;
if ((error = (*fetch_element)(strs, i, &sp)) != 0) {
return error;
}
if (!sp)
break;
if ((error = copyinstr(sp, dp, maxlen, &len)) != 0) {
if (error == ENAMETOOLONG)
error = E2BIG;
return error;
}
if (__predict_false(ktrace_on))
(*ktr)(dp, len - 1);
dp += len;
i++;
}
*dpp = dp;
*ip = i;
return 0;
}
/*
* Copy argv and env strings from kernel buffer (argp) to the new stack.
* Those strings are located just after auxinfo.
*/
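/*
 * Editor's sketch (illustrative, not from the original source): after
 * copyargs() the area at *stackp, from lower to higher addresses,
 * looks like
 *
 * long argc
 * char *argv[argc]   (pointers into the string area below)
 * NULL
 * char *envp[envc]   (pointers into the string area below)
 * NULL
 * auxinfo            (es_arglen bytes, reserved here and typically
 *                     filled in by the emulation's copyargs wrapper)
 * argv/envp strings  (copied from the kernel buffer argp)
 *
 * which matches the space accounted for by calcargs() above.
 */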
int
copyargs(struct lwp *l, struct exec_package *pack, struct ps_strings *arginfo,
char **stackp, void *argp)
{
char **cpp, *dp, *sp;
size_t len;
void *nullp;
long argc, envc;
int error;
cpp = (char **)*stackp;
nullp = NULL;
argc = arginfo->ps_nargvstr;
envc = arginfo->ps_nenvstr;
/* argc on stack is long */
CTASSERT(sizeof(*cpp) == sizeof(argc));
dp = (char *)(cpp +
1 + /* long argc */
argc + /* char *argv[] */
1 + /* \0 */
envc + /* char *env[] */
1) + /* \0 */
pack->ep_esch->es_arglen; /* auxinfo */
sp = argp;
if ((error = copyout(&argc, cpp++, sizeof(argc))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(argc));
return error;
}
/* XXX don't copy them out, remap them! */
arginfo->ps_argvstr = cpp; /* remember location of argv for later */
for (; --argc >= 0; sp += len, dp += len) {
if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(dp));
return error;
}
if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) {
COPYPRINTF("str", dp, (size_t)ARG_MAX);
return error;
}
}
if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(nullp));
return error;
}
arginfo->ps_envstr = cpp; /* remember location of envp for later */
for (; --envc >= 0; sp += len, dp += len) {
if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(dp));
return error;
}
if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) {
COPYPRINTF("str", dp, (size_t)ARG_MAX);
return error;
}
}
if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(nullp));
return error;
}
*stackp = (char *)cpp;
return 0;
}
/*
* Add execsw[] entries.
*/
int
exec_add(struct execsw *esp, int count)
{
struct exec_entry *it;
int i, error = 0;
if (count == 0) {
return 0;
}
/* Check for duplicates. */
rw_enter(&exec_lock, RW_WRITER);
for (i = 0; i < count; i++) {
LIST_FOREACH(it, &ex_head, ex_list) {
/* assume unique (makecmds, probe_func, emulation) */
if (it->ex_sw->es_makecmds == esp[i].es_makecmds &&
it->ex_sw->u.elf_probe_func ==
esp[i].u.elf_probe_func &&
it->ex_sw->es_emul == esp[i].es_emul) {
rw_exit(&exec_lock);
return EEXIST;
}
}
}
/* Allocate new entries. */
for (i = 0; i < count; i++) {
it = kmem_alloc(sizeof(*it), KM_SLEEP);
it->ex_sw = &esp[i];
error = exec_sigcode_alloc(it->ex_sw->es_emul);
if (error != 0) {
kmem_free(it, sizeof(*it));
break;
}
LIST_INSERT_HEAD(&ex_head, it, ex_list);
}
/* If any allocation failed, unwind the entries added so far. */
if (error != 0) {
for (i--; i >= 0; i--) {
it = LIST_FIRST(&ex_head);
LIST_REMOVE(it, ex_list);
exec_sigcode_free(it->ex_sw->es_emul);
kmem_free(it, sizeof(*it));
}
return error;
}
/* update execsw[] */
exec_init(0);
rw_exit(&exec_lock);
return 0;
}
/*
* Remove execsw[] entry.
*/
int
exec_remove(struct execsw *esp, int count)
{
struct exec_entry *it, *next;
int i;
const struct proclist_desc *pd;
proc_t *p;
if (count == 0) {
return 0;
}
/* Abort if any are busy. */
rw_enter(&exec_lock, RW_WRITER);
for (i = 0; i < count; i++) {
mutex_enter(&proc_lock);
for (pd = proclists; pd->pd_list != NULL; pd++) {
PROCLIST_FOREACH(p, pd->pd_list) {
if (p->p_execsw == &esp[i]) {
mutex_exit(&proc_lock);
rw_exit(&exec_lock);
return EBUSY;
}
}
}
mutex_exit(&proc_lock);
}
/* None are busy, so remove them all. */
for (i = 0; i < count; i++) {
for (it = LIST_FIRST(&ex_head); it != NULL; it = next) {
next = LIST_NEXT(it, ex_list);
if (it->ex_sw == &esp[i]) {
LIST_REMOVE(it, ex_list);
exec_sigcode_free(it->ex_sw->es_emul);
kmem_free(it, sizeof(*it));
break;
}
}
}
/* update execsw[] */
exec_init(0);
rw_exit(&exec_lock);
return 0;
}
/*
* Initialize exec structures. If init_boot is true, also does necessary
* one-time initialization (it's called from main() that way).
* Once the system is multiuser, this should be called with exec_lock held,
* i.e. via exec_{add|remove}().
*/
int
exec_init(int init_boot)
{
const struct execsw **sw;
struct exec_entry *ex;
SLIST_HEAD(,exec_entry) first;
SLIST_HEAD(,exec_entry) any;
SLIST_HEAD(,exec_entry) last;
int i, sz;
if (init_boot) {
/* do one-time initializations */
vaddr_t vmin = 0, vmax;
rw_init(&exec_lock);
exec_map = uvm_km_suballoc(kernel_map, &vmin, &vmax,
maxexec*NCARGS, VM_MAP_PAGEABLE, false, NULL);
pool_init(&exec_pool, NCARGS, 0, 0, PR_NOALIGN|PR_NOTOUCH,
"execargs", &exec_palloc, IPL_NONE);
pool_sethardlimit(&exec_pool, maxexec, "should not happen", 0);
} else {
KASSERT(rw_write_held(&exec_lock));
}
/* Sort each entry onto the appropriate queue. */
SLIST_INIT(&first);
SLIST_INIT(&any);
SLIST_INIT(&last);
sz = 0;
LIST_FOREACH(ex, &ex_head, ex_list) {
switch(ex->ex_sw->es_prio) {
case EXECSW_PRIO_FIRST:
SLIST_INSERT_HEAD(&first, ex, ex_slist);
break;
case EXECSW_PRIO_ANY:
SLIST_INSERT_HEAD(&any, ex, ex_slist);
break;
case EXECSW_PRIO_LAST:
SLIST_INSERT_HEAD(&last, ex, ex_slist);
break;
default:
panic("%s", __func__);
break;
}
sz++;
}
/*
* Create new execsw[]. Ensure we do not try a zero-sized
* allocation.
*/
sw = kmem_alloc(sz * sizeof(struct execsw *) + 1, KM_SLEEP);
i = 0;
SLIST_FOREACH(ex, &first, ex_slist) {
sw[i++] = ex->ex_sw;
}
SLIST_FOREACH(ex, &any, ex_slist) {
sw[i++] = ex->ex_sw;
}
SLIST_FOREACH(ex, &last, ex_slist) {
sw[i++] = ex->ex_sw;
}
/* Replace old execsw[] and free used memory. */
if (execsw != NULL) {
kmem_free(__UNCONST(execsw),
nexecs * sizeof(struct execsw *) + 1);
}
execsw = sw;
nexecs = sz;
/* Figure out the maximum size of an exec header. */
exec_maxhdrsz = sizeof(int);
for (i = 0; i < nexecs; i++) {
if (execsw[i]->es_hdrsz > exec_maxhdrsz)
exec_maxhdrsz = execsw[i]->es_hdrsz;
}
return 0;
}
int
exec_sigcode_alloc(const struct emul *e)
{
vaddr_t va;
vsize_t sz;
int error;
struct uvm_object *uobj;
KASSERT(rw_lock_held(&exec_lock));
if (e == NULL || e->e_sigobject == NULL)
return 0;
sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode;
if (sz == 0)
return 0;
/*
* Create a sigobject for this emulation.
*
* sigobject is an anonymous memory object (just like SYSV shared
* memory) that we keep a permanent reference to and that we map
* in all processes that need this sigcode. The creation is simple:
* we create an object, add a permanent reference to it, map it into
* kernel space, copy the sigcode into it and unmap it.
* We map it with PROT_READ|PROT_EXEC into the process just
* the way sys_mmap() would map it.
*/
if (*e->e_sigobject == NULL) {
uobj = uao_create(sz, 0);
(*uobj->pgops->pgo_reference)(uobj);
va = vm_map_min(kernel_map);
if ((error = uvm_map(kernel_map, &va, round_page(sz),
uobj, 0, 0,
UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
UVM_INH_SHARE, UVM_ADV_RANDOM, 0)))) {
printf("sigcode kernel mapping failed %d\n", error);
(*uobj->pgops->pgo_detach)(uobj);
return error;
}
memcpy((void *)va, e->e_sigcode, sz);
#ifdef PMAP_NEED_PROCWR
pmap_procwr(&proc0, va, sz);
#endif
uvm_unmap(kernel_map, va, va + round_page(sz));
*e->e_sigobject = uobj;
KASSERT(uobj->uo_refs == 1);
} else {
/* if already created, reference++ */
uobj = *e->e_sigobject;
(*uobj->pgops->pgo_reference)(uobj);
}
return 0;
}
void
exec_sigcode_free(const struct emul *e)
{
struct uvm_object *uobj;
KASSERT(rw_lock_held(&exec_lock));
if (e == NULL || e->e_sigobject == NULL)
return;
uobj = *e->e_sigobject;
if (uobj == NULL)
return;
if (uobj->uo_refs == 1)
*e->e_sigobject = NULL; /* I'm the last person to reference. */
(*uobj->pgops->pgo_detach)(uobj);
}
static int
exec_sigcode_map(struct proc *p, const struct emul *e)
{
vaddr_t va;
vsize_t sz;
int error;
struct uvm_object *uobj;
sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode;
if (e->e_sigobject == NULL || sz == 0)
return 0;
uobj = *e->e_sigobject;
if (uobj == NULL)
return 0;
/* Just a hint to uvm_map where to put it. */
va = e->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr,
round_page(sz), p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
#ifdef __alpha__
/*
* Tru64 puts /sbin/loader at the end of user virtual memory,
* which causes the above calculation to put the sigcode at
* an invalid address. Put it just below the text instead.
*/
if (va == (vaddr_t)vm_map_max(&p->p_vmspace->vm_map)) {
va = (vaddr_t)p->p_vmspace->vm_taddr - round_page(sz);
}
#endif
(*uobj->pgops->pgo_reference)(uobj);
error = uvm_map(&p->p_vmspace->vm_map, &va, round_page(sz),
uobj, 0, 0,
UVM_MAPFLAG(UVM_PROT_RX, UVM_PROT_RX, UVM_INH_SHARE,
UVM_ADV_RANDOM, 0));
if (error) {
DPRINTF(("%s, %d: map %p "
"uvm_map %#"PRIxVSIZE"@%#"PRIxVADDR" failed %d\n",
__func__, __LINE__, &p->p_vmspace->vm_map, round_page(sz),
va, error));
(*uobj->pgops->pgo_detach)(uobj);
return error;
}
p->p_sigctx.ps_sigcode = (void *)va;
return 0;
}
/*
* Release a refcount on spawn_exec_data and destroy memory, if this
* was the last one.
*/
static void
spawn_exec_data_release(struct spawn_exec_data *data)
{
membar_release();
if (atomic_dec_32_nv(&data->sed_refcnt) != 0)
return;
membar_acquire();
cv_destroy(&data->sed_cv_child_ready);
mutex_destroy(&data->sed_mtx_child);
if (data->sed_actions)
posix_spawn_fa_free(data->sed_actions,
data->sed_actions->len);
if (data->sed_attrs)
kmem_free(data->sed_attrs,
sizeof(*data->sed_attrs));
kmem_free(data, sizeof(*data));
}
static int
handle_posix_spawn_file_actions(struct posix_spawn_file_actions *actions)
{
struct lwp *l = curlwp;
register_t retval;
int error, newfd;
if (actions == NULL)
return 0;
for (size_t i = 0; i < actions->len; i++) {
const struct posix_spawn_file_actions_entry *fae =
&actions->fae[i];
switch (fae->fae_action) {
case FAE_OPEN:
if (fd_getfile(fae->fae_fildes) != NULL) {
error = fd_close(fae->fae_fildes);
if (error)
return error;
}
error = fd_open(fae->fae_path, fae->fae_oflag,
fae->fae_mode, &newfd);
if (error)
return error;
if (newfd != fae->fae_fildes) {
error = dodup(l, newfd,
fae->fae_fildes, 0, &retval);
if (fd_getfile(newfd) != NULL)
fd_close(newfd);
}
break;
case FAE_DUP2:
error = dodup(l, fae->fae_fildes,
fae->fae_newfildes, 0, &retval);
break;
case FAE_CLOSE:
if (fd_getfile(fae->fae_fildes) == NULL) {
return EBADF;
}
error = fd_close(fae->fae_fildes);
break;
case FAE_CHDIR:
error = do_sys_chdir(l, fae->fae_chdir_path,
UIO_SYSSPACE, &retval);
break;
case FAE_FCHDIR:
error = do_sys_fchdir(l, fae->fae_fildes, &retval);
break;
}
if (error)
return error;
}
return 0;
}
static int
handle_posix_spawn_attrs(struct posix_spawnattr *attrs, struct proc *parent)
{
struct sigaction sigact;
int error = 0;
struct proc *p = curproc;
struct lwp *l = curlwp;
if (attrs == NULL)
return 0;
memset(&sigact, 0, sizeof(sigact));
sigact._sa_u._sa_handler = SIG_DFL;
sigact.sa_flags = 0;
/*
* set state to SSTOP so that this proc can be found by pid.
* see proc_enterpgrp, do_sched_setparam below
*/
mutex_enter(&proc_lock);
/*
* p_stat should be SACTIVE, so we need to adjust the
* parent's p_nstopchild here. For safety, just make sure
* we're on the good side of SDEAD before we adjust.
*/
int ostat = p->p_stat;
KASSERT(ostat < SSTOP);
p->p_stat = SSTOP;
p->p_waited = 0;
p->p_pptr->p_nstopchild++;
mutex_exit(&proc_lock);
/* Set process group */
if (attrs->sa_flags & POSIX_SPAWN_SETPGROUP) {
pid_t mypid = p->p_pid;
pid_t pgrp = attrs->sa_pgroup;
if (pgrp == 0)
pgrp = mypid;
error = proc_enterpgrp(parent, mypid, pgrp, false);
if (error)
goto out;
}
/* Set scheduler policy */
if (attrs->sa_flags & POSIX_SPAWN_SETSCHEDULER)
error = do_sched_setparam(p->p_pid, 0, attrs->sa_schedpolicy,
&attrs->sa_schedparam);
else if (attrs->sa_flags & POSIX_SPAWN_SETSCHEDPARAM) {
error = do_sched_setparam(parent->p_pid, 0,
SCHED_NONE, &attrs->sa_schedparam);
}
if (error)
goto out;
/* Reset user ID's */
if (attrs->sa_flags & POSIX_SPAWN_RESETIDS) {
error = do_setresgid(l, -1, kauth_cred_getgid(l->l_cred), -1,
ID_E_EQ_R | ID_E_EQ_S);
if (error)
goto out;
error = do_setresuid(l, -1, kauth_cred_getuid(l->l_cred), -1,
ID_E_EQ_R | ID_E_EQ_S);
if (error)
goto out;
}
/* Set signal masks/defaults */
if (attrs->sa_flags & POSIX_SPAWN_SETSIGMASK) {
mutex_enter(p->p_lock);
error = sigprocmask1(l, SIG_SETMASK, &attrs->sa_sigmask, NULL);
mutex_exit(p->p_lock);
if (error)
goto out;
}
if (attrs->sa_flags & POSIX_SPAWN_SETSIGDEF) {
/*
* The following sigaction call is using a sigaction
* version 0 trampoline which is in the compatibility
* code only. This is not a problem because for SIG_DFL
* and SIG_IGN, the trampolines are now ignored. If they
* were not, this would be a problem because we are
* holding the exec_lock, and the compat code needs
* to do the same in order to replace the trampoline
* code of the process.
*/
for (int i = 1; i <= NSIG; i++) {
if (sigismember(&attrs->sa_sigdefault, i))
sigaction1(l, i, &sigact, NULL, NULL, 0);
}
}
error = 0;
out:
mutex_enter(&proc_lock);
p->p_stat = ostat;
p->p_pptr->p_nstopchild--;
mutex_exit(&proc_lock);
return error;
}
/*
* A child lwp of a posix_spawn operation starts here and ends up in
* cpu_spawn_return, dealing with all filedescriptor and scheduler
* manipulations in between.
* The parent waits for the child, as it is not clear whether the child
* will be able to acquire its own exec_lock. If it can, the parent can
* be released early and continue running in parallel. If not (or if the
* magic debug flag is passed in the scheduler attribute struct), the
* child rides on the parent's exec lock until it is ready to return
* to userland - and only then releases the parent. This method loses
* concurrency, but improves error reporting.
*/
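/*
 * Editor's sketch of the handshake described above (illustrative only;
 * see the code below for the authoritative ordering):
 *
 * parent: do_posix_spawn()          child: spawn_return()
 *   cv_wait(sed_cv_child_ready)       may cv_signal() early if it got
 *      ...                            its own exec_lock; otherwise it
 *      ...                            signals only after the attrs,
 *      ...                            file actions and execve_runproc()
 *   read sed_error, drop refcount     cpu_spawn_return() or exit1(127)
 */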
static void
spawn_return(void *arg)
{
struct spawn_exec_data *spawn_data = arg;
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
int error;
bool have_reflock;
bool parent_is_waiting = true;
/*
* Check if we can release parent early.
* We either need to have no sed_attrs, or sed_attrs does not
* have POSIX_SPAWN_RETURNERROR or one of the flags that require
* safe access to the parent proc (passed in sed_parent).
* We then try to get the exec_lock, and only if that works, we can
* release the parent here already.
*/
struct posix_spawnattr *attrs = spawn_data->sed_attrs;
if ((!attrs || (attrs->sa_flags
& (POSIX_SPAWN_RETURNERROR|POSIX_SPAWN_SETPGROUP)) == 0)
&& rw_tryenter(&exec_lock, RW_READER)) {
parent_is_waiting = false;
mutex_enter(&spawn_data->sed_mtx_child);
cv_signal(&spawn_data->sed_cv_child_ready);
mutex_exit(&spawn_data->sed_mtx_child);
}
/* don't allow debugger access yet */
rw_enter(&p->p_reflock, RW_WRITER);
have_reflock = true;
/* handle posix_spawnattr */
error = handle_posix_spawn_attrs(attrs, spawn_data->sed_parent);
if (error)
goto report_error;
/* handle posix_spawn_file_actions */
error = handle_posix_spawn_file_actions(spawn_data->sed_actions);
if (error)
goto report_error;
/* now do the real exec */
error = execve_runproc(l, &spawn_data->sed_exec, parent_is_waiting,
true);
have_reflock = false;
if (error == EJUSTRETURN)
error = 0;
else if (error)
goto report_error;
if (parent_is_waiting) {
mutex_enter(&spawn_data->sed_mtx_child);
cv_signal(&spawn_data->sed_cv_child_ready);
mutex_exit(&spawn_data->sed_mtx_child);
}
/* release our refcount on the data */
spawn_exec_data_release(spawn_data);
if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) ==
(PSL_TRACED|PSL_TRACEDCHILD)) {
eventswitchchild(p, TRAP_CHLD, PTRACE_POSIX_SPAWN);
}
/* and finally: leave to userland for the first time */
cpu_spawn_return(l);
/* NOTREACHED */
return;
report_error:
if (have_reflock) {
/*
* We have not passed through execve_runproc(),
* which would have released the p_reflock and also
* taken ownership of the sed_exec part of spawn_data,
* so release/free both here.
*/
rw_exit(&p->p_reflock);
execve_free_data(&spawn_data->sed_exec);
}
if (parent_is_waiting) {
/* pass error to parent */
mutex_enter(&spawn_data->sed_mtx_child);
spawn_data->sed_error = error;
cv_signal(&spawn_data->sed_cv_child_ready);
mutex_exit(&spawn_data->sed_mtx_child);
} else {
rw_exit(&exec_lock);
}
/* release our refcount on the data */
spawn_exec_data_release(spawn_data);
/* done, exit */
mutex_enter(p->p_lock);
/*
* Posix explicitly asks for an exit code of 127 if we report
* errors from the child process - so, unfortunately, there
* is no way to report a more exact error code.
* A NetBSD-specific workaround is POSIX_SPAWN_RETURNERROR as a
* flag bit in the attrp argument to posix_spawn(2), see above.
*/
exit1(l, 127, 0);
}
static __inline char **
posix_spawn_fae_path(struct posix_spawn_file_actions_entry *fae)
{
switch (fae->fae_action) {
case FAE_OPEN:
return &fae->fae_path;
case FAE_CHDIR:
return &fae->fae_chdir_path;
default:
return NULL;
}
}
void
posix_spawn_fa_free(struct posix_spawn_file_actions *fa, size_t len)
{
for (size_t i = 0; i < len; i++) {
char **pathp = posix_spawn_fae_path(&fa->fae[i]);
if (pathp)
kmem_strfree(*pathp);
}
if (fa->len > 0)
kmem_free(fa->fae, sizeof(*fa->fae) * fa->len);
kmem_free(fa, sizeof(*fa));
}
static int
posix_spawn_fa_alloc(struct posix_spawn_file_actions **fap,
const struct posix_spawn_file_actions *ufa, rlim_t lim)
{
struct posix_spawn_file_actions *fa;
struct posix_spawn_file_actions_entry *fae;
char *pbuf = NULL;
int error;
size_t i = 0;
fa = kmem_alloc(sizeof(*fa), KM_SLEEP);
error = copyin(ufa, fa, sizeof(*fa));
if (error || fa->len == 0) {
kmem_free(fa, sizeof(*fa));
return error; /* 0 if not an error, and len == 0 */
}
if (fa->len > lim) {
kmem_free(fa, sizeof(*fa));
return EINVAL;
}
fa->size = fa->len;
size_t fal = fa->len * sizeof(*fae);
fae = fa->fae;
fa->fae = kmem_alloc(fal, KM_SLEEP);
error = copyin(fae, fa->fae, fal);
if (error)
goto out;
pbuf = PNBUF_GET();
for (; i < fa->len; i++) {
char **pathp = posix_spawn_fae_path(&fa->fae[i]);
if (pathp == NULL)
continue;
error = copyinstr(*pathp, pbuf, MAXPATHLEN, &fal);
if (error)
goto out;
*pathp = kmem_alloc(fal, KM_SLEEP);
memcpy(*pathp, pbuf, fal);
}
PNBUF_PUT(pbuf);
*fap = fa;
return 0;
out:
if (pbuf)
PNBUF_PUT(pbuf);
posix_spawn_fa_free(fa, i);
return error;
}
/*
* N.B. increments nprocs upon success. Callers need to drop nprocs if
* they fail for some other reason.
*/
int
check_posix_spawn(struct lwp *l1)
{
int error, tnprocs, count;
uid_t uid;
struct proc *p1;
p1 = l1->l_proc;
uid = kauth_cred_getuid(l1->l_cred);
tnprocs = atomic_inc_uint_nv(&nprocs);
/*
* Although process entries are dynamically created, we still keep
* a global limit on the maximum number we will create.
*/
if (__predict_false(tnprocs >= maxproc))
error = -1;
else
error = kauth_authorize_process(l1->l_cred,
KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL);
if (error) {
atomic_dec_uint(&nprocs);
return EAGAIN;
}
/*
* Enforce limits.
*/
count = chgproccnt(uid, 1);
if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT,
p1, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
&p1->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != 0 &&
__predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) {
(void)chgproccnt(uid, -1);
atomic_dec_uint(&nprocs);
return EAGAIN;
}
return 0;
}
int
do_posix_spawn(struct lwp *l1, pid_t *pid_res, bool *child_ok, const char *path,
struct posix_spawn_file_actions *fa,
struct posix_spawnattr *sa,
char *const *argv, char *const *envp,
execve_fetch_element_t fetch)
{
struct proc *p1, *p2;
struct lwp *l2;
int error;
struct spawn_exec_data *spawn_data;
vaddr_t uaddr = 0;
pid_t pid;
bool have_exec_lock = false;
p1 = l1->l_proc;
/* Allocate and init spawn_data */
spawn_data = kmem_zalloc(sizeof(*spawn_data), KM_SLEEP);
spawn_data->sed_refcnt = 1; /* only parent so far */
cv_init(&spawn_data->sed_cv_child_ready, "pspawn");
mutex_init(&spawn_data->sed_mtx_child, MUTEX_DEFAULT, IPL_NONE);
mutex_enter(&spawn_data->sed_mtx_child);
/*
* Do the first part of the exec now, collect state
* in spawn_data.
*/
error = execve_loadvm(l1, true, path, -1, argv,
envp, fetch, &spawn_data->sed_exec);
if (error == EJUSTRETURN)
error = 0;
else if (error)
goto error_exit;
have_exec_lock = true;
/*
* Allocate virtual address space for the U-area now, while it
* is still easy to abort the fork operation if we're out of
* kernel virtual address space.
*/
uaddr = uvm_uarea_alloc();
if (__predict_false(uaddr == 0)) {
error = ENOMEM;
goto error_exit;
}
/*
* Allocate the new proc. Borrow proc0's vmspace for it; we will
* replace it with the child's own before returning to userland
* in the child.
*/
p2 = proc_alloc();
if (p2 == NULL) {
/* We were unable to allocate a process ID. */
error = EAGAIN;
goto error_exit;
}
/*
* This is a point of no return, we will have to go through
* the child proc to properly clean it up past this point.
*/
pid = p2->p_pid;
/*
* Make a proc table entry for the new process.
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
*/
memset(&p2->p_startzero, 0,
(unsigned) ((char *)&p2->p_endzero - (char *)&p2->p_startzero));
memcpy(&p2->p_startcopy, &p1->p_startcopy,
(unsigned) ((char *)&p2->p_endcopy - (char *)&p2->p_startcopy));
p2->p_vmspace = proc0.p_vmspace;
TAILQ_INIT(&p2->p_sigpend.sp_info);
LIST_INIT(&p2->p_lwps);
LIST_INIT(&p2->p_sigwaiters);
/*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
* Inherit flags we want to keep. The flags related to SIGCHLD
* handling are important in order to keep a consistent behaviour
* for the child after the fork. If we are a 32-bit process, the
* child will be too.
*/
p2->p_flag =
p1->p_flag & (PK_SUGID | PK_NOCLDWAIT | PK_CLDSIGIGN | PK_32);
p2->p_emul = p1->p_emul;
p2->p_execsw = p1->p_execsw;
mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
rw_init(&p2->p_reflock);
cv_init(&p2->p_waitcv, "wait");
cv_init(&p2->p_lwpcv, "lwpwait");
p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
kauth_proc_fork(p1, p2);
p2->p_raslist = NULL;
p2->p_fd = fd_copy();
/* XXX racy */
p2->p_mqueue_cnt = p1->p_mqueue_cnt;
p2->p_cwdi = cwdinit();
/*
* Note: p_limit (rlimit stuff) is copy-on-write, so normally
* we just need to increase pl_refcnt.
*/
if (!p1->p_limit->pl_writeable) {
lim_addref(p1->p_limit);
p2->p_limit = p1->p_limit;
} else {
p2->p_limit = lim_copy(p1->p_limit);
}
p2->p_lflag = 0;
l1->l_vforkwaiting = false;
p2->p_sflag = 0;
p2->p_slflag = 0;
p2->p_pptr = p1;
p2->p_ppid = p1->p_pid;
LIST_INIT(&p2->p_children);
p2->p_aio = NULL;
#ifdef KTRACE
/*
* Copy traceflag and tracefile if enabled.
* If not inherited, these were zeroed above.
*/
if (p1->p_traceflag & KTRFAC_INHERIT) {
mutex_enter(&ktrace_lock);
p2->p_traceflag = p1->p_traceflag;
if ((p2->p_tracep = p1->p_tracep) != NULL)
ktradref(p2);
mutex_exit(&ktrace_lock);
}
#endif
/*
* Create signal actions for the child process.
*/
p2->p_sigacts = sigactsinit(p1, 0);
mutex_enter(p1->p_lock);
p2->p_sflag |=
(p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP));
sched_proc_fork(p1, p2);
mutex_exit(p1->p_lock);
p2->p_stflag = p1->p_stflag;
/*
* p_stats.
* Copy parts of p_stats, and zero out the rest.
*/
p2->p_stats = pstatscopy(p1->p_stats);
/* copy over machdep flags to the new proc */
cpu_proc_fork(p1, p2);
/*
* Prepare remaining parts of spawn data
*/
spawn_data->sed_actions = fa;
spawn_data->sed_attrs = sa;
spawn_data->sed_parent = p1;
/* create LWP */
lwp_create(l1, p2, uaddr, 0, NULL, 0, spawn_return, spawn_data,
&l2, l1->l_class, &l1->l_sigmask, &l1->l_sigstk);
l2->l_ctxlink = NULL; /* reset ucontext link */
/*
* Copy the credential so other references don't see our changes.
* Test to see if this is necessary first, since in the common case
* we won't need a private reference.
*/
if (kauth_cred_geteuid(l2->l_cred) != kauth_cred_getsvuid(l2->l_cred) ||
kauth_cred_getegid(l2->l_cred) != kauth_cred_getsvgid(l2->l_cred)) {
l2->l_cred = kauth_cred_copy(l2->l_cred);
kauth_cred_setsvuid(l2->l_cred, kauth_cred_geteuid(l2->l_cred));
kauth_cred_setsvgid(l2->l_cred, kauth_cred_getegid(l2->l_cred));
}
/* Update the master credentials. */
if (l2->l_cred != p2->p_cred) {
kauth_cred_t ocred;
mutex_enter(p2->p_lock);
ocred = p2->p_cred;
p2->p_cred = kauth_cred_hold(l2->l_cred);
mutex_exit(p2->p_lock);
kauth_cred_free(ocred);
}
*child_ok = true;
spawn_data->sed_refcnt = 2; /* child gets it as well */
#if 0
l2->l_nopreempt = 1; /* start it non-preemptable */
#endif
/*
* It's now safe for the scheduler and other processes to see the
* child process.
*/
mutex_enter(&proc_lock);
if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT)
p2->p_lflag |= PL_CONTROLT;
LIST_INSERT_HEAD(&p1->p_children, p2, p_sibling);
p2->p_exitsig = SIGCHLD; /* signal for parent on exit */
if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) ==
(PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) {
proc_changeparent(p2, p1->p_pptr);
SET(p2->p_slflag, PSL_TRACEDCHILD);
}
p2->p_oppid = p1->p_pid; /* Remember the original parent id. */
LIST_INSERT_AFTER(p1, p2, p_pglist);
LIST_INSERT_HEAD(&allproc, p2, p_list);
p2->p_trace_enabled = trace_is_enabled(p2);
#ifdef __HAVE_SYSCALL_INTERN
(*p2->p_emul->e_syscall_intern)(p2);
#endif
/*
* Make child runnable, set start time, and add to run queue except
* if the parent requested the child to start in SSTOP state.
*/
mutex_enter(p2->p_lock);
getmicrotime(&p2->p_stats->p_start);
lwp_lock(l2);
KASSERT(p2->p_nrlwps == 1);
KASSERT(l2->l_stat == LSIDL);
p2->p_nrlwps = 1;
p2->p_stat = SACTIVE;
setrunnable(l2);
/* LWP now unlocked */
mutex_exit(p2->p_lock);
mutex_exit(&proc_lock);
cv_wait(&spawn_data->sed_cv_child_ready, &spawn_data->sed_mtx_child);
error = spawn_data->sed_error;
mutex_exit(&spawn_data->sed_mtx_child);
spawn_exec_data_release(spawn_data);
rw_exit(&p1->p_reflock);
rw_exit(&exec_lock);
have_exec_lock = false;
*pid_res = pid;
if (error)
return error;
if (p1->p_slflag & PSL_TRACED) {
/* Paranoid check */
mutex_enter(&proc_lock);
if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) !=
(PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) {
mutex_exit(&proc_lock);
return 0;
}
mutex_enter(p1->p_lock);
eventswitch(TRAP_CHLD, PTRACE_POSIX_SPAWN, pid);
}
return 0;
error_exit:
if (have_exec_lock) {
execve_free_data(&spawn_data->sed_exec);
rw_exit(&p1->p_reflock);
rw_exit(&exec_lock);
}
mutex_exit(&spawn_data->sed_mtx_child);
spawn_exec_data_release(spawn_data);
if (uaddr != 0)
uvm_uarea_free(uaddr);
return error;
}
int
sys_posix_spawn(struct lwp *l1, const struct sys_posix_spawn_args *uap,
register_t *retval)
{
/* {
syscallarg(pid_t *) pid;
syscallarg(const char *) path;
syscallarg(const struct posix_spawn_file_actions *) file_actions;
syscallarg(const struct posix_spawnattr *) attrp;
syscallarg(char *const *) argv;
syscallarg(char *const *) envp;
} */
int error;
struct posix_spawn_file_actions *fa = NULL;
struct posix_spawnattr *sa = NULL;
pid_t pid;
bool child_ok = false;
rlim_t max_fileactions;
proc_t *p = l1->l_proc;
/* check_posix_spawn() increments nprocs for us. */
error = check_posix_spawn(l1);
if (error) {
*retval = error;
return 0;
}
/* copy in file_actions struct */
if (SCARG(uap, file_actions) != NULL) {
max_fileactions = 2 * uimin(p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
maxfiles);
error = posix_spawn_fa_alloc(&fa, SCARG(uap, file_actions),
max_fileactions);
if (error)
goto error_exit;
}
/* copyin posix_spawnattr struct */
if (SCARG(uap, attrp) != NULL) {
sa = kmem_alloc(sizeof(*sa), KM_SLEEP);
error = copyin(SCARG(uap, attrp), sa, sizeof(*sa));
if (error)
goto error_exit;
}
/*
* Do the spawn
*/
error = do_posix_spawn(l1, &pid, &child_ok, SCARG(uap, path), fa, sa,
SCARG(uap, argv), SCARG(uap, envp), execve_fetch_element);
if (error)
goto error_exit;
if (error == 0 && SCARG(uap, pid) != NULL)
error = copyout(&pid, SCARG(uap, pid), sizeof(pid));
*retval = error;
return 0;
error_exit:
if (!child_ok) {
(void)chgproccnt(kauth_cred_getuid(l1->l_cred), -1);
atomic_dec_uint(&nprocs);
if (sa)
kmem_free(sa, sizeof(*sa));
if (fa)
posix_spawn_fa_free(fa, fa->len);
}
*retval = error;
return 0;
}
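/*
 * Editor's sketch: a minimal userland caller of the syscall above via
 * the standard posix_spawn(3) interface (hypothetical example, not part
 * of this file):
 *
 * #include <spawn.h>
 * #include <err.h>
 * #include <string.h>
 *
 * extern char **environ;
 *
 * pid_t pid;
 * char *argv[] = { "ls", "-l", NULL };
 * int error = posix_spawn(&pid, "/bin/ls", NULL, NULL, argv, environ);
 * if (error != 0)
 *         errx(1, "posix_spawn: %s", strerror(error));
 */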
void
exec_free_emul_arg(struct exec_package *epp)
{
if (epp->ep_emul_arg_free != NULL) {
KASSERT(epp->ep_emul_arg != NULL);
(*epp->ep_emul_arg_free)(epp->ep_emul_arg);
epp->ep_emul_arg_free = NULL;
epp->ep_emul_arg = NULL;
} else {
KASSERT(epp->ep_emul_arg == NULL);
}
}
#ifdef DEBUG_EXEC
static void
dump_vmcmds(const struct exec_package * const epp, size_t x, int error)
{
struct exec_vmcmd *vp = &epp->ep_vmcmds.evs_cmds[0];
size_t j;
if (error == 0)
DPRINTF(("vmcmds %u\n", epp->ep_vmcmds.evs_used));
else
DPRINTF(("vmcmds %zu/%u, error %d\n", x,
epp->ep_vmcmds.evs_used, error));
for (j = 0; j < epp->ep_vmcmds.evs_used; j++) {
DPRINTF(("vmcmd[%zu] = vmcmd_map_%s %#"
PRIxVADDR"/%#"PRIxVSIZE" fd@%#"
PRIxVSIZE" prot=0%o flags=%d\n", j,
vp[j].ev_proc == vmcmd_map_pagedvn ?
"pagedvn" :
vp[j].ev_proc == vmcmd_map_readvn ?
"readvn" :
vp[j].ev_proc == vmcmd_map_zero ?
"zero" : "*unknown*",
vp[j].ev_addr, vp[j].ev_len,
vp[j].ev_offset, vp[j].ev_prot,
vp[j].ev_flags));
if (error != 0 && j == x)
DPRINTF((" ^--- failed\n"));
}
}
#endif
/* $NetBSD: strncpy.c,v 1.4 2018/02/04 01:13:45 mrg Exp $ */
/*-
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Chris Torek.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
#if 0
static char sccsid[] = "@(#)strncpy.c 8.1 (Berkeley) 6/4/93";
#else
__RCSID("$NetBSD: strncpy.c,v 1.4 2018/02/04 01:13:45 mrg Exp $");
#endif
#endif /* LIBC_SCCS and not lint */
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <assert.h>
#include <string.h>
#else
#include <lib/libkern/libkern.h>
#endif
#ifdef _FORTIFY_SOURCE
#undef strncpy
#endif
/*
* Copy src to dst, truncating or null-padding to always copy n bytes.
* Return dst.
*/
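/*
 * For example (editor's note), with char buf[4]:
 * strncpy(buf, "ab", 4) yields { 'a', 'b', '\0', '\0' }, while
 * strncpy(buf, "abcdef", 4) yields { 'a', 'b', 'c', 'd' } with no
 * terminating NUL.
 */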
char *
strncpy(char *dst, const char *src, size_t n)
{
if (n != 0) {
char *d = dst;
const char *s = src;
do {
if ((*d++ = *s++) == 0) {
/* NUL pad the remaining n-1 bytes */
while (--n != 0)
*d++ = 0;
break;
}
} while (--n != 0);
}
return (dst);
}
/* $NetBSD: procfs.h,v 1.84 2024/01/17 10:20:12 hannken Exp $ */
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs.h 8.9 (Berkeley) 5/14/95
*/
/*
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs.h 8.9 (Berkeley) 5/14/95
*/
/* This also pulls in __HAVE_PROCFS_MACHDEP */
#include <sys/ptrace.h>
#ifdef _KERNEL
#include <sys/proc.h>
/*
* The different types of node in a procfs filesystem
*/
typedef enum {
PFSauxv, /* ELF Auxiliary Vector */
PFSchroot, /* the process's current root directory */
PFScmdline, /* process command line args */
PFScpuinfo, /* CPU info (if -o linux) */
PFScpustat, /* status info (if -o linux) */
PFScurproc, /* symbolic link for curproc */
PFScwd, /* the process's current working directory */
PFSdevices, /* major/device name mappings (if -o linux) */
PFSemul, /* the process's emulation */
PFSenviron, /* process environment */
PFSexe, /* symlink to the executable file */
PFSfd, /* a directory containing the process's open fd's */
PFSfile, /* the executable file */
PFSfpregs, /* the process's FP register set */
PFSloadavg, /* load average (if -o linux) */
PFSlimit, /* resource limits */
PFSmap, /* memory map */
PFSmaps, /* memory map, Linux style (if -o linux) */
PFSmem, /* the process's memory image */
PFSmeminfo, /* system memory info (if -o linux) */
PFSmounts, /* mounted filesystems (if -o linux) */
PFSnote, /* process notifier */
PFSnotepg, /* process group notifier */
PFSproc, /* a process-specific sub-directory */
PFSregs, /* the process's register set */
PFSroot, /* the filesystem root */
PFSself, /* like curproc, but this is the Linux name */
PFSstat, /* process status (if -o linux) */
PFSstatm, /* process memory info (if -o linux) */
PFSstatus, /* process status */
PFStask, /* task subdirectory (if -o linux) */
PFSuptime, /* elapsed time since boot (if -o linux) */
PFSversion, /* kernel version (if -o linux) */
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODE_TYPES
#endif
PFSlast, /* track number of types */
} pfstype;
/*
* control data for the proc file system.
*/
struct pfskey {
pfstype pk_type; /* type of procfs node */
pid_t pk_pid; /* associated process */
int pk_fd; /* associated fd if not -1 */
};
struct pfsnode {
LIST_ENTRY(pfsnode) pfs_hash; /* per pid hash list */
struct vnode *pfs_vnode; /* vnode associated with this pfsnode */
struct mount *pfs_mount; /* mount associated with this pfsnode */
struct pfskey pfs_key;
#define pfs_type pfs_key.pk_type
#define pfs_pid pfs_key.pk_pid
#define pfs_fd pfs_key.pk_fd
mode_t pfs_mode; /* mode bits for stat() */
u_long pfs_flags; /* open flags */
uint64_t pfs_fileno; /* unique file id */
};
#define PROCFS_NOTELEN 64 /* max length of a note (/proc/$pid/note) */
#define PROCFS_MAXNAMLEN 255
#endif /* _KERNEL */
struct procfs_args {
int version;
int flags;
};
#define PROCFS_ARGSVERSION 1
#define PROCFSMNT_LINUXCOMPAT 0x01
#define PROCFSMNT_BITS "\177\20" \
"b\00linuxcompat\0"
/*
* Kernel stuff follows
*/
#ifdef _KERNEL
#define CNEQ(cnp, s, len) \
((cnp)->cn_namelen == (len) && \
(memcmp((s), (cnp)->cn_nameptr, (len)) == 0))
#define UIO_MX 32
static __inline ino_t
procfs_fileno(pid_t _pid, pfstype _type, int _fd)
{
ino_t _ino;
switch (_type) {
case PFSroot:
return 2;
case PFScurproc:
return 3;
case PFSself:
return 4;
default:
_ino = _pid + 1;
if (_fd != -1)
_ino = _ino << 32 | _fd;
return _ino * PFSlast + _type;
}
}
#define PROCFS_FILENO(pid, type, fd) procfs_fileno(pid, type, fd)
#define PROCFS_TYPE(type) ((type) % PFSlast)
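/*
 * Editor's note: procfs_fileno() gives the root, curproc and self nodes
 * fixed inode numbers (2, 3 and 4) and derives every other inode from
 * the pid (folding in the fd for per-fd nodes), scaled by PFSlast so
 * that different node types for the same process never collide; for
 * those derived inodes, PROCFS_TYPE() recovers the node type from the
 * low bits.
 */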
struct procfsmount {
int pmnt_flags;
};
#define VFSTOPROC(mp) ((struct procfsmount *)(mp)->mnt_data)
/*
* Convert between pfsnode vnode
*/
#define VTOPFS(vp) ((struct pfsnode *)(vp)->v_data)
#define PFSTOV(pfs) ((pfs)->pfs_vnode)
typedef struct vfs_namemap vfs_namemap_t;
struct vfs_namemap {
const char *nm_name;
int nm_val;
};
int vfs_getuserstr(struct uio *, char *, int *);
const vfs_namemap_t *vfs_findname(const vfs_namemap_t *, const char *, int);
struct mount;
struct proc *procfs_proc_find(struct mount *, pid_t);
bool procfs_use_linux_compat(struct mount *);
static inline bool
procfs_proc_is_linux_compat(void)
{
const char *emulname = curlwp->l_proc->p_emul->e_name;
return (strncmp(emulname, "linux", 5) == 0);
}
int procfs_proc_lock(struct mount *, int, struct proc **, int);
void procfs_proc_unlock(struct proc *);
int procfs_allocvp(struct mount *, struct vnode **, pid_t, pfstype, int);
int procfs_donote(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doregs(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_dofpregs(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_domem(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_do_pid_stat(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_dostatus(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_domap(struct lwp *, struct proc *, struct pfsnode *,
struct uio *, int);
int procfs_doprocargs(struct lwp *, struct proc *, struct pfsnode *,
struct uio *, int);
int procfs_domeminfo(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_dodevices(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_docpuinfo(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_docpustat(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doloadavg(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_do_pid_statm(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_dofd(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_douptime(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_domounts(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doemul(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doversion(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doauxv(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_dolimit(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
void procfs_hashrem(struct pfsnode *);
int procfs_getfp(struct pfsnode *, struct proc *, struct file **);
/* functions to check whether or not files should be displayed */
int procfs_validauxv(struct lwp *, struct mount *);
int procfs_validfile(struct lwp *, struct mount *);
int procfs_validfpregs(struct lwp *, struct mount *);
int procfs_validregs(struct lwp *, struct mount *);
int procfs_validmap(struct lwp *, struct mount *);
int procfs_rw(void *);
int procfs_getcpuinfstr(char *, size_t *);
#define PROCFS_LOCKED 0x01
#define PROCFS_WANT 0x02
extern int (**procfs_vnodeop_p)(void *);
extern struct vfsops procfs_vfsops;
int procfs_root(struct mount *, int, struct vnode **);
#ifdef __HAVE_PROCFS_MACHDEP
struct vattr;
void procfs_machdep_allocvp(struct vnode *);
int procfs_machdep_rw(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_machdep_getattr(struct vnode *, struct vattr *, struct proc *);
#endif
#endif /* _KERNEL */
/* $NetBSD: subr_specificdata.c,v 1.14 2017/06/01 02:45:13 chs Exp $ */
/*-
* Copyright (c) 2006, 2007 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 2006 YAMAMOTO Takashi.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_specificdata.c,v 1.14 2017/06/01 02:45:13 chs Exp $");
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/specificdata.h>
#include <sys/queue.h>
#include <sys/mutex.h>
/*
* Locking notes:
*
* The specdataref_container pointer in the specificdata_reference
* is volatile. To read it, you must hold EITHER the domain lock
* or the ref lock. To write it, you must hold BOTH the domain lock
* and the ref lock. The locks must be acquired in the following
* order:
* domain -> ref
*/
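/*
 * Editor's note: in practice a writer follows the pattern used by the
 * slow path of specificdata_setspecific() below:
 *
 * mutex_enter(&sd->sd_lock);
 * mutex_enter(&ref->specdataref_lock);
 * ref->specdataref_container = ...;
 * mutex_exit(&ref->specdataref_lock);
 * mutex_exit(&sd->sd_lock);
 *
 * while a reader may take either lock alone.
 */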
typedef struct {
specificdata_dtor_t ski_dtor;
} specificdata_key_impl;
struct specificdata_container {
size_t sc_nkey;
LIST_ENTRY(specificdata_container) sc_list;
void * sc_data[]; /* variable length */
};
#define SPECIFICDATA_CONTAINER_BYTESIZE(n) \
(sizeof(struct specificdata_container) + ((n) * sizeof(void *)))
struct specificdata_domain {
kmutex_t sd_lock;
unsigned int sd_nkey;
LIST_HEAD(, specificdata_container) sd_list;
specificdata_key_impl *sd_keys;
};
static void
specificdata_container_link(specificdata_domain_t sd,
specificdata_container_t sc)
{
LIST_INSERT_HEAD(&sd->sd_list, sc, sc_list);
}
static void
specificdata_container_unlink(specificdata_domain_t sd,
specificdata_container_t sc)
{
LIST_REMOVE(sc, sc_list);
}
static void
specificdata_destroy_datum(specificdata_domain_t sd,
specificdata_container_t sc, specificdata_key_t key)
{
specificdata_dtor_t dtor;
void *data;
if (key >= sc->sc_nkey)
return;
KASSERT(key < sd->sd_nkey);
data = sc->sc_data[key];
dtor = sd->sd_keys[key].ski_dtor;
if (dtor != NULL) {
if (data != NULL) {
sc->sc_data[key] = NULL;
(*dtor)(data);
}
} else {
KASSERT(data == NULL);
}
}
static void
specificdata_noop_dtor(void *data)
{
/* nothing */
}
/*
* specificdata_domain_create --
* Create a specificdata domain.
*/
specificdata_domain_t
specificdata_domain_create(void)
{
specificdata_domain_t sd;
sd = kmem_zalloc(sizeof(*sd), KM_SLEEP);
mutex_init(&sd->sd_lock, MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&sd->sd_list);
return (sd);
}
/*
* specificdata_domain_delete --
* Destroy a specificdata domain.
*/
void
specificdata_domain_delete(specificdata_domain_t sd)
{
panic("specificdata_domain_delete: not implemented");
}
/*
* specificdata_key_create --
* Create a specificdata key for a domain.
*
* Note: This is a rare operation.
*/
int
specificdata_key_create(specificdata_domain_t sd, specificdata_key_t *keyp,
specificdata_dtor_t dtor)
{
specificdata_key_impl *newkeys;
specificdata_key_t key = 0;
size_t nsz;
ASSERT_SLEEPABLE();
if (dtor == NULL)
dtor = specificdata_noop_dtor;
mutex_enter(&sd->sd_lock);
if (sd->sd_keys == NULL)
goto needalloc;
for (; key < sd->sd_nkey; key++) {
if (sd->sd_keys[key].ski_dtor == NULL)
goto gotit;
}
needalloc:
nsz = (sd->sd_nkey + 1) * sizeof(*newkeys);
/* XXXSMP allocating memory while holding a lock. */
newkeys = kmem_zalloc(nsz, KM_SLEEP);
if (sd->sd_keys != NULL) {
size_t osz = sd->sd_nkey * sizeof(*newkeys);
memcpy(newkeys, sd->sd_keys, osz);
kmem_free(sd->sd_keys, osz);
}
sd->sd_keys = newkeys;
sd->sd_nkey++;
gotit:
sd->sd_keys[key].ski_dtor = dtor;
mutex_exit(&sd->sd_lock);
*keyp = key;
return (0);
}
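/*
 * Editor's sketch of typical use of this API (hypothetical consumer,
 * not part of this file):
 *
 * static specificdata_domain_t my_domain;
 * static specificdata_key_t my_key;
 *
 * my_domain = specificdata_domain_create();
 * specificdata_key_create(my_domain, &my_key, my_dtor);
 *
 * specificdata_init(my_domain, &obj->ref);
 * specificdata_setspecific(my_domain, &obj->ref, my_key, datum);
 * datum = specificdata_getspecific(my_domain, &obj->ref, my_key);
 * specificdata_fini(my_domain, &obj->ref);
 */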
/*
* specificdata_key_delete --
* Destroy a specificdata key for a domain.
*
* Note: This is a rare operation.
*/
void
specificdata_key_delete(specificdata_domain_t sd, specificdata_key_t key)
{
specificdata_container_t sc;
mutex_enter(&sd->sd_lock);
if (key >= sd->sd_nkey)
goto out;
/*
* Traverse all of the specificdata containers in the domain
* and destroy the datum for the dying key.
*/
LIST_FOREACH(sc, &sd->sd_list, sc_list) {
specificdata_destroy_datum(sd, sc, key);
}
sd->sd_keys[key].ski_dtor = NULL;
out:
mutex_exit(&sd->sd_lock);
}
/*
* specificdata_init --
* Initialize a specificdata container for operation in the
* specified domain.
*/
int
specificdata_init(specificdata_domain_t sd, specificdata_reference *ref)
{
/*
* Just NULL-out the container pointer; we'll allocate the
* container the first time specificdata is put into it.
*/
ref->specdataref_container = NULL;
mutex_init(&ref->specdataref_lock, MUTEX_DEFAULT, IPL_NONE);
return (0);
}
/*
* specificdata_fini --
* Destroy a specificdata container. We destroy all of the datums
* stuffed into the container just as if the key were destroyed.
*/
void
specificdata_fini(specificdata_domain_t sd, specificdata_reference *ref)
{
specificdata_container_t sc;
specificdata_key_t key;
ASSERT_SLEEPABLE();
mutex_destroy(&ref->specdataref_lock);
sc = ref->specdataref_container;
if (sc == NULL)
return;
ref->specdataref_container = NULL;
mutex_enter(&sd->sd_lock);
specificdata_container_unlink(sd, sc);
for (key = 0; key < sc->sc_nkey; key++) {
specificdata_destroy_datum(sd, sc, key);
}
mutex_exit(&sd->sd_lock);
kmem_free(sc, SPECIFICDATA_CONTAINER_BYTESIZE(sc->sc_nkey));
}
/*
* specificdata_getspecific --
* Get a datum from a container.
*/
void *
specificdata_getspecific(specificdata_domain_t sd, specificdata_reference *ref,
specificdata_key_t key)
{
specificdata_container_t sc;
void *data = NULL;
mutex_enter(&ref->specdataref_lock);
sc = ref->specdataref_container;
if (sc != NULL && key < sc->sc_nkey)
data = sc->sc_data[key];
mutex_exit(&ref->specdataref_lock);
return (data);
}
/*
* specificdata_getspecific_unlocked --
* Get a datum from a container in a lockless fashion.
*
* Note: When using this routine, care must be taken to ensure
* that no other thread could cause the specificdata_reference
* to become invalid (i.e. point at the wrong container) by
* issuing a setspecific call or destroying the container.
*/
void *
specificdata_getspecific_unlocked(specificdata_domain_t sd,
specificdata_reference *ref,
specificdata_key_t key)
{
specificdata_container_t sc;
sc = ref->specdataref_container;
if (sc != NULL && key < sc->sc_nkey)
return (sc->sc_data[key]);
return (NULL);
}
/*
* specificdata_setspecific --
* Put a datum into a container.
*/
void
specificdata_setspecific(specificdata_domain_t sd,
specificdata_reference *ref,
specificdata_key_t key, void *data)
{
specificdata_container_t sc, newsc;
size_t newnkey, sz;
ASSERT_SLEEPABLE();
mutex_enter(&ref->specdataref_lock);
sc = ref->specdataref_container;
if (__predict_true(sc != NULL && key < sc->sc_nkey)) {
sc->sc_data[key] = data;
mutex_exit(&ref->specdataref_lock);
return;
}
mutex_exit(&ref->specdataref_lock);
/*
* Slow path: need to resize.
*/
mutex_enter(&sd->sd_lock);
newnkey = sd->sd_nkey;
if (key >= newnkey) {
mutex_exit(&sd->sd_lock);
panic("specificdata_setspecific");
}
sz = SPECIFICDATA_CONTAINER_BYTESIZE(newnkey);
newsc = kmem_zalloc(sz, KM_SLEEP);
newsc->sc_nkey = newnkey;
mutex_enter(&ref->specdataref_lock);
sc = ref->specdataref_container;
if (sc != NULL) {
if (key < sc->sc_nkey) {
/*
* Someone beat us to the punch. Unwind and put
* the object into the now large enough container.
*/
sc->sc_data[key] = data;
mutex_exit(&ref->specdataref_lock);
mutex_exit(&sd->sd_lock);
kmem_free(newsc, sz);
return;
}
specificdata_container_unlink(sd, sc);
memcpy(newsc->sc_data, sc->sc_data,
sc->sc_nkey * sizeof(void *));
}
newsc->sc_data[key] = data;
specificdata_container_link(sd, newsc);
ref->specdataref_container = newsc;
mutex_exit(&ref->specdataref_lock);
mutex_exit(&sd->sd_lock);
if (sc != NULL)
kmem_free(sc, SPECIFICDATA_CONTAINER_BYTESIZE(sc->sc_nkey));
}
/* $NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $ */
/*-
* Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
/*
* The vnode cache subsystem.
*
* Life-cycle
*
* Normally, there are two points where new vnodes are created:
* VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
* starts in one of the following ways:
*
* - Allocation, via vcache_get(9) or vcache_new(9).
* - Reclamation of inactive vnode, via vcache_vget(9).
*
* Recycling from a free list, via getnewvnode(9) -> getcleanvnode(9),
* was another, traditional way. Currently, only the draining thread
* recycles vnodes. This behaviour might be revisited.
*
* The life-cycle ends when the last reference is dropped, usually
* in VOP_REMOVE(9). In that case, VOP_INACTIVE(9) is called to inform
* the file system that the vnode is inactive. Via this call, the file
* system indicates whether the vnode can be recycled (usually, it checks
* its own references, e.g. the link count, or whether the file was
* removed).
*
* Depending on that indication, the vnode is either put onto a free
* list (the cache), or cleaned via vcache_reclaim, which calls
* VOP_RECLAIM(9) to disassociate the underlying file system from the
* vnode, and finally destroyed.
*
* Vnode state
*
* A vnode is always in one of six states:
* - MARKER This is a marker vnode to help list traversal. It
* will never change its state.
* - LOADING Vnode is associating with the underlying file system
* and is not yet ready to use.
* - LOADED Vnode has an associated underlying file system and is
* ready to use.
* - BLOCKED Vnode is active but cannot get new references.
* - RECLAIMING Vnode is disassociating from the underlying file
* system.
* - RECLAIMED Vnode has disassociated from the underlying file
* system and is dead.
*
* Valid state changes are:
* LOADING -> LOADED
* Vnode has been initialised in vcache_get() or
* vcache_new() and is ready to use.
* BLOCKED -> RECLAIMING
* Vnode starts disassociation from underlying file
* system in vcache_reclaim().
* RECLAIMING -> RECLAIMED
* Vnode finished disassociation from underlying file
* system in vcache_reclaim().
* LOADED -> BLOCKED
* Either vcache_rekey*() is changing the vnode key or
* vrelel() is about to call VOP_INACTIVE().
* BLOCKED -> LOADED
* The block condition is over.
* LOADING -> RECLAIMED
* Either vcache_get() or vcache_new() failed to
* associate the underlying file system or vcache_rekey*()
* drops a vnode used as placeholder.
*
* Of these states, LOADING, BLOCKED and RECLAIMING are intermediate,
* and it is possible to wait for a state change.
*
* State is protected by v_interlock, with one exception: since changing
* the state away from LOADING requires both v_interlock and vcache_lock,
* it is possible to check "state == LOADING" while holding only
* vcache_lock. See vcache_get() for details.
*
* Reference counting
*
* A vnode is considered active if its reference count
* (vnode_t::v_usecount) is non-zero. The count is maintained using
* the vref(9), vrele(9) and vput(9) routines. Typical holders of
* references are e.g. open files, current working directories and
* mount points.
*
* v_usecount is adjusted with atomic operations; however, to change it
* from a non-zero value to zero, the interlock must also be held.
*/
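/*
* Illustrative sketch (not compiled): how a typical consumer drives this
* life-cycle. The inode-number key and the sequence of VOP calls are
* hypothetical; vcache_get(), vn_lock(), VOP_UNLOCK() and vrele() are the
* real interfaces, see vcache_get() below and vnode(9).
*/
#if 0
ino_t ino = 2; /* hypothetical key identifying the fs node */
struct vnode *vp;
int error;
/* LOADING -> LOADED happens inside vcache_get() on first use. */
error = vcache_get(mp, &ino, sizeof(ino), &vp);
if (error != 0)
return error;
/* Use the vnode; take the vnode lock around VOP_*() calls. */
vn_lock(vp, LK_SHARED | LK_RETRY);
/* ... VOP_GETATTR(), VOP_READ(), ... */
VOP_UNLOCK(vp);
/* Drop the reference; the last vrele() may deactivate and recycle. */
vrele(vp);
#endif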
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $");
#ifdef _KERNEL_OPT
#include "opt_pax.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/hash.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pax.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/threadpool.h>
#include <sys/vnode_impl.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_stat.h>
/* Flags to vrelel. */
#define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */
#define LRU_VRELE 0
#define LRU_FREE 1
#define LRU_HOLD 2
#define LRU_COUNT 3
/*
* There are three lru lists: one holds vnodes waiting for async release,
* one is for vnodes which have no buffer/page references and one for those
* which do (i.e. v_holdcnt is non-zero). We put the lists into a single,
* private cache line as vnodes migrate between them while under the same
* lock (vdrain_lock).
*/
typedef struct {
vnode_impl_t *li_marker;
} lru_iter_t;
u_int numvnodes __cacheline_aligned;
static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned;
static struct threadpool *threadpool;
static struct threadpool_job vdrain_job;
static struct threadpool_job vrele_job;
static kmutex_t vdrain_lock __cacheline_aligned;
SLIST_HEAD(hashhead, vnode_impl);
static kmutex_t vcache_lock __cacheline_aligned;
static kcondvar_t vcache_cv;
static u_int vcache_hashsize;
static u_long vcache_hashmask;
static struct hashhead *vcache_hashtab;
static pool_cache_t vcache_pool;
static void lru_requeue(vnode_t *, vnodelst_t *);
static vnodelst_t * lru_which(vnode_t *);
static vnode_impl_t * lru_iter_first(int, lru_iter_t *);
static vnode_impl_t * lru_iter_next(lru_iter_t *);
static void lru_iter_release(lru_iter_t *);
static vnode_impl_t * vcache_alloc(void);
static void vcache_dealloc(vnode_impl_t *);
static void vcache_free(vnode_impl_t *);
static void vcache_init(void);
static void vcache_reinit(void);
static void vcache_reclaim(vnode_t *);
static void vrele_deferred(vnode_impl_t *);
static void vrelel(vnode_t *, int, int);
static void vnpanic(vnode_t *, const char *, ...)
__printflike(2, 3);
static bool vdrain_one(u_int);
static void vdrain_task(struct threadpool_job *);
static void vrele_task(struct threadpool_job *);
/* Routines having to do with the management of the vnode table. */
/*
* The high bit of v_usecount is a gate for vcache_tryvget(). It's set
* only when the vnode state is LOADED.
* The next bit of v_usecount is a flag for vrelel(). It's set
* from vcache_vget() and vcache_tryvget() whenever the operation succeeds.
*/
#define VUSECOUNT_MASK 0x3fffffff
#define VUSECOUNT_GATE 0x80000000
#define VUSECOUNT_VGET 0x40000000
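/*
* For example, a v_usecount value of 0x80000002 has VUSECOUNT_GATE set
* (state LOADED, so vcache_tryvget() may succeed), VUSECOUNT_VGET clear,
* and a reference count of 2; vrefcnt() below returns 2 for it.
*/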
/*
* Return the current usecount of a vnode.
*/
inline int
vrefcnt(struct vnode *vp)
{
return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK;
}
/* Vnode state operations and diagnostics. */
#if defined(DIAGNOSTIC)
#define VSTATE_VALID(state) \
((state) != VS_ACTIVE && (state) != VS_MARKER)
#define VSTATE_GET(vp) \
vstate_assert_get((vp), __func__, __LINE__)
#define VSTATE_CHANGE(vp, from, to) \
vstate_assert_change((vp), (from), (to), __func__, __LINE__)
#define VSTATE_WAIT_STABLE(vp) \
vstate_assert_wait_stable((vp), __func__, __LINE__)
void
_vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
bool has_lock)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
int refcnt = vrefcnt(vp);
if (!has_lock) {
enum vnode_state vstate = atomic_load_relaxed(&vip->vi_state);
if (state == VS_ACTIVE && refcnt > 0 &&
(vstate == VS_LOADED || vstate == VS_BLOCKED))
return;
if (vstate == state)
return;
mutex_enter((vp)->v_interlock);
}
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
if ((state == VS_ACTIVE && refcnt > 0 &&
(vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
vip->vi_state == state) {
if (!has_lock)
mutex_exit((vp)->v_interlock);
return;
}
vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
vstate_name(vip->vi_state), refcnt,
vstate_name(state), func, line);
}
static enum vnode_state
vstate_assert_get(vnode_t *vp, const char *func, int line)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
if (! VSTATE_VALID(vip->vi_state))
vnpanic(vp, "state is %s at %s:%d",
vstate_name(vip->vi_state), func, line);
return vip->vi_state;
}
static void
vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
if (! VSTATE_VALID(vip->vi_state))
vnpanic(vp, "state is %s at %s:%d",
vstate_name(vip->vi_state), func, line);
while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
cv_wait(&vp->v_cv, vp->v_interlock);
if (! VSTATE_VALID(vip->vi_state))
vnpanic(vp, "state is %s at %s:%d",
vstate_name(vip->vi_state), func, line);
}
static void
vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
const char *func, int line)
{
bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE);
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
if (from == VS_LOADING)
KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
if (! VSTATE_VALID(from))
vnpanic(vp, "from is %s at %s:%d",
vstate_name(from), func, line);
if (! VSTATE_VALID(to))
vnpanic(vp, "to is %s at %s:%d",
vstate_name(to), func, line);
if (vip->vi_state != from)
vnpanic(vp, "from is %s, expected %s at %s:%d\n",
vstate_name(vip->vi_state), vstate_name(from), func, line);
if ((from == VS_LOADED) != gated)
vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n",
vstate_name(vip->vi_state), gated, func, line);
/* Open/close the gate for vcache_tryvget(). */
if (to == VS_LOADED) {
membar_release();
atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
} else {
atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
}
atomic_store_relaxed(&vip->vi_state, to);
if (from == VS_LOADING)
cv_broadcast(&vcache_cv);
if (to == VS_LOADED || to == VS_RECLAIMED)
cv_broadcast(&vp->v_cv);
}
#else /* defined(DIAGNOSTIC) */
#define VSTATE_GET(vp) \
(VNODE_TO_VIMPL((vp))->vi_state)
#define VSTATE_CHANGE(vp, from, to) \
vstate_change((vp), (from), (to))
#define VSTATE_WAIT_STABLE(vp) \
vstate_wait_stable((vp))
void
_vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
bool has_lock)
{
}
static void
vstate_wait_stable(vnode_t *vp)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
cv_wait(&vp->v_cv, vp->v_interlock);
}
static void
vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
/* Open/close the gate for vcache_tryvget(). */
if (to == VS_LOADED) {
membar_release();
atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
} else {
atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
}
atomic_store_relaxed(&vip->vi_state, to);
if (from == VS_LOADING)
cv_broadcast(&vcache_cv);
if (to == VS_LOADED || to == VS_RECLAIMED)
cv_broadcast(&vp->v_cv);
}
#endif /* defined(DIAGNOSTIC) */
void
vfs_vnode_sysinit(void)
{
int error __diagused, i;
dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
KASSERT(dead_rootmount != NULL);
dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
for (i = 0; i < LRU_COUNT; i++) {
TAILQ_INIT(&lru_list[i]);
}
vcache_init();
error = threadpool_get(&threadpool, PRI_NONE);
KASSERTMSG((error == 0), "threadpool_get failed: %d", error);
threadpool_job_init(&vdrain_job, vdrain_task, &vdrain_lock, "vdrain");
threadpool_job_init(&vrele_job, vrele_task, &vdrain_lock, "vrele");
}
/*
* Allocate a new marker vnode.
*/
vnode_t *
vnalloc_marker(struct mount *mp)
{
vnode_impl_t *vip;
vnode_t *vp;
vip = pool_cache_get(vcache_pool, PR_WAITOK);
memset(vip, 0, sizeof(*vip));
vp = VIMPL_TO_VNODE(vip);
uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
vp->v_mount = mp;
vp->v_type = VBAD;
vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
klist_init(&vip->vi_klist.vk_klist);
vp->v_klist = &vip->vi_klist;
vip->vi_state = VS_MARKER;
return vp;
}
/*
* Free a marker vnode.
*/
void
vnfree_marker(vnode_t *vp)
{
vnode_impl_t *vip;
vip = VNODE_TO_VIMPL(vp);
KASSERT(vip->vi_state == VS_MARKER);
mutex_obj_free(vp->v_interlock);
uvm_obj_destroy(&vp->v_uobj, true);
klist_fini(&vip->vi_klist.vk_klist);
pool_cache_put(vcache_pool, vip);
}
/*
* Test a vnode for being a marker vnode.
*/
bool
vnis_marker(vnode_t *vp)
{
return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
}
/*
* Return the lru list this node should be on.
*/
static vnodelst_t *
lru_which(vnode_t *vp)
{
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_holdcnt > 0)
return &lru_list[LRU_HOLD];
else
return &lru_list[LRU_FREE];
}
/*
* Put vnode to end of given list.
* Both the current and the new list may be NULL, used on vnode alloc/free.
* Adjust numvnodes and signal vdrain thread if there is work.
*/
static void
lru_requeue(vnode_t *vp, vnodelst_t *listhd)
{
vnode_impl_t *vip;
int d;
/*
* If the vnode is on the correct list, and was put there recently,
* then leave it be, thus avoiding huge cache and lock contention.
*/
vip = VNODE_TO_VIMPL(vp);
if (listhd == vip->vi_lrulisthd &&
(getticks() - vip->vi_lrulisttm) < hz) {
return;
}
mutex_enter(&vdrain_lock);
d = 0;
if (vip->vi_lrulisthd != NULL)
TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
else
d++;
vip->vi_lrulisthd = listhd;
vip->vi_lrulisttm = getticks();
if (vip->vi_lrulisthd != NULL)
TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
else
d--;
if (d != 0) {
/*
* Looks strange? This is not a bug. Don't store
* numvnodes unless there is a change - avoid false
* sharing on MP.
*/
numvnodes += d;
}
if (listhd == &lru_list[LRU_VRELE])
threadpool_schedule_job(threadpool, &vrele_job);
if (d > 0 && numvnodes > desiredvnodes)
threadpool_schedule_job(threadpool, &vdrain_job);
if (d > 0 && numvnodes > desiredvnodes + desiredvnodes / 16)
kpause("vnfull", false, MAX(1, mstohz(10)), &vdrain_lock);
mutex_exit(&vdrain_lock);
}
/*
* LRU list iterator.
* Caller holds vdrain_lock.
*/
static vnode_impl_t *
lru_iter_first(int idx, lru_iter_t *iterp)
{
vnode_impl_t *marker;
KASSERT(mutex_owned(&vdrain_lock));
mutex_exit(&vdrain_lock);
marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
mutex_enter(&vdrain_lock);
marker->vi_lrulisthd = &lru_list[idx];
iterp->li_marker = marker;
TAILQ_INSERT_HEAD(marker->vi_lrulisthd, marker, vi_lrulist);
return lru_iter_next(iterp);
}
static vnode_impl_t *
lru_iter_next(lru_iter_t *iter)
{
vnode_impl_t *vip, *marker;
vnodelst_t *listhd;
KASSERT(mutex_owned(&vdrain_lock));
marker = iter->li_marker;
listhd = marker->vi_lrulisthd;
while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
TAILQ_REMOVE(listhd, marker, vi_lrulist);
TAILQ_INSERT_AFTER(listhd, vip, marker, vi_lrulist);
if (!vnis_marker(VIMPL_TO_VNODE(vip)))
break;
}
return vip;
}
static void
lru_iter_release(lru_iter_t *iter)
{
vnode_impl_t *marker;
KASSERT(mutex_owned(&vdrain_lock));
marker = iter->li_marker;
TAILQ_REMOVE(marker->vi_lrulisthd, marker, vi_lrulist);
mutex_exit(&vdrain_lock);
vnfree_marker(VIMPL_TO_VNODE(marker));
mutex_enter(&vdrain_lock);
}
/*
* Release deferred vrele vnodes for this mount.
* Called with file system suspended.
*/
void
vrele_flush(struct mount *mp)
{
lru_iter_t iter;
vnode_impl_t *vip;
KASSERT(fstrans_is_owner(mp));
mutex_enter(&vdrain_lock);
for (vip = lru_iter_first(LRU_VRELE, &iter); vip != NULL;
vip = lru_iter_next(&iter)) {
if (VIMPL_TO_VNODE(vip)->v_mount != mp)
continue;
vrele_deferred(vip);
}
lru_iter_release(&iter);
mutex_exit(&vdrain_lock);
}
/*
* One pass through the LRU lists to keep the number of allocated
* vnodes below target. Returns true if target met.
*/
static bool
vdrain_one(u_int target)
{
int ix, lists[] = { LRU_FREE, LRU_HOLD };
lru_iter_t iter;
vnode_impl_t *vip;
vnode_t *vp;
struct mount *mp;
KASSERT(mutex_owned(&vdrain_lock));
for (ix = 0; ix < __arraycount(lists); ix++) {
for (vip = lru_iter_first(lists[ix], &iter); vip != NULL;
vip = lru_iter_next(&iter)) {
if (numvnodes < target) {
lru_iter_release(&iter);
return true;
}
vp = VIMPL_TO_VNODE(vip);
/* Probe usecount (unlocked). */
if (vrefcnt(vp) > 0)
continue;
/* Try v_interlock -- we lock the wrong direction! */
if (!mutex_tryenter(vp->v_interlock))
continue;
/* Probe usecount and state. */
if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) {
mutex_exit(vp->v_interlock);
continue;
}
mutex_exit(&vdrain_lock);
mp = vp->v_mount;
if (fstrans_start_nowait(mp) != 0) {
mutex_exit(vp->v_interlock);
mutex_enter(&vdrain_lock);
continue;
}
if (vcache_vget(vp) == 0) {
if (!vrecycle(vp)) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
mutex_enter(vp->v_interlock);
vrelel(vp, 0, LK_EXCLUSIVE);
}
}
fstrans_done(mp);
mutex_enter(&vdrain_lock);
}
lru_iter_release(&iter);
}
return false;
}
/*
* threadpool task to keep the number of vnodes below desiredvnodes.
*/
static void
vdrain_task(struct threadpool_job *job)
{
u_int target;
target = desiredvnodes - desiredvnodes / 16;
mutex_enter(&vdrain_lock);
while (!vdrain_one(target))
kpause("vdrain", false, 1, &vdrain_lock);
threadpool_job_done(job);
mutex_exit(&vdrain_lock);
}
/*
* threadpool task to process asynchronous vrele.
*/
static void
vrele_task(struct threadpool_job *job)
{
int skipped;
lru_iter_t iter;
vnode_impl_t *vip;
struct mount *mp;
mutex_enter(&vdrain_lock);
while ((vip = lru_iter_first(LRU_VRELE, &iter)) != NULL) {
for (skipped = 0; vip != NULL; vip = lru_iter_next(&iter)) {
mp = VIMPL_TO_VNODE(vip)->v_mount;
if (fstrans_start_nowait(mp) == 0) {
vrele_deferred(vip);
fstrans_done(mp);
} else {
skipped++;
}
}
lru_iter_release(&iter);
if (skipped)
kpause("vrele", false, MAX(1, mstohz(10)), &vdrain_lock);
}
threadpool_job_done(job);
lru_iter_release(&iter);
mutex_exit(&vdrain_lock);
}
/*
* Try to drop a reference on a vnode. Abort if we are releasing the
* last reference. Note: this _must_ succeed if not the last reference.
*/
static bool
vtryrele(vnode_t *vp)
{
u_int use, next;
membar_release();
for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
if (__predict_false((use & VUSECOUNT_MASK) == 1)) {
return false;
}
KASSERT((use & VUSECOUNT_MASK) > 1);
next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
if (__predict_true(next == use)) {
return true;
}
}
}
/*
* vput: unlock and release the reference.
*/
void
vput(vnode_t *vp)
{
int lktype;
/*
* Do an unlocked check of the usecount. If it looks like we're not
* about to drop the last reference, then unlock the vnode and try
* to drop the reference. If it ends up being the last reference
* after all, vrelel() can fix it all up. Most of the time this
* will all go to plan.
*/
if (vrefcnt(vp) > 1) {
VOP_UNLOCK(vp);
if (vtryrele(vp)) {
return;
}
lktype = LK_NONE;
} else {
lktype = VOP_ISLOCKED(vp);
KASSERT(lktype != LK_NONE);
}
mutex_enter(vp->v_interlock);
vrelel(vp, 0, lktype);
}
/*
* Release a vnode from the deferred list.
*/
static void
vrele_deferred(vnode_impl_t *vip)
{
vnode_t *vp;
KASSERT(mutex_owned(&vdrain_lock));
KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
vp = VIMPL_TO_VNODE(vip);
/*
* First remove the vnode from the vrele list.
* Put it on the last lru list, the last vrele()
* will put it back onto the right list before
* its usecount reaches zero.
*/
TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
vip->vi_lrulisthd = &lru_list[LRU_HOLD];
vip->vi_lrulisttm = getticks();
TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
mutex_exit(&vdrain_lock);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
mutex_enter(vp->v_interlock);
vrelel(vp, 0, LK_EXCLUSIVE);
mutex_enter(&vdrain_lock);
}
/*
* Vnode release. If reference count drops to zero, call inactive
* routine and either return to freelist or free to the pool.
*/
static void
vrelel(vnode_t *vp, int flags, int lktype)
{
const bool async = ((flags & VRELEL_ASYNC) != 0);
bool recycle, defer, objlock_held;
u_int use, next;
int error;
objlock_held = false;
retry:
KASSERT(mutex_owned(vp->v_interlock));
if (__predict_false(vp->v_op == dead_vnodeop_p &&
VSTATE_GET(vp) != VS_RECLAIMED)) {
vnpanic(vp, "dead but not clean");
}
/*
* If not the last reference, just unlock and drop the reference count.
*
* Otherwise make sure we pass a point in time where we hold the
* last reference with VGET flag unset.
*/
for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
if (__predict_false((use & VUSECOUNT_MASK) > 1)) {
if (objlock_held) {
objlock_held = false;
rw_exit(vp->v_uobj.vmobjlock);
}
if (lktype != LK_NONE) {
mutex_exit(vp->v_interlock);
lktype = LK_NONE;
VOP_UNLOCK(vp);
mutex_enter(vp->v_interlock);
}
if (vtryrele(vp)) {
mutex_exit(vp->v_interlock);
return;
}
next = atomic_load_relaxed(&vp->v_usecount);
continue;
}
KASSERT((use & VUSECOUNT_MASK) == 1);
next = use & ~VUSECOUNT_VGET;
if (next != use) {
next = atomic_cas_uint(&vp->v_usecount, use, next);
}
if (__predict_true(next == use)) {
break;
}
}
membar_acquire();
if (vrefcnt(vp) <= 0 || vp->v_writecount != 0) {
vnpanic(vp, "%s: bad ref count", __func__);
}
#ifdef DIAGNOSTIC
if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
vprint("vrelel: missing VOP_CLOSE()", vp);
}
#endif
/*
* If already clean there is no need to lock, defer or
* deactivate this node.
*/
if (VSTATE_GET(vp) == VS_RECLAIMED) {
if (objlock_held) {
objlock_held = false;
rw_exit(vp->v_uobj.vmobjlock);
}
if (lktype != LK_NONE) {
mutex_exit(vp->v_interlock);
lktype = LK_NONE;
VOP_UNLOCK(vp);
mutex_enter(vp->v_interlock);
}
goto out;
}
/*
* First try to get the vnode locked for VOP_INACTIVE().
* Defer vnode release to the vrele task if the caller requests
* it explicitly, if the caller is the pagedaemon, or if taking
* the lock failed.
*/
defer = false;
if ((curlwp == uvm.pagedaemon_lwp) || async) {
defer = true;
} else if (lktype == LK_SHARED) {
/* Excellent chance of getting, if the last ref. */
error = vn_lock(vp, LK_UPGRADE | LK_RETRY | LK_NOWAIT);
if (error != 0) {
defer = true;
} else {
lktype = LK_EXCLUSIVE;
}
} else if (lktype == LK_NONE) {
/* Excellent chance of getting, if the last ref. */
error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
if (error != 0) {
defer = true;
} else {
lktype = LK_EXCLUSIVE;
}
}
KASSERT(mutex_owned(vp->v_interlock));
if (defer) {
/*
* Defer reclaim to the vrele task; it's not safe to
* clean it here. We donate it our last reference.
*/
if (lktype != LK_NONE) {
mutex_exit(vp->v_interlock);
VOP_UNLOCK(vp);
mutex_enter(vp->v_interlock);
}
lru_requeue(vp, &lru_list[LRU_VRELE]);
mutex_exit(vp->v_interlock);
return;
}
KASSERT(lktype == LK_EXCLUSIVE);
/* If the node gained another reference, retry. */
use = atomic_load_relaxed(&vp->v_usecount);
if ((use & VUSECOUNT_VGET) != 0) {
goto retry;
}
KASSERT((use & VUSECOUNT_MASK) == 1);
if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP|VI_WRMAP)) != 0 ||
(vp->v_vflag & VV_MAPPED) != 0) {
/* Take care of space accounting. */
if (!objlock_held) {
objlock_held = true;
if (!rw_tryenter(vp->v_uobj.vmobjlock, RW_WRITER)) {
mutex_exit(vp->v_interlock);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
mutex_enter(vp->v_interlock);
goto retry;
}
}
if ((vp->v_iflag & VI_EXECMAP) != 0) {
cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
}
vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
vp->v_vflag &= ~VV_MAPPED;
}
if (objlock_held) {
objlock_held = false;
rw_exit(vp->v_uobj.vmobjlock);
}
/*
* Deactivate the vnode, but preserve our reference across
* the call to VOP_INACTIVE().
*
* If VOP_INACTIVE() indicates that the file has been
* deleted, then recycle the vnode.
*
* Note that VOP_INACTIVE() will not drop the vnode lock.
*/
mutex_exit(vp->v_interlock);
recycle = false;
VOP_INACTIVE(vp, &recycle);
if (!recycle) {
lktype = LK_NONE;
VOP_UNLOCK(vp);
}
mutex_enter(vp->v_interlock);
/*
* Block new references then check again to see if a
* new reference was acquired in the meantime. If
* it was, restore the vnode state and try again.
*/
if (recycle) {
VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
use = atomic_load_relaxed(&vp->v_usecount);
if ((use & VUSECOUNT_VGET) != 0) {
VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
goto retry;
}
KASSERT((use & VUSECOUNT_MASK) == 1);
}
/*
* Recycle the vnode if the file is now unused (unlinked).
*/
if (recycle) {
VSTATE_ASSERT(vp, VS_BLOCKED);
KASSERT(lktype == LK_EXCLUSIVE);
/* vcache_reclaim drops the lock. */
lktype = LK_NONE;
vcache_reclaim(vp);
}
KASSERT(vrefcnt(vp) > 0);
KASSERT(lktype == LK_NONE);
out:
for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
if (__predict_false((use & VUSECOUNT_VGET) != 0 &&
(use & VUSECOUNT_MASK) == 1)) {
/* Gained and released another reference, retry. */
goto retry;
}
next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
if (__predict_true(next == use)) {
if (__predict_false((use & VUSECOUNT_MASK) != 1)) {
/* Gained another reference. */
mutex_exit(vp->v_interlock);
return;
}
break;
}
}
membar_acquire();
if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
/*
* It's clean so destroy it. It isn't referenced
* anywhere since it has been reclaimed.
*/
vcache_free(VNODE_TO_VIMPL(vp));
} else {
/*
* Otherwise, put it back onto the freelist. It
* can't be destroyed while still associated with
* a file system.
*/
lru_requeue(vp, lru_which(vp));
mutex_exit(vp->v_interlock);
}
}
void
vrele(vnode_t *vp)
{
if (vtryrele(vp)) {
return;
}
mutex_enter(vp->v_interlock);
vrelel(vp, 0, LK_NONE);
}
/*
* Asynchronous vnode release: the vnode is released in a different context.
*/
void
vrele_async(vnode_t *vp)
{
if (vtryrele(vp)) {
return;
}
mutex_enter(vp->v_interlock);
vrelel(vp, VRELEL_ASYNC, LK_NONE);
}
/*
* Vnode reference, where a reference is already held by some other
* object (for example, a file structure).
*
* NB: lockless code sequences may rely on this not blocking.
*/
void
vref(vnode_t *vp)
{
KASSERT(vrefcnt(vp) > 0);
atomic_inc_uint(&vp->v_usecount);
}
/*
* Page or buffer structure gets a reference.
* Called with v_interlock held.
*/
void
vholdl(vnode_t *vp)
{
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_holdcnt++ == 0 && vrefcnt(vp) == 0)
lru_requeue(vp, lru_which(vp));
}
/*
* Page or buffer structure gets a reference.
*/
void
vhold(vnode_t *vp)
{
mutex_enter(vp->v_interlock);
vholdl(vp);
mutex_exit(vp->v_interlock);
}
/*
* Page or buffer structure frees a reference.
* Called with v_interlock held.
*/
void
holdrelel(vnode_t *vp)
{
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_holdcnt <= 0) {
vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
}
vp->v_holdcnt--;
if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
lru_requeue(vp, lru_which(vp));
}
/*
* Page or buffer structure frees a reference.
*/
void
holdrele(vnode_t *vp)
{
mutex_enter(vp->v_interlock);
holdrelel(vp);
mutex_exit(vp->v_interlock);
}
/*
* Recycle an unused vnode if caller holds the last reference.
*/
bool
vrecycle(vnode_t *vp)
{
int error __diagused;
mutex_enter(vp->v_interlock);
/* If the vnode is already clean we're done. */
VSTATE_WAIT_STABLE(vp);
if (VSTATE_GET(vp) != VS_LOADED) {
VSTATE_ASSERT(vp, VS_RECLAIMED);
vrelel(vp, 0, LK_NONE);
return true;
}
/* Prevent further references until the vnode is locked. */
VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
/* Make sure we hold the last reference. */
if (vrefcnt(vp) != 1) {
VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
mutex_exit(vp->v_interlock);
return false;
}
mutex_exit(vp->v_interlock);
/*
* On a leaf file system this lock will always succeed as we hold
* the last reference and prevent further references.
* On layered file systems waiting for the lock would open a can of
* deadlocks as the lower vnodes may have other active references.
*/
error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
mutex_enter(vp->v_interlock);
if (error) {
VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
mutex_exit(vp->v_interlock);
return false;
}
KASSERT(vrefcnt(vp) == 1);
vcache_reclaim(vp);
vrelel(vp, 0, LK_NONE);
return true;
}
/*
* Helper for vrevoke() to propagate suspension from lastmp
* to thismp. Both args may be NULL.
* Returns the currently suspended file system or NULL.
*/
static struct mount *
vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
{
int error;
if (lastmp == thismp)
return thismp;
if (lastmp != NULL)
vfs_resume(lastmp);
if (thismp == NULL)
return NULL;
do {
error = vfs_suspend(thismp, 0);
} while (error == EINTR || error == ERESTART);
if (error == 0)
return thismp;
KASSERT(error == EOPNOTSUPP || error == ENOENT);
return NULL;
}
/*
* Eliminate all activity associated with the requested vnode
* and with all vnodes aliased to the requested vnode.
*/
void
vrevoke(vnode_t *vp)
{
struct mount *mp;
vnode_t *vq;
enum vtype type;
dev_t dev;
KASSERT(vrefcnt(vp) > 0);
mp = vrevoke_suspend_next(NULL, vp->v_mount);
mutex_enter(vp->v_interlock);
VSTATE_WAIT_STABLE(vp);
if (VSTATE_GET(vp) == VS_RECLAIMED) {
mutex_exit(vp->v_interlock);
} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
atomic_inc_uint(&vp->v_usecount);
mutex_exit(vp->v_interlock);
vgone(vp);
} else {
dev = vp->v_rdev;
type = vp->v_type;
mutex_exit(vp->v_interlock);
while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq)
== 0) {
mp = vrevoke_suspend_next(mp, vq->v_mount);
vgone(vq);
}
}
vrevoke_suspend_next(mp, NULL);
}
/*
* Eliminate all activity associated with a vnode in preparation for
* reuse. Drops a reference from the vnode.
*/
void
vgone(vnode_t *vp)
{
int lktype;
KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
lktype = LK_EXCLUSIVE;
mutex_enter(vp->v_interlock);
VSTATE_WAIT_STABLE(vp);
if (VSTATE_GET(vp) == VS_LOADED) {
VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
vcache_reclaim(vp);
lktype = LK_NONE;
}
VSTATE_ASSERT(vp, VS_RECLAIMED);
vrelel(vp, 0, lktype);
}
static inline uint32_t
vcache_hash(const struct vcache_key *key)
{
uint32_t hash = HASH32_BUF_INIT;
KASSERT(key->vk_key_len > 0);
hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
return hash;
}
static int
vcache_stats(struct hashstat_sysctl *hs, bool fill)
{
vnode_impl_t *vip;
uint64_t chain;
strlcpy(hs->hash_name, "vcache", sizeof(hs->hash_name));
strlcpy(hs->hash_desc, "vnode cache hash", sizeof(hs->hash_desc));
if (!fill)
return 0;
hs->hash_size = vcache_hashmask + 1;
for (size_t i = 0; i < hs->hash_size; i++) {
chain = 0;
mutex_enter(&vcache_lock);
SLIST_FOREACH(vip, &vcache_hashtab[i], vi_hash) {
chain++;
}
mutex_exit(&vcache_lock);
if (chain > 0) {
hs->hash_used++;
hs->hash_items += chain;
if (chain > hs->hash_maxchain)
hs->hash_maxchain = chain;
}
preempt_point();
}
return 0;
}
static void
vcache_init(void)
{
vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit,
0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
KASSERT(vcache_pool != NULL);
mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&vcache_cv, "vcache");
vcache_hashsize = desiredvnodes;
vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
&vcache_hashmask);
hashstat_register("vcache", vcache_stats);
}
static void
vcache_reinit(void)
{
int i;
uint32_t hash;
u_long oldmask, newmask;
struct hashhead *oldtab, *newtab;
vnode_impl_t *vip;
newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
mutex_enter(&vcache_lock);
oldtab = vcache_hashtab;
oldmask = vcache_hashmask;
vcache_hashsize = desiredvnodes;
vcache_hashtab = newtab;
vcache_hashmask = newmask;
for (i = 0; i <= oldmask; i++) {
while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
hash = vcache_hash(&vip->vi_key);
SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
vip, vi_hash);
}
}
mutex_exit(&vcache_lock);
hashdone(oldtab, HASH_SLIST, oldmask);
}
static inline vnode_impl_t *
vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
{
struct hashhead *hashp;
vnode_impl_t *vip;
KASSERT(mutex_owned(&vcache_lock));
hashp = &vcache_hashtab[hash & vcache_hashmask];
SLIST_FOREACH(vip, hashp, vi_hash) {
if (key->vk_mount != vip->vi_key.vk_mount)
continue;
if (key->vk_key_len != vip->vi_key.vk_key_len)
continue;
if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
continue;
return vip;
}
return NULL;
}
/*
* Allocate a new, uninitialized vcache node.
*/
static vnode_impl_t *
vcache_alloc(void)
{
vnode_impl_t *vip;
vnode_t *vp;
vip = pool_cache_get(vcache_pool, PR_WAITOK);
vp = VIMPL_TO_VNODE(vip);
memset(vip, 0, sizeof(*vip));
rw_init(&vip->vi_lock);
vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
klist_init(&vip->vi_klist.vk_klist);
vp->v_klist = &vip->vi_klist;
cv_init(&vp->v_cv, "vnode");
cache_vnode_init(vp);
vp->v_usecount = 1;
vp->v_type = VNON;
vp->v_size = vp->v_writesize = VSIZENOTSET;
vip->vi_state = VS_LOADING;
lru_requeue(vp, &lru_list[LRU_FREE]);
return vip;
}
/*
* Deallocate a vcache node in state VS_LOADING.
*
* vcache_lock held on entry and released on return.
*/
static void
vcache_dealloc(vnode_impl_t *vip)
{
vnode_t *vp;
KASSERT(mutex_owned(&vcache_lock));
vp = VIMPL_TO_VNODE(vip);
vfs_ref(dead_rootmount);
vfs_insmntque(vp, dead_rootmount);
mutex_enter(vp->v_interlock);
vp->v_op = dead_vnodeop_p;
VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
mutex_exit(&vcache_lock);
vrelel(vp, 0, LK_NONE);
}
/*
* Free an unused, unreferenced vcache node.
* v_interlock locked on entry.
*/
static void
vcache_free(vnode_impl_t *vip)
{
vnode_t *vp;
vp = VIMPL_TO_VNODE(vip);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT(vrefcnt(vp) == 0);
KASSERT(vp->v_holdcnt == 0);
KASSERT(vp->v_writecount == 0);
lru_requeue(vp, NULL);
mutex_exit(vp->v_interlock);
vfs_insmntque(vp, NULL);
if (vp->v_type == VBLK || vp->v_type == VCHR)
spec_node_destroy(vp);
mutex_obj_free(vp->v_interlock);
rw_destroy(&vip->vi_lock);
uvm_obj_destroy(&vp->v_uobj, true);
KASSERT(vp->v_klist == &vip->vi_klist);
klist_fini(&vip->vi_klist.vk_klist);
cv_destroy(&vp->v_cv);
cache_vnode_fini(vp);
pool_cache_put(vcache_pool, vip);
}
/*
* Try to get an initial reference on this cached vnode.
* Returns zero on success or EBUSY if the vnode state is not LOADED.
*
* NB: lockless code sequences may rely on this not blocking.
*/
int
vcache_tryvget(vnode_t *vp)
{
u_int use, next;
for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
if (__predict_false((use & VUSECOUNT_GATE) == 0)) {
return EBUSY;
}
next = atomic_cas_uint(&vp->v_usecount,
use, (use + 1) | VUSECOUNT_VGET);
if (__predict_true(next == use)) {
membar_acquire();
return 0;
}
}
}
/*
* Try to get an initial reference on this cached vnode.
* Returns zero on success and ENOENT if the vnode has been reclaimed.
* Will wait for the vnode state to be stable.
*
* v_interlock locked on entry and unlocked on exit.
*/
int
vcache_vget(vnode_t *vp)
{
int error;
KASSERT(mutex_owned(vp->v_interlock));
/* Increment hold count to prevent vnode from disappearing. */
vp->v_holdcnt++;
VSTATE_WAIT_STABLE(vp);
vp->v_holdcnt--;
/* If this was the last reference to a reclaimed vnode free it now. */
if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
vcache_free(VNODE_TO_VIMPL(vp));
else
mutex_exit(vp->v_interlock);
return ENOENT;
}
VSTATE_ASSERT(vp, VS_LOADED);
error = vcache_tryvget(vp);
KASSERT(error == 0);
mutex_exit(vp->v_interlock);
return 0;
}
/*
* Get a vnode / fs node pair by key and return it referenced through vpp.
*/
int
vcache_get(struct mount *mp, const void *key, size_t key_len,
struct vnode **vpp)
{
int error;
uint32_t hash;
const void *new_key;
struct vnode *vp;
struct vcache_key vcache_key;
vnode_impl_t *vip, *new_vip;
new_key = NULL;
*vpp = NULL;
vcache_key.vk_mount = mp;
vcache_key.vk_key = key;
vcache_key.vk_key_len = key_len;
hash = vcache_hash(&vcache_key);
again:
mutex_enter(&vcache_lock);
vip = vcache_hash_lookup(&vcache_key, hash);
/* If found, take a reference or retry. */
if (__predict_true(vip != NULL)) {
/*
* If the vnode is loading we cannot take the v_interlock
* here as it might change during load (see uvm_obj_setlock()).
* As changing state from VS_LOADING requires both vcache_lock
* and v_interlock it is safe to test with vcache_lock held.
*
* Wait for vnodes changing state from VS_LOADING and retry.
*/
if (__predict_false(vip->vi_state == VS_LOADING)) {
cv_wait(&vcache_cv, &vcache_lock);
mutex_exit(&vcache_lock);
goto again;
}
vp = VIMPL_TO_VNODE(vip);
mutex_enter(vp->v_interlock);
mutex_exit(&vcache_lock);
error = vcache_vget(vp);
if (error == ENOENT)
goto again;
if (error == 0)
*vpp = vp;
KASSERT((error != 0) == (*vpp == NULL));
return error;
}
mutex_exit(&vcache_lock);
/* Allocate and initialize a new vcache / vnode pair. */
error = vfs_busy(mp);
if (error)
return error;
new_vip = vcache_alloc();
new_vip->vi_key = vcache_key;
vp = VIMPL_TO_VNODE(new_vip);
mutex_enter(&vcache_lock);
vip = vcache_hash_lookup(&vcache_key, hash);
if (vip == NULL) {
SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
new_vip, vi_hash);
vip = new_vip;
}
/* If another thread beat us inserting this node, retry. */
if (vip != new_vip) {
vcache_dealloc(new_vip);
vfs_unbusy(mp);
goto again;
}
mutex_exit(&vcache_lock);
/* Load the fs node. Exclusive as new_node is VS_LOADING. */
error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
if (error) {
mutex_enter(&vcache_lock);
SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
new_vip, vnode_impl, vi_hash);
vcache_dealloc(new_vip);
vfs_unbusy(mp);
KASSERT(*vpp == NULL);
return error;
}
KASSERT(new_key != NULL);
KASSERT(memcmp(key, new_key, key_len) == 0);
KASSERT(vp->v_op != NULL);
vfs_insmntque(vp, mp);
if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
vp->v_vflag |= VV_MPSAFE;
vfs_ref(mp);
vfs_unbusy(mp);
/* Finished loading, finalize node. */
mutex_enter(&vcache_lock);
new_vip->vi_key.vk_key = new_key;
mutex_enter(vp->v_interlock);
VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
mutex_exit(vp->v_interlock);
mutex_exit(&vcache_lock);
*vpp = vp;
return 0;
}
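/*
* Sketch (not compiled) of the contract a file system's VFS_LOADVNODE()
* implementation has to honour for vcache_get() above: associate the fs
* node, set v_op/v_data/v_type, and return a key pointer that stays valid
* for the life of the vnode (typically embedded in the fs node). All
* names starting with "myfs_" and the node layout are hypothetical.
*/
#if 0
static int
myfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
struct myfs_node *np;
ino_t ino;
KASSERT(key_len == sizeof(ino));
memcpy(&ino, key, key_len);
np = myfs_read_inode(mp, ino); /* hypothetical on-disk read */
if (np == NULL)
return ENOENT;
np->n_ino = ino;
vp->v_op = myfs_vnodeop_p; /* hypothetical operations vector */
vp->v_data = np;
vp->v_type = VREG; /* from the on-disk metadata */
uvm_vnp_setsize(vp, np->n_size);
/* Key storage must outlive the vnode; point at the fs node. */
*new_key = &np->n_ino;
return 0;
}
#endif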
/*
* Create a new vnode / fs node pair and return it referenced through vpp.
*/
int
vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
kauth_cred_t cred, void *extra, struct vnode **vpp)
{
int error;
uint32_t hash;
struct vnode *vp, *ovp;
vnode_impl_t *vip, *ovip;
*vpp = NULL;
/* Allocate and initialize a new vcache / vnode pair. */
error = vfs_busy(mp);
if (error)
return error;
vip = vcache_alloc();
vip->vi_key.vk_mount = mp;
vp = VIMPL_TO_VNODE(vip);
/* Create and load the fs node. */
error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra,
&vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
if (error) {
mutex_enter(&vcache_lock);
vcache_dealloc(vip);
vfs_unbusy(mp);
KASSERT(*vpp == NULL);
return error;
}
KASSERT(vp->v_op != NULL);
KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
if (vip->vi_key.vk_key_len > 0) {
KASSERT(vip->vi_key.vk_key != NULL);
hash = vcache_hash(&vip->vi_key);
/*
* Wait for previous instance to be reclaimed,
* then insert new node.
*/
mutex_enter(&vcache_lock);
while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
ovp = VIMPL_TO_VNODE(ovip);
mutex_enter(ovp->v_interlock);
mutex_exit(&vcache_lock);
error = vcache_vget(ovp);
KASSERT(error == ENOENT);
mutex_enter(&vcache_lock);
}
SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
vip, vi_hash);
mutex_exit(&vcache_lock);
}
vfs_insmntque(vp, mp);
if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
vp->v_vflag |= VV_MPSAFE;
vfs_ref(mp);
vfs_unbusy(mp);
/* Finished loading, finalize node. */
mutex_enter(&vcache_lock);
mutex_enter(vp->v_interlock);
VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
mutex_exit(&vcache_lock);
mutex_exit(vp->v_interlock);
*vpp = vp;
return 0;
}
/*
* Prepare key change: update old cache nodes key and lock new cache node.
* Return an error if the new node already exists.
*/
int
vcache_rekey_enter(struct mount *mp, struct vnode *vp,
const void *old_key, size_t old_key_len,
const void *new_key, size_t new_key_len)
{
uint32_t old_hash, new_hash;
struct vcache_key old_vcache_key, new_vcache_key;
vnode_impl_t *vip, *new_vip;
old_vcache_key.vk_mount = mp;
old_vcache_key.vk_key = old_key;
old_vcache_key.vk_key_len = old_key_len;
old_hash = vcache_hash(&old_vcache_key);
new_vcache_key.vk_mount = mp;
new_vcache_key.vk_key = new_key;
new_vcache_key.vk_key_len = new_key_len;
new_hash = vcache_hash(&new_vcache_key);
new_vip = vcache_alloc();
new_vip->vi_key = new_vcache_key;
/* Insert locked new node used as placeholder. */
mutex_enter(&vcache_lock);
vip = vcache_hash_lookup(&new_vcache_key, new_hash);
if (vip != NULL) {
vcache_dealloc(new_vip);
return EEXIST;
}
SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
new_vip, vi_hash);
/* Replace old nodes key with the temporary copy. */
vip = vcache_hash_lookup(&old_vcache_key, old_hash);
KASSERT(vip != NULL);
KASSERT(VIMPL_TO_VNODE(vip) == vp);
KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
vip->vi_key = old_vcache_key;
mutex_exit(&vcache_lock);
return 0;
}
/*
* Key change complete: update old node and remove placeholder.
*/
void
vcache_rekey_exit(struct mount *mp, struct vnode *vp,
const void *old_key, size_t old_key_len,
const void *new_key, size_t new_key_len)
{
uint32_t old_hash, new_hash;
struct vcache_key old_vcache_key, new_vcache_key;
vnode_impl_t *vip, *new_vip;
struct vnode *new_vp;
old_vcache_key.vk_mount = mp;
old_vcache_key.vk_key = old_key;
old_vcache_key.vk_key_len = old_key_len;
old_hash = vcache_hash(&old_vcache_key);
new_vcache_key.vk_mount = mp;
new_vcache_key.vk_key = new_key;
new_vcache_key.vk_key_len = new_key_len;
new_hash = vcache_hash(&new_vcache_key);
mutex_enter(&vcache_lock);
/* Lookup old and new node. */
vip = vcache_hash_lookup(&old_vcache_key, old_hash);
KASSERT(vip != NULL);
KASSERT(VIMPL_TO_VNODE(vip) == vp);
new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
KASSERT(new_vip != NULL);
KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
new_vp = VIMPL_TO_VNODE(new_vip);
mutex_enter(new_vp->v_interlock);
VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
mutex_exit(new_vp->v_interlock);
/* Rekey old node and put it onto its new hashlist. */
vip->vi_key = new_vcache_key;
if (old_hash != new_hash) {
SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
vip, vnode_impl, vi_hash);
SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
vip, vi_hash);
}
/* Remove new node used as placeholder. */
SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
new_vip, vnode_impl, vi_hash);
vcache_dealloc(new_vip);
}
/*
* Disassociate the underlying file system from a vnode.
*
* Must be called with vnode locked and will return unlocked.
* Must be called with the interlock held, and will return with it held.
*/
static void
vcache_reclaim(vnode_t *vp)
{
lwp_t *l = curlwp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
struct mount *mp = vp->v_mount;
uint32_t hash;
uint8_t temp_buf[64], *temp_key;
size_t temp_key_len;
bool recycle;
int error;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT(vrefcnt(vp) != 0);
temp_key_len = vip->vi_key.vk_key_len;
/*
* Prevent the vnode from being recycled or brought into use
* while we clean it out.
*/
VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING);
/*
* Send NOTE_REVOKE now, before we call VOP_RECLAIM(),
* because VOP_RECLAIM() could cause vp->v_klist to
* become invalid. Don't check for interest in NOTE_REVOKE
* here; it's always posted because it sets EV_EOF.
*
* Once it's been posted, reset vp->v_klist to point to
* our own local storage, in case we were sharing with
* someone else.
*/
KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE);
vp->v_klist = &vip->vi_klist;
mutex_exit(vp->v_interlock);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
mutex_enter(vp->v_interlock);
if ((vp->v_iflag & VI_EXECMAP) != 0) {
cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
}
vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
vp->v_iflag |= VI_DEADCHECK; /* for genfs_getpages() */
mutex_exit(vp->v_interlock);
rw_exit(vp->v_uobj.vmobjlock);
/*
* With vnode state set to reclaiming, purge name cache immediately
* to prevent new handles on vnode, and wait for existing threads
* trying to get a handle to notice VS_RECLAIMED status and abort.
*/
cache_purge(vp);
/* Replace the vnode key with a temporary copy. */
if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
} else {
temp_key = temp_buf;
}
if (vip->vi_key.vk_key_len > 0) {
mutex_enter(&vcache_lock);
memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
vip->vi_key.vk_key = temp_key;
mutex_exit(&vcache_lock);
}
fstrans_start(mp);
/*
* Clean out any cached data associated with the vnode.
*/
error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
if (error != 0) {
if (wapbl_vphaswapbl(vp))
WAPBL_DISCARD(wapbl_vptomp(vp));
error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
}
KASSERTMSG((error == 0), "vinvalbuf failed: %d", error);
KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
if (vp->v_type == VBLK || vp->v_type == VCHR) {
spec_node_revoke(vp);
}
/*
* Disassociate the underlying file system from the vnode.
* VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
* the vnode, and may destroy the vnode so that VOP_UNLOCK
* would no longer function.
*/
VOP_INACTIVE(vp, &recycle);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
if (VOP_RECLAIM(vp)) {
vnpanic(vp, "%s: cannot reclaim", __func__);
}
KASSERT(vp->v_data == NULL);
KASSERT((vp->v_iflag & VI_PAGES) == 0);
if (vp->v_type == VREG && vp->v_ractx != NULL) {
uvm_ra_freectx(vp->v_ractx);
vp->v_ractx = NULL;
}
if (vip->vi_key.vk_key_len > 0) {
/* Remove from vnode cache. */
hash = vcache_hash(&vip->vi_key);
mutex_enter(&vcache_lock);
KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
vip, vnode_impl, vi_hash);
mutex_exit(&vcache_lock);
}
if (temp_key != temp_buf)
kmem_free(temp_key, temp_key_len);
/* Done with purge, notify sleepers of the grim news. */
mutex_enter(vp->v_interlock);
vp->v_op = dead_vnodeop_p;
VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
vp->v_tag = VT_NON;
mutex_exit(vp->v_interlock);
/*
* Move to dead mount. Must be after changing the operations
* vector as vnode operations enter the mount before using the
* operations vector. See sys/kern/vnode_if.c.
*/
vp->v_vflag &= ~VV_ROOT;
vfs_ref(dead_rootmount);
vfs_insmntque(vp, dead_rootmount);
#ifdef PAX_SEGVGUARD
pax_segvguard_cleanup(vp);
#endif /* PAX_SEGVGUARD */
mutex_enter(vp->v_interlock);
fstrans_done(mp);
KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}
/*
* Disassociate the underlying file system from an open device vnode
* and make it anonymous.
*
* Vnode unlocked on entry, drops a reference to the vnode.
*/
void
vcache_make_anon(vnode_t *vp)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
uint32_t hash;
bool recycle;
KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
/* Remove from vnode cache. */
hash = vcache_hash(&vip->vi_key);
mutex_enter(&vcache_lock);
KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
vip, vnode_impl, vi_hash);
vip->vi_key.vk_mount = dead_rootmount;
vip->vi_key.vk_key_len = 0;
vip->vi_key.vk_key = NULL;
mutex_exit(&vcache_lock);
/*
* Disassociate the underlying file system from the vnode.
* VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
* the vnode, and may destroy the vnode so that VOP_UNLOCK
* would no longer function.
*/
if (vn_lock(vp, LK_EXCLUSIVE)) {
vnpanic(vp, "%s: cannot lock", __func__);
}
VOP_INACTIVE(vp, &recycle);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
if (VOP_RECLAIM(vp)) {
vnpanic(vp, "%s: cannot reclaim", __func__);
}
/* Purge name cache. */
cache_purge(vp);
/* Done with purge, change operations vector. */
mutex_enter(vp->v_interlock);
vp->v_op = spec_vnodeop_p;
vp->v_vflag |= VV_MPSAFE;
mutex_exit(vp->v_interlock);
/*
* Move to dead mount. Must be after changing the operations
* vector as vnode operations enter the mount before using the
* operations vector. See sys/kern/vnode_if.c.
*/
vfs_ref(dead_rootmount);
vfs_insmntque(vp, dead_rootmount);
vrele(vp);
}
/*
* Update outstanding I/O count and do wakeup if requested.
*/
void
vwakeup(struct buf *bp)
{
vnode_t *vp;
if ((vp = bp->b_vp) == NULL)
return;
KASSERT(bp->b_objlock == vp->v_interlock);
KASSERT(mutex_owned(bp->b_objlock));
if (--vp->v_numoutput < 0)
vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp);
if (vp->v_numoutput == 0)
cv_broadcast(&vp->v_cv);
}
/*
* Test a vnode for being or becoming dead. Returns one of:
* EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
* ENOENT: vnode is dead.
* 0: otherwise.
*
* Whenever this function returns a non-zero value all future
* calls will also return a non-zero value.
*/
int
vdead_check(struct vnode *vp, int flags)
{
KASSERT(mutex_owned(vp->v_interlock));
if (! ISSET(flags, VDEAD_NOWAIT))
VSTATE_WAIT_STABLE(vp);
if (VSTATE_GET(vp) == VS_RECLAIMING) {
KASSERT(ISSET(flags, VDEAD_NOWAIT));
return EBUSY;
} else if (VSTATE_GET(vp) == VS_RECLAIMED) {
return ENOENT;
}
return 0;
}
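/*
* Illustrative caller pattern (not compiled): probe a possibly dying
* vnode without blocking. v_interlock must be held in either case;
* without VDEAD_NOWAIT the check may also wait for a stable state.
*/
#if 0
mutex_enter(vp->v_interlock);
error = vdead_check(vp, VDEAD_NOWAIT);
mutex_exit(vp->v_interlock);
if (error != 0)
return error; /* EBUSY: becoming dead; ENOENT: dead */
#endif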
int
vfs_drainvnodes(void)
{
mutex_enter(&vdrain_lock);
if (!vdrain_one(desiredvnodes)) {
mutex_exit(&vdrain_lock);
return EBUSY;
}
mutex_exit(&vdrain_lock);
if (vcache_hashsize != desiredvnodes)
vcache_reinit();
return 0;
}
void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
va_list ap;
#ifdef DIAGNOSTIC
vprint(NULL, vp);
#endif
va_start(ap, fmt);
vpanic(fmt, ap);
va_end(ap);
}
void
vshareilock(vnode_t *tvp, vnode_t *fvp)
{
kmutex_t *oldlock;
oldlock = tvp->v_interlock;
mutex_obj_hold(fvp->v_interlock);
tvp->v_interlock = fvp->v_interlock;
mutex_obj_free(oldlock);
}
void
vshareklist(vnode_t *tvp, vnode_t *fvp)
{
/*
* If two vnodes share klist state, they must also share
* an interlock.
*/
KASSERT(tvp->v_interlock == fvp->v_interlock);
/*
* We make the following assumptions:
*
* ==> Some other synchronization is happening outside of
* our view to make this safe.
*
* ==> That the "to" vnode will have the necessary references
* on the "from" vnode so that the storage for the klist
* won't be yanked out from beneath us (the vnode_impl).
*
* ==> If "from" is also sharing, we then assume that "from"
* has the necessary references, and so on.
*/
tvp->v_klist = fvp->v_klist;
}
/* $NetBSD: vfs_syscalls_40.c,v 1.5 2019/01/27 02:08:39 pgoyette Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_syscalls.c 8.42 (Berkeley) 7/31/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_40.c,v 1.5 2019/01/27 02:08:39 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mount.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <compat/common/compat_mod.h>
static const struct syscall_package vfs_syscalls_40_syscalls[] = {
{ SYS_compat_40_mount, 0, (sy_call_t *)compat_40_sys_mount },
{ 0, 0, NULL },
};
int
compat_40_sys_mount(struct lwp *l, const struct compat_40_sys_mount_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) type;
syscallarg(const char *) path;
syscallarg(int) flags;
syscallarg(void *) data;
} */
register_t dummy;
return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path),
SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE, 0, &dummy);
}
int
vfs_syscalls_40_init(void)
{
return syscall_establish(NULL, vfs_syscalls_40_syscalls);
}
int
vfs_syscalls_40_fini(void)
{
return syscall_disestablish(NULL, vfs_syscalls_40_syscalls);
}
/* $NetBSD: subr_psref.c,v 1.18 2022/02/12 16:31:06 macallan Exp $ */
/*-
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Passive references
*
* Passive references are references to objects that guarantee the
* object will not be destroyed until the reference is released.
*
* Passive references require no interprocessor synchronization to
* acquire or release. However, destroying the target of passive
* references requires expensive interprocessor synchronization --
* xcalls to determine on which CPUs the object is still in use.
*
* Passive references may be held only on a single CPU and by a
* single LWP. They require the caller to allocate a little stack
* space, a struct psref object. Sleeping while a passive
* reference is held is allowed, provided that the owner's LWP is
* bound to a CPU -- e.g., the owner is a softint or a bound
* kthread. However, sleeping should be kept to a short duration,
* e.g. sleeping on an adaptive lock.
*
* Passive references serve as an intermediate stage between
* reference counting and passive serialization (pserialize(9)):
*
* - If you need references to transfer from CPU to CPU or LWP to
* LWP, or if you need long-term references, you must use
* reference counting, e.g. with atomic operations or locks,
* which incurs interprocessor synchronization for every use --
* cheaper than an xcall, but not scalable.
*
* - If all users *guarantee* that they will not sleep, then it is
* not necessary to use passive references: you may as well just
* use the even cheaper pserialize(9), because you have
* satisfied the requirements of a pserialize read section.
*/
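/*
* Illustrative usage sketch (not compiled). struct frob, frob_lookup()
* and the f_target member are hypothetical; the psref(9), pserialize(9)
* and curlwp_bind()/curlwp_bindx() calls are the real API. The target is
* typically found under a pserialize read section and pinned with a
* psref before anything that might sleep.
*/
#if 0
static struct psref_class *frob_psref_class; /* hypothetical class */
/* Once, at subsystem initialization: */
frob_psref_class = psref_class_create("frob", IPL_SOFTNET);
/* Per lookup, on a CPU-bound LWP: */
struct psref psref;
struct frob *f;
int bound, s;
bound = curlwp_bind(); /* bind this LWP to its CPU */
s = pserialize_read_enter();
f = frob_lookup(key); /* hypothetical lockless lookup */
if (f != NULL)
psref_acquire(&psref, &f->f_target, frob_psref_class);
pserialize_read_exit(s);
if (f != NULL) {
/* ... may sleep briefly while the reference is held ... */
psref_release(&psref, &f->f_target, frob_psref_class);
}
curlwp_bindx(bound);
#endif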
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_psref.c,v 1.18 2022/02/12 16:31:06 macallan Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/condvar.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/psref.h>
#include <sys/queue.h>
#include <sys/xcall.h>
#include <sys/lwp.h>
SLIST_HEAD(psref_head, psref);
static bool _psref_held(const struct psref_target *, struct psref_class *,
bool);
/*
* struct psref_class
*
* Private global state for a class of passive reference targets.
* Opaque to callers.
*/
struct psref_class {
kmutex_t prc_lock;
kcondvar_t prc_cv;
struct percpu *prc_percpu; /* struct psref_cpu */
ipl_cookie_t prc_iplcookie;
unsigned int prc_xc_flags;
};
/*
* struct psref_cpu
*
* Private per-CPU state for a class of passive reference targets.
* Not exposed by the API.
*/
struct psref_cpu {
struct psref_head pcpu_head;
};
/*
* Data structures and functions for debugging.
*/
#ifndef PSREF_DEBUG_NITEMS
#define PSREF_DEBUG_NITEMS 16
#endif
struct psref_debug_item {
void *prdi_caller;
struct psref *prdi_psref;
};
struct psref_debug {
int prd_refs_peek;
struct psref_debug_item prd_items[PSREF_DEBUG_NITEMS];
};
#ifdef PSREF_DEBUG
static void psref_debug_acquire(struct psref *);
static void psref_debug_release(struct psref *);
static void psref_debug_lwp_free(void *);
static specificdata_key_t psref_debug_lwp_key;
#endif
/*
* psref_init()
*/
void
psref_init(void)
{
#ifdef PSREF_DEBUG
lwp_specific_key_create(&psref_debug_lwp_key, psref_debug_lwp_free);
#endif
}
/*
* psref_class_create(name, ipl)
*
* Create a new passive reference class, with the given wchan name
* and ipl.
*/
struct psref_class *
psref_class_create(const char *name, int ipl)
{
struct psref_class *class;
ASSERT_SLEEPABLE();
class = kmem_alloc(sizeof(*class), KM_SLEEP);
class->prc_percpu = percpu_alloc(sizeof(struct psref_cpu));
mutex_init(&class->prc_lock, MUTEX_DEFAULT, ipl);
cv_init(&class->prc_cv, name);
class->prc_iplcookie = makeiplcookie(ipl);
class->prc_xc_flags = XC_HIGHPRI_IPL(ipl);
return class;
}
static void __diagused
psref_cpu_drained_p(void *p, void *cookie, struct cpu_info *ci __unused)
{
const struct psref_cpu *pcpu = p;
bool *retp = cookie;
if (!SLIST_EMPTY(&pcpu->pcpu_head))
*retp = false;
}
static bool __diagused
psref_class_drained_p(const struct psref_class *prc)
{
bool ret = true;
percpu_foreach(prc->prc_percpu, &psref_cpu_drained_p, &ret);
return ret;
}
/*
* psref_class_destroy(class)
*
* Destroy a passive reference class and free memory associated
* with it. All targets in this class must have been drained and
* destroyed already.
*/
void
psref_class_destroy(struct psref_class *class)
{
KASSERT(psref_class_drained_p(class));
cv_destroy(&class->prc_cv);
mutex_destroy(&class->prc_lock);
percpu_free(class->prc_percpu, sizeof(struct psref_cpu));
kmem_free(class, sizeof(*class));
}
/*
* psref_target_init(target, class)
*
* Initialize a passive reference target in the specified class.
* The caller is responsible for issuing a membar_producer after
* psref_target_init and before exposing a pointer to the target
* to other CPUs.
*/
void
psref_target_init(struct psref_target *target,
struct psref_class *class)
{
target->prt_class = class;
target->prt_draining = false;
}
#ifdef DEBUG
static bool
psref_exist(struct psref_cpu *pcpu, struct psref *psref)
{
struct psref *_psref;
SLIST_FOREACH(_psref, &pcpu->pcpu_head, psref_entry) {
if (_psref == psref)
return true;
}
return false;
}
static void
psref_check_duplication(struct psref_cpu *pcpu, struct psref *psref,
const struct psref_target *target)
{
bool found = false;
found = psref_exist(pcpu, psref);
if (found) {
panic("The psref is already in the list (acquiring twice?): "
"psref=%p target=%p", psref, target);
}
}
static void
psref_check_existence(struct psref_cpu *pcpu, struct psref *psref,
const struct psref_target *target)
{
bool found = false;
found = psref_exist(pcpu, psref);
if (!found) {
panic("The psref isn't in the list (releasing unused psref?): "
"psref=%p target=%p", psref, target);
}
}
#endif /* DEBUG */
/*
* psref_acquire(psref, target, class)
*
* Acquire a passive reference to the specified target, which must
* be in the specified class.
*
* The caller must guarantee that the target will not be destroyed
* before psref_acquire returns.
*
* The caller must additionally guarantee that it will not switch
* CPUs before releasing the passive reference, either by
* disabling kpreemption and avoiding sleeps, or by being in a
* softint or in an LWP bound to a CPU.
*/
void
psref_acquire(struct psref *psref, const struct psref_target *target,
struct psref_class *class)
{
struct psref_cpu *pcpu;
int s;
KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
ISSET(curlwp->l_pflag, LP_BOUND)),
"passive references are CPU-local,"
" but preemption is enabled and the caller is not"
" in a softint or CPU-bound LWP");
KASSERTMSG(!target->prt_draining, "psref target already destroyed: %p",
target);
KASSERTMSG((target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
target->prt_class, class);
/* Block interrupts and acquire the current CPU's reference list. */
s = splraiseipl(class->prc_iplcookie);
pcpu = percpu_getref(class->prc_percpu);
#ifdef DEBUG
/* Sanity-check that this psref does not already hold the target. */
psref_check_duplication(pcpu, psref, target);
#endif
/* Record our reference. */
SLIST_INSERT_HEAD(&pcpu->pcpu_head, psref, psref_entry);
psref->psref_target = target;
psref->psref_lwp = curlwp;
psref->psref_cpu = curcpu();
/* Release the CPU list and restore interrupts. */
percpu_putref(class->prc_percpu);
splx(s);
#if defined(DIAGNOSTIC) || defined(PSREF_DEBUG)
curlwp->l_psrefs++;
#endif
#ifdef PSREF_DEBUG
psref_debug_acquire(psref);
#endif
}
/*
* psref_release(psref, target, class)
*
* Release a passive reference to the specified target, which must
* be in the specified class.
*
* The caller must not have switched CPUs or LWPs since acquiring
* the passive reference.
*/
void
psref_release(struct psref *psref, const struct psref_target *target,
struct psref_class *class)
{
struct psref_cpu *pcpu;
int s;
KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
ISSET(curlwp->l_pflag, LP_BOUND)),
"passive references are CPU-local,"
" but preemption is enabled and the caller is not"
" in a softint or CPU-bound LWP");
KASSERTMSG((target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
target->prt_class, class);
/* Make sure the psref looks sensible. */
KASSERTMSG((psref->psref_target == target),
"passive reference target mismatch: %p (ref) != %p (expected)",
psref->psref_target, target);
KASSERTMSG((psref->psref_lwp == curlwp),
"passive reference transferred from lwp %p to lwp %p",
psref->psref_lwp, curlwp);
KASSERTMSG((psref->psref_cpu == curcpu()),
"passive reference transferred from CPU %u to CPU %u",
cpu_index(psref->psref_cpu), cpu_index(curcpu()));
/*
* Block interrupts and remove the psref from the current CPU's
* list. No need to percpu_getref or get the head of the list,
* and the caller guarantees that we are bound to a CPU anyway
* (as does blocking interrupts).
*/
s = splraiseipl(class->prc_iplcookie);
pcpu = percpu_getref(class->prc_percpu);
#ifdef DEBUG
/* Sanity-check that this psref was actually acquired earlier. */
psref_check_existence(pcpu, psref, target);
#endif
SLIST_REMOVE(&pcpu->pcpu_head, psref, psref, psref_entry);
percpu_putref(class->prc_percpu);
splx(s);
#if defined(DIAGNOSTIC) || defined(PSREF_DEBUG)
KASSERT(curlwp->l_psrefs > 0);
curlwp->l_psrefs--;
#endif
#ifdef PSREF_DEBUG
psref_debug_release(psref);
#endif
/* If someone is waiting for users to drain, notify 'em. */
if (__predict_false(target->prt_draining))
cv_broadcast(&class->prc_cv);
}
/*
* psref_copy(pto, pfrom, class)
*
* Copy a passive reference from pfrom, which must be in the
* specified class, to pto. Both pfrom and pto must later be
* released with psref_release.
*
* The caller must not have switched CPUs or LWPs since acquiring
* pfrom, and must not switch CPUs or LWPs before releasing both
* pfrom and pto.
*/
void
psref_copy(struct psref *pto, const struct psref *pfrom,
struct psref_class *class)
{
struct psref_cpu *pcpu;
int s;
KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
ISSET(curlwp->l_pflag, LP_BOUND)),
"passive references are CPU-local,"
" but preemption is enabled and the caller is not"
" in a softint or CPU-bound LWP");
KASSERTMSG((pto != pfrom),
"can't copy passive reference to itself: %p",
pto);
/* Make sure the pfrom reference looks sensible. */
KASSERTMSG((pfrom->psref_lwp == curlwp),
"passive reference transferred from lwp %p to lwp %p",
pfrom->psref_lwp, curlwp);
KASSERTMSG((pfrom->psref_cpu == curcpu()),
"passive reference transferred from CPU %u to CPU %u",
cpu_index(pfrom->psref_cpu), cpu_index(curcpu()));
KASSERTMSG((pfrom->psref_target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
pfrom->psref_target->prt_class, class);
/* Block interrupts and acquire the current CPU's reference list. */
s = splraiseipl(class->prc_iplcookie);
pcpu = percpu_getref(class->prc_percpu);
/* Record the new reference. */
SLIST_INSERT_HEAD(&pcpu->pcpu_head, pto, psref_entry);
pto->psref_target = pfrom->psref_target;
pto->psref_lwp = curlwp;
pto->psref_cpu = curcpu();
/* Release the CPU list and restore interrupts. */
percpu_putref(class->prc_percpu);
splx(s);
#if defined(DIAGNOSTIC) || defined(PSREF_DEBUG)
curlwp->l_psrefs++;
#endif
}
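/*
 * Copy sketch (hypothetical names, continuing the frobbex example
 * above): duplicate a held reference so two code paths on this bound
 * LWP can release it independently:
 *
 *	struct psref second;
 *
 *	psref_copy(&second, &psref, frobbex_psref_class);
 *	... both &psref and &second must later be passed to
 *	    psref_release by this LWP, on this CPU ...
 */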
/*
* struct psreffed
*
* Global state for draining a psref target.
*/
struct psreffed {
struct psref_class *class;
struct psref_target *target;
bool ret;
};
static void
psreffed_p_xc(void *cookie0, void *cookie1 __unused)
{
struct psreffed *P = cookie0;
/*
* If we hold a psref to the target, then answer true.
*
* This is the only dynamic decision that may be made with
* psref_held.
*
* No need to lock anything here: every write transitions from
* false to true, so there can be no conflicting writes. No
* need for a memory barrier here because P->ret is read only
* after xc_wait, which has already issued any necessary memory
* barriers.
*/
if (_psref_held(P->target, P->class, true))
P->ret = true;
}
static bool
psreffed_p(struct psref_target *target, struct psref_class *class)
{
struct psreffed P = {
.class = class,
.target = target,
.ret = false,
};
if (__predict_true(mp_online)) {
/*
* Ask all CPUs to say whether they hold a psref to the
* target.
*/
xc_wait(xc_broadcast(class->prc_xc_flags, &psreffed_p_xc, &P,
NULL));
} else
psreffed_p_xc(&P, NULL);
return P.ret;
}
/*
* psref_target_destroy(target, class)
*
* Destroy a passive reference target. Waits for all existing
* references to drain. Caller must guarantee no new references
* will be acquired once it calls psref_target_destroy, e.g. by
* removing the target from a global list first. May sleep.
*/
void
psref_target_destroy(struct psref_target *target, struct psref_class *class)
{
ASSERT_SLEEPABLE();
KASSERTMSG(!target->prt_draining, "psref target already destroyed: %p",
target);
KASSERTMSG((target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
target->prt_class, class);
/* Request psref_release to notify us when done. */
target->prt_draining = true;
/* Wait until there are no more references on any CPU. */
while (psreffed_p(target, class)) {
/*
* This enter/wait/exit business looks wrong, but it is
* both necessary, because psreffed_p performs a
* low-priority xcall and hence cannot run while a
* mutex is locked, and OK, because the wait is timed
* -- explicit wakeups are only an optimization.
*/
mutex_enter(&class->prc_lock);
(void)cv_timedwait(&class->prc_cv, &class->prc_lock, 1);
mutex_exit(&class->prc_lock);
}
/* No more references. Cause subsequent psref_acquire to kassert. */
target->prt_class = NULL;
}
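/*
 * Teardown sketch for the rule above (the "frobbex" target is
 * hypothetical): first make sure no new lookup can reach fx, e.g. by
 * unlinking it from its lookup structure and waiting out readers with
 * pserialize(9), then drain the passive references and free it:
 *
 *	psref_target_destroy(&fx->fx_psref, frobbex_psref_class);
 *	kmem_free(fx, sizeof(*fx));
 */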
static bool
_psref_held(const struct psref_target *target, struct psref_class *class,
bool lwp_mismatch_ok)
{
const struct psref_cpu *pcpu;
const struct psref *psref;
int s;
bool held = false;
KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
ISSET(curlwp->l_pflag, LP_BOUND)),
"passive references are CPU-local,"
" but preemption is enabled and the caller is not"
" in a softint or CPU-bound LWP");
KASSERTMSG((target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
target->prt_class, class);
/* Block interrupts and acquire the current CPU's reference list. */
s = splraiseipl(class->prc_iplcookie);
pcpu = percpu_getref(class->prc_percpu);
/* Search through all the references on this CPU. */
SLIST_FOREACH(psref, &pcpu->pcpu_head, psref_entry) {
/* Sanity-check the reference's CPU. */
KASSERTMSG((psref->psref_cpu == curcpu()),
"passive reference transferred from CPU %u to CPU %u",
cpu_index(psref->psref_cpu), cpu_index(curcpu()));
/* If it doesn't match, skip it and move on. */
if (psref->psref_target != target)
continue;
/*
* Sanity-check the reference's LWP if we are asserting
* via psref_held that this LWP holds it, but not if we
* are testing in psref_target_destroy whether any LWP
* still holds it.
*/
KASSERTMSG((lwp_mismatch_ok || psref->psref_lwp == curlwp),
"passive reference transferred from lwp %p to lwp %p",
psref->psref_lwp, curlwp);
/* Stop here and report that we found it. */
held = true;
break;
}
/* Release the CPU list and restore interrupts. */
percpu_putref(class->prc_percpu);
splx(s);
return held;
}
/*
* psref_held(target, class)
*
* True if the current CPU holds a passive reference to target,
* false otherwise. May be used only inside assertions.
*/
bool
psref_held(const struct psref_target *target, struct psref_class *class)
{
return _psref_held(target, class, false);
}
#ifdef PSREF_DEBUG
void
psref_debug_init_lwp(struct lwp *l)
{
struct psref_debug *prd;
prd = kmem_zalloc(sizeof(*prd), KM_SLEEP);
lwp_setspecific_by_lwp(l, psref_debug_lwp_key, prd);
}
static void
psref_debug_lwp_free(void *arg)
{
struct psref_debug *prd = arg;
kmem_free(prd, sizeof(*prd));
}
static void
psref_debug_acquire(struct psref *psref)
{
struct psref_debug *prd;
struct lwp *l = curlwp;
int s, i;
prd = lwp_getspecific(psref_debug_lwp_key);
if (__predict_false(prd == NULL)) {
psref->psref_debug = NULL;
return;
}
s = splserial();
if (l->l_psrefs > prd->prd_refs_peek) {
prd->prd_refs_peek = l->l_psrefs;
if (__predict_false(prd->prd_refs_peek > PSREF_DEBUG_NITEMS))
panic("exceeded PSREF_DEBUG_NITEMS");
}
for (i = 0; i < prd->prd_refs_peek; i++) {
struct psref_debug_item *prdi = &prd->prd_items[i];
if (prdi->prdi_psref != NULL)
continue;
prdi->prdi_caller = psref->psref_debug;
prdi->prdi_psref = psref;
psref->psref_debug = prdi;
break;
}
if (__predict_false(i == prd->prd_refs_peek))
panic("out of range: %d", i);
splx(s);
}
static void
psref_debug_release(struct psref *psref)
{
int s;
s = splserial();
if (__predict_true(psref->psref_debug != NULL)) {
struct psref_debug_item *prdi = psref->psref_debug;
prdi->prdi_psref = NULL;
}
splx(s);
}
void
psref_debug_barrier(void)
{
struct psref_debug *prd;
struct lwp *l = curlwp;
int s, i;
prd = lwp_getspecific(psref_debug_lwp_key);
if (__predict_false(prd == NULL))
return;
s = splserial();
for (i = 0; i < prd->prd_refs_peek; i++) {
struct psref_debug_item *prdi = &prd->prd_items[i];
if (__predict_true(prdi->prdi_psref == NULL))
continue;
panic("psref leaked: lwp(%p) acquired at %p", l, prdi->prdi_caller);
}
prd->prd_refs_peek = 0; /* Reset the counter */
splx(s);
}
#endif /* PSREF_DEBUG */
/* $NetBSD: kern_50.c,v 1.3 2020/01/29 15:47:51 ad Exp $ */
/*-
* Copyright (c) 2008, 2009, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_50.c,v 1.3 2020/01/29 15:47:51 ad Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <compat/sys/resource.h>
#include <compat/sys/time.h>
#include <compat/common/compat_mod.h>
static const struct syscall_package kern_50_syscalls[] = {
{ SYS_compat_50__lwp_park, 0, (sy_call_t *)compat_50_sys__lwp_park },
{ SYS_compat_50___sigtimedwait, 0,
(sy_call_t *)compat_50_sys___sigtimedwait },
{ SYS_compat_50_wait4, 0, (sy_call_t *)compat_50_sys_wait4 },
{ 0, 0, NULL }
};
int
compat_50_sys__lwp_park(struct lwp *l,
const struct compat_50_sys__lwp_park_args *uap, register_t *retval)
{
/* {
syscallarg(const struct timespec50 *) ts;
syscallarg(lwpid_t) unpark;
syscallarg(const void *) hint;
syscallarg(const void *) unparkhint;
} */
struct timespec ts, *tsp;
struct timespec50 ts50;
int error;
if (SCARG(uap, ts) == NULL)
tsp = NULL;
else {
error = copyin(SCARG(uap, ts), &ts50, sizeof(ts50));
if (error != 0)
return error;
timespec50_to_timespec(&ts50, &ts);
tsp = &ts;
}
if (SCARG(uap, unpark) != 0) {
error = lwp_unpark(&SCARG(uap, unpark), 1);
if (error != 0)
return error;
}
return lwp_park(CLOCK_REALTIME, TIMER_ABSTIME, tsp);
}
static int
tscopyin(const void *u, void *s, size_t len)
{
struct timespec50 ts50;
int error;
KASSERT(len == sizeof(struct timespec));
error = copyin(u, &ts50, sizeof(ts50));
if (error)
return error;
timespec50_to_timespec(&ts50, s);
return 0;
}
static int
tscopyout(const void *s, void *u, size_t len)
{
struct timespec50 ts50;
KASSERT(len == sizeof(struct timespec));
timespec_to_timespec50(s, &ts50);
return copyout(&ts50, u, sizeof(ts50));
}
int
compat_50_sys___sigtimedwait(struct lwp *l,
const struct compat_50_sys___sigtimedwait_args *uap, register_t *retval)
{
int res;
res = sigtimedwait1(l,
(const struct sys_____sigtimedwait50_args *)uap, retval, copyin,
copyout, tscopyin, tscopyout);
if (!res)
*retval = 0; /* XXX NetBSD<=5 was not POSIX compliant */
return res;
}
int
compat_50_sys_wait4(struct lwp *l, const struct compat_50_sys_wait4_args *uap,
register_t *retval)
{
/* {
syscallarg(int) pid;
syscallarg(int *) status;
syscallarg(int) options;
syscallarg(struct rusage50 *) rusage;
} */
int status, error, pid = SCARG(uap, pid);
struct rusage50 ru50;
struct rusage ru;
error = do_sys_wait(&pid, &status, SCARG(uap, options),
SCARG(uap, rusage) != NULL ? &ru : NULL);
retval[0] = pid;
if (pid == 0)
return error;
if (SCARG(uap, rusage)) {
rusage_to_rusage50(&ru, &ru50);
error = copyout(&ru50, SCARG(uap, rusage), sizeof(ru50));
}
if (error == 0 && SCARG(uap, status))
error = copyout(&status, SCARG(uap, status), sizeof(status));
return error;
}
int
kern_50_init(void)
{
return syscall_establish(NULL, kern_50_syscalls);
}
int
kern_50_fini(void)
{
return syscall_disestablish(NULL, kern_50_syscalls);
}
/* $NetBSD: subr_disk.c,v 1.137 2023/05/09 12:04:04 riastradh Exp $ */
/*-
* Copyright (c) 1996, 1997, 1999, 2000, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.137 2023/05/09 12:04:04 riastradh Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/buf.h>
#include <sys/fcntl.h>
#include <sys/syslog.h>
#include <sys/disklabel.h>
#include <sys/disk.h>
#include <sys/sysctl.h>
#include <lib/libkern/libkern.h>
/*
* Disk error is the preface to plaintive error messages
* about failing disk transfers. It prints messages of the form
hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
* if the offset of the error in the transfer and a disk label
* are both available. blkdone should be -1 if the position of the error
* is unknown; the disklabel pointer may be null from drivers that have not
* been converted to use them. The message is printed with printf
* if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
* The message should be completed (with at least a newline) with printf
* or addlog, respectively. There is no trailing space.
*/
#ifndef PRIdaddr
#define PRIdaddr PRId64
#endif
void
diskerr(const struct buf *bp, const char *dname, const char *what, int pri,
int blkdone, const struct disklabel *lp)
{
int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
void (*pr)(const char *, ...) __printflike(1, 2);
char partname = 'a' + part;
daddr_t sn;
if (/*CONSTCOND*/0)
/* The compiler will flag an error here if the format is wrong... */
printf("%" PRIdaddr, bp->b_blkno);
if (pri != LOG_PRINTF) {
static const char fmt[] = "";
log(pri, fmt);
pr = addlog;
} else
pr = printf;
(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
bp->b_flags & B_READ ? "read" : "writ");
sn = bp->b_blkno;
if (bp->b_bcount <= DEV_BSIZE)
(*pr)("%" PRIdaddr, sn);
else {
if (blkdone >= 0) {
sn += blkdone;
(*pr)("%" PRIdaddr " of ", sn);
}
(*pr)("%" PRIdaddr "-%" PRIdaddr "", bp->b_blkno,
bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
}
if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
sn += lp->d_partitions[part].p_offset;
(*pr)(" (%s%d bn %" PRIdaddr "; cn %" PRIdaddr "",
dname, unit, sn, sn / lp->d_secpercyl);
sn %= lp->d_secpercyl;
(*pr)(" tn %" PRIdaddr " sn %" PRIdaddr ")",
sn / lp->d_nsectors, sn % lp->d_nsectors);
}
}
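/*
 * Calling sketch for the convention described above diskerr; the "xd"
 * driver and its softc "sc" are hypothetical, and -1 says the failing
 * block within the transfer is unknown.  The caller finishes the line:
 *
 *	diskerr(bp, "xd", "hard error", LOG_PRINTF, -1, sc->sc_dk.dk_label);
 *	printf("\n");
 */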
/*
* Searches the iostatlist for the disk corresponding to the
* name provided.
*/
struct disk *
disk_find(const char *name)
{
struct io_stats *stat;
stat = iostat_find(name);
if ((stat != NULL) && (stat->io_type == IOSTAT_DISK))
return stat->io_parent;
return (NULL);
}
void
disk_init(struct disk *diskp, const char *name, const struct dkdriver *driver)
{
u_int blocksize = DEV_BSIZE;
/*
* Initialize the wedge-related locks and other fields.
*/
mutex_init(&diskp->dk_rawlock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&diskp->dk_openlock, MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&diskp->dk_wedges);
diskp->dk_nwedges = 0;
diskp->dk_labelsector = LABELSECTOR;
diskp->dk_blkshift = DK_BSIZE2BLKSHIFT(blocksize);
diskp->dk_byteshift = DK_BSIZE2BYTESHIFT(blocksize);
diskp->dk_name = name;
diskp->dk_driver = driver;
}
/*
* Rename a disk.
*/
void
disk_rename(struct disk *diskp, const char *name)
{
diskp->dk_name = name;
iostat_rename(diskp->dk_stats, diskp->dk_name);
}
/*
* Attach a disk.
*/
void
disk_attach(struct disk *diskp)
{
/*
* Allocate and initialize the disklabel structures.
*/
diskp->dk_label = kmem_zalloc(sizeof(struct disklabel), KM_SLEEP);
diskp->dk_cpulabel = kmem_zalloc(sizeof(struct cpu_disklabel),
KM_SLEEP);
/*
* Set up the stats collection.
*/
diskp->dk_stats = iostat_alloc(IOSTAT_DISK, diskp, diskp->dk_name);
}
int
disk_begindetach(struct disk *dk, int (*lastclose)(device_t),
device_t self, int flags)
{
int rc;
rc = 0;
mutex_enter(&dk->dk_openlock);
if (dk->dk_openmask == 0)
; /* nothing to do */
else if ((flags & DETACH_FORCE) == 0)
rc = EBUSY;
else if (lastclose != NULL)
rc = (*lastclose)(self);
mutex_exit(&dk->dk_openlock);
return rc;
}
/*
* Detach a disk.
*/
void
disk_detach(struct disk *diskp)
{
/*
* Remove from the drivelist.
*/
iostat_free(diskp->dk_stats);
/*
* Release the disk-info dictionary.
*/
if (diskp->dk_info) {
prop_object_release(diskp->dk_info);
diskp->dk_info = NULL;
}
/*
* Free the space used by the disklabel structures.
*/
kmem_free(diskp->dk_label, sizeof(*diskp->dk_label));
kmem_free(diskp->dk_cpulabel, sizeof(*diskp->dk_cpulabel));
}
void
disk_destroy(struct disk *diskp)
{
mutex_destroy(&diskp->dk_openlock);
mutex_destroy(&diskp->dk_rawlock);
}
/*
* Mark the disk as having work queued for metrics collection.
*/
void
disk_wait(struct disk *diskp)
{
iostat_wait(diskp->dk_stats);
}
/*
* Mark the disk as busy for metrics collection.
*/
void
disk_busy(struct disk *diskp)
{
iostat_busy(diskp->dk_stats);
}
/*
* Finished disk operations, gather metrics.
*/
void
disk_unbusy(struct disk *diskp, long bcount, int read)
{
iostat_unbusy(diskp->dk_stats, bcount, read);
}
/*
* Return true if disk has an I/O operation in flight.
*/
bool
disk_isbusy(struct disk *diskp)
{
return iostat_isbusy(diskp->dk_stats);
}
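/*
 * Accounting sketch: a driver brackets each transfer with disk_busy()
 * and disk_unbusy() so iostat(8) sees it.  The softc "sc" with an
 * embedded struct disk sc_dk is hypothetical:
 *
 *	disk_busy(&sc->sc_dk);			// transfer started
 *	...
 *	disk_unbusy(&sc->sc_dk, bp->b_bcount - bp->b_resid,
 *	    (bp->b_flags & B_READ) != 0);	// completion path
 *	biodone(bp);
 */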
/*
* Bounds checking against the media size, used for the raw partition.
* secsize, mediasize and b_blkno must all be in the same units.
* Possibly this has to be DEV_BSIZE (512).
*/
int
bounds_check_with_mediasize(struct buf *bp, int secsize, uint64_t mediasize)
{
int64_t sz;
if (bp->b_blkno < 0) {
/* Reject negative offsets immediately. */
bp->b_error = EINVAL;
return 0;
}
sz = howmany((int64_t)bp->b_bcount, secsize);
/*
* bp->b_bcount is a 32-bit value, and we rejected a negative
* bp->b_blkno already, so "bp->b_blkno + sz" cannot overflow.
*/
if (bp->b_blkno + sz > mediasize) {
sz = mediasize - bp->b_blkno;
if (sz == 0) {
/* If exactly at end of disk, return EOF. */
bp->b_resid = bp->b_bcount;
return 0;
}
if (sz < 0) {
/* If past end of disk, return EINVAL. */
bp->b_error = EINVAL;
return 0;
}
/* Otherwise, truncate request. */
bp->b_bcount = sz * secsize;
}
return 1;
}
/*
* Determine the size of the transfer, and make sure it is
* within the boundaries of the partition. Adjust transfer
* if needed, and signal errors or early completion.
*/
int
bounds_check_with_label(struct disk *dk, struct buf *bp, int wlabel)
{
struct disklabel *lp = dk->dk_label;
struct partition *p = lp->d_partitions + DISKPART(bp->b_dev);
uint64_t p_size, p_offset, labelsector;
int64_t sz;
if (bp->b_blkno < 0) {
/* Reject negative offsets immediately. */
bp->b_error = EINVAL;
return -1;
}
/* Protect against division by zero. XXX: Should never happen?!?! */
if ((lp->d_secsize / DEV_BSIZE) == 0 || lp->d_secpercyl == 0) {
bp->b_error = EINVAL;
return -1;
}
p_size = (uint64_t)p->p_size << dk->dk_blkshift;
p_offset = (uint64_t)p->p_offset << dk->dk_blkshift;
#if RAW_PART == 3
labelsector = lp->d_partitions[2].p_offset;
#else
labelsector = lp->d_partitions[RAW_PART].p_offset;
#endif
labelsector = (labelsector + dk->dk_labelsector) << dk->dk_blkshift;
sz = howmany((int64_t)bp->b_bcount, DEV_BSIZE);
/*
* bp->b_bcount is a 32-bit value, and we rejected a negative
* bp->b_blkno already, so "bp->b_blkno + sz" cannot overflow.
*/
if (bp->b_blkno + sz > p_size) {
sz = p_size - bp->b_blkno;
if (sz == 0) {
/* If exactly at end of disk, return EOF. */
bp->b_resid = bp->b_bcount;
return 0;
}
if (sz < 0) {
/* If past end of disk, return EINVAL. */
bp->b_error = EINVAL;
return -1;
}
/* Otherwise, truncate request. */
bp->b_bcount = sz << DEV_BSHIFT;
}
/* Overwriting disk label? */
if (bp->b_blkno + p_offset <= labelsector &&
bp->b_blkno + p_offset + sz > labelsector &&
(bp->b_flags & B_READ) == 0 && !wlabel) {
bp->b_error = EROFS;
return -1;
}
/* calculate cylinder for disksort to order transfers with */
bp->b_cylinder = (bp->b_blkno + p->p_offset) /
(lp->d_secsize / DEV_BSIZE) / lp->d_secpercyl;
return 1;
}
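/*
 * Usage sketch from a hypothetical driver's strategy routine: a return
 * value <= 0 means the buffer was handled here (error or EOF) and must
 * only be completed, not queued:
 *
 *	if (bounds_check_with_label(&sc->sc_dk, bp, sc->sc_wlabel) <= 0) {
 *		biodone(bp);
 *		return;
 *	}
 *	bufq_put(sc->sc_bufq, bp);
 */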
int
disk_read_sectors(void (*strat)(struct buf *), const struct disklabel *lp,
struct buf *bp, unsigned int sector, int count)
{
if ((lp->d_secsize / DEV_BSIZE) == 0 || lp->d_secpercyl == 0)
return EINVAL;
bp->b_blkno = btodb((off_t)sector * lp->d_secsize);
bp->b_bcount = count * lp->d_secsize;
bp->b_flags = (bp->b_flags & ~B_WRITE) | B_READ;
bp->b_oflags &= ~BO_DONE;
bp->b_cylinder = sector / lp->d_secpercyl;
(*strat)(bp);
return biowait(bp);
}
const char *
convertdisklabel(struct disklabel *lp, void (*strat)(struct buf *),
struct buf *bp, uint32_t secperunit)
{
struct partition rp, *altp, *p;
int geom_ok;
const char *str;
memset(&rp, 0, sizeof(rp));
rp.p_size = secperunit;
rp.p_fstype = FS_UNUSED;
/* If we can seek to d_secperunit - 1, believe the disk geometry. */
if (secperunit != 0 &&
disk_read_sectors(strat, lp, bp, secperunit - 1, 1) == 0)
geom_ok = 1;
else
geom_ok = 0;
#if 0
printf("%s: secperunit (%" PRIu32 ") %s\n", __func__,
secperunit, geom_ok ? "ok" : "not ok");
#endif
p = &lp->d_partitions[RAW_PART];
if (RAW_PART == 'c' - 'a')
altp = &lp->d_partitions['d' - 'a'];
else
altp = &lp->d_partitions['c' - 'a'];
if (lp->d_npartitions > RAW_PART && p->p_offset == 0 && p->p_size != 0)
return NULL; /* already a raw partition */
else if (lp->d_npartitions > MAX('c', 'd') - 'a' &&
altp->p_offset == 0 && altp->p_size != 0) {
/* alternate partition ('c' or 'd') is suitable for raw slot,
* swap with 'd' or 'c'.
*/
rp = *p;
*p = *altp;
*altp = rp;
return NULL;
} else if (lp->d_npartitions <= RAW_PART &&
lp->d_npartitions > 'c' - 'a') {
/* No raw partition is present, but the alternate is present.
* Copy alternate to raw partition.
*/
lp->d_npartitions = RAW_PART + 1;
*p = *altp;
return NULL;
} else if (!geom_ok)
str = "no raw partition and disk reports bad geometry";
else if (lp->d_npartitions <= RAW_PART) {
memset(&lp->d_partitions[lp->d_npartitions], 0,
sizeof(struct partition) * (RAW_PART - lp->d_npartitions));
*p = rp;
lp->d_npartitions = RAW_PART + 1;
return NULL;
} else if (lp->d_npartitions < MAXPARTITIONS) {
memmove(p + 1, p,
sizeof(struct partition) * (lp->d_npartitions - RAW_PART));
*p = rp;
lp->d_npartitions++;
return NULL;
} else
str = "no raw partition and partition table is full";
#ifdef DIAGNOSTIC
printf("Bad partition: %s\n", str);
printf("type = %u, subtype = %u, typename = %s\n",
lp->d_type, lp->d_subtype, lp->d_typename);
printf("secsize = %u, nsectors = %u, ntracks = %u\n",
lp->d_secsize, lp->d_nsectors, lp->d_ntracks);
printf("ncylinders = %u, secpercyl = %u, secperunit = %u\n",
lp->d_ncylinders, lp->d_secpercyl, lp->d_secperunit);
printf("npartitions = %u\n", lp->d_npartitions);
for (size_t i = 0; i < MIN(lp->d_npartitions, MAXPARTITIONS); i++) {
p = &lp->d_partitions[i];
printf("\t%c: offset = %u size = %u fstype = %u\n",
(char)(i + 'a'), p->p_offset, p->p_size, p->p_fstype);
}
#endif
return str;
}
/*
* disk_ioctl --
* Generic disk ioctl handling.
*/
int
disk_ioctl(struct disk *dk, dev_t dev, u_long cmd, void *data, int flag,
struct lwp *l)
{
struct dkwedge_info *dkw;
struct partinfo *pi;
struct partition *dp;
#ifdef __HAVE_OLD_DISKLABEL
struct disklabel newlabel;
#endif
switch (cmd) {
case DIOCGDISKINFO: {
prop_dictionary_t disk_info;
int error;
mutex_enter(&dk->dk_openlock);
if ((disk_info = dk->dk_info) == NULL) {
error = ENOTSUP;
} else {
prop_object_retain(disk_info);
error = 0;
}
mutex_exit(&dk->dk_openlock);
if (error)
return error;
error = prop_dictionary_copyout_ioctl(data, cmd, disk_info);
prop_object_release(disk_info);
return error;
}
case DIOCGSECTORSIZE:
*(u_int *)data = dk->dk_geom.dg_secsize;
return 0;
case DIOCGMEDIASIZE:
*(off_t *)data = (off_t)dk->dk_geom.dg_secsize *
dk->dk_geom.dg_secperunit;
return 0;
default:
break;
}
if (dev == NODEV)
return EPASSTHROUGH;
/* The following should be moved to dk_ioctl */
switch (cmd) {
case DIOCGDINFO:
if (dk->dk_label == NULL)
return EBUSY;
memcpy(data, dk->dk_label, sizeof (*dk->dk_label));
return 0;
#ifdef __HAVE_OLD_DISKLABEL
case ODIOCGDINFO:
if (dk->dk_label == NULL)
return EBUSY;
memcpy(&newlabel, dk->dk_label, sizeof(newlabel));
if (newlabel.d_npartitions > OLDMAXPARTITIONS)
return ENOTTY;
memcpy(data, &newlabel, sizeof(struct olddisklabel));
return 0;
#endif
case DIOCGPARTINFO:
pi = data;
memset(pi, 0, sizeof(*pi));
pi->pi_secsize = dk->dk_geom.dg_secsize;
pi->pi_bsize = MAX(BLKDEV_IOSIZE, pi->pi_secsize);
if (DISKPART(dev) == RAW_PART) {
pi->pi_size = dk->dk_geom.dg_secperunit;
return 0;
}
if (dk->dk_label == NULL)
return EBUSY;
dp = &dk->dk_label->d_partitions[DISKPART(dev)];
pi->pi_offset = dp->p_offset;
pi->pi_size = dp->p_size;
pi->pi_fstype = dp->p_fstype;
pi->pi_frag = dp->p_frag;
pi->pi_fsize = dp->p_fsize;
pi->pi_cpg = dp->p_cpg;
/*
* dholland 20130616: XXX this logic should not be
* here. It is here because the old buffer cache
* demands that all accesses to the same blocks need
* to be the same size; but it only works for FFS and
* nowadays I think it'll fail silently if the size
* info in the disklabel is wrong. (Or missing.) The
* buffer cache needs to be smarter; or failing that
* we need a reliable way here to get the right block
* size; or a reliable way to guarantee that (a) the
* fs is not mounted when we get here and (b) any
* buffers generated here will get purged when the fs
* does get mounted.
*/
if (dp->p_fstype == FS_BSDFFS &&
dp->p_frag != 0 && dp->p_fsize != 0)
pi->pi_bsize = dp->p_frag * dp->p_fsize;
return 0;
case DIOCAWEDGE:
if ((flag & FWRITE) == 0)
return EBADF;
dkw = data;
strlcpy(dkw->dkw_parent, dk->dk_name, sizeof(dkw->dkw_parent));
return dkwedge_add(dkw);
case DIOCDWEDGE:
if ((flag & FWRITE) == 0)
return EBADF;
dkw = data;
strlcpy(dkw->dkw_parent, dk->dk_name, sizeof(dkw->dkw_parent));
return dkwedge_del(dkw);
case DIOCLWEDGES:
return dkwedge_list(dk, data, l);
case DIOCMWEDGES:
if ((flag & FWRITE) == 0)
return EBADF;
dkwedge_discover(dk);
return 0;
case DIOCRMWEDGES:
if ((flag & FWRITE) == 0)
return EBADF;
dkwedge_delidle(dk);
return 0;
default:
return EPASSTHROUGH;
}
}
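/*
 * Delegation sketch: a driver's ioctl entry point typically tries this
 * generic handler first and falls back to its own commands when it
 * returns EPASSTHROUGH (the softc "sc" is hypothetical):
 *
 *	error = disk_ioctl(&sc->sc_dk, dev, cmd, data, flag, l);
 *	if (error != EPASSTHROUGH)
 *		return error;
 *	switch (cmd) {
 *	... driver-specific ioctls ...
 *	}
 */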
/*
* disk_set_info --
* Canonicalize dk->dk_geom and set some parameters.
*
* If disk_set_info can happen concurrently with disk_ioctl in a
* driver, the driver must serialize calls to disk_set_info with
* dk_openlock.
*/
void
disk_set_info(device_t dev, struct disk *dk, const char *type)
{
struct disk_geom *dg = &dk->dk_geom;
if (dg->dg_secsize == 0) {
#ifdef DIAGNOSTIC
printf("%s: fixing 0 sector size\n", dk->dk_name);
#endif
dg->dg_secsize = DEV_BSIZE;
}
dk->dk_blkshift = DK_BSIZE2BLKSHIFT(dg->dg_secsize);
dk->dk_byteshift = DK_BSIZE2BYTESHIFT(dg->dg_secsize);
if (dg->dg_secperunit == 0) {
#ifdef DIAGNOSTIC
if (dg->dg_ncylinders == 0) {
printf("%s: secperunit and ncylinders are zero\n",
dk->dk_name);
}
if (dg->dg_nsectors == 0 || dg->dg_ntracks == 0) {
printf("%s: secperunit and (sectors or tracks) "
"are zero\n", dk->dk_name);
}
#endif
dg->dg_secperunit = (int64_t) dg->dg_nsectors *
dg->dg_ntracks * dg->dg_ncylinders;
}
if (dg->dg_ncylinders == 0) {
if (dg->dg_ntracks && dg->dg_nsectors)
dg->dg_ncylinders = dg->dg_secperunit /
(dg->dg_ntracks * dg->dg_nsectors);
}
prop_dictionary_t disk_info, odisk_info, geom;
disk_info = prop_dictionary_create();
geom = prop_dictionary_create();
prop_dictionary_set_uint64(geom, "sectors-per-unit",
dg->dg_secperunit);
prop_dictionary_set_uint32(geom, "sector-size", dg->dg_secsize);
if (dg->dg_nsectors)
prop_dictionary_set_uint16(geom, "sectors-per-track",
dg->dg_nsectors);
if (dg->dg_ntracks)
prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
dg->dg_ntracks);
if (dg->dg_ncylinders)
prop_dictionary_set_uint64(geom, "cylinders-per-unit",
dg->dg_ncylinders);
prop_dictionary_set(disk_info, "geometry", geom);
if (type)
prop_dictionary_set_string_nocopy(disk_info, "type", type);
prop_object_release(geom);
odisk_info = dk->dk_info;
dk->dk_info = disk_info;
if (dev)
prop_dictionary_set(device_properties(dev), "disk-info",
disk_info);
/*
* Don't release disk_info here; we keep a reference to it.
* disk_detach() will release it when we go away.
*/
if (odisk_info)
prop_object_release(odisk_info);
}
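/*
 * Serialization sketch for the rule noted above disk_set_info, for a
 * hypothetical driver that can re-read its geometry while the device
 * is open and being ioctl'd:
 *
 *	mutex_enter(&sc->sc_dk.dk_openlock);
 *	disk_set_info(sc->sc_dev, &sc->sc_dk, "ESDI");
 *	mutex_exit(&sc->sc_dk.dk_openlock);
 */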
int
disklabel_dev_unit(dev_t dev)
{
return DISKUNIT(dev);
}
/* $NetBSD: tmpfs_fifoops.c,v 1.15 2021/07/19 01:30:25 dholland Exp $ */
/*
* Copyright (c) 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal, developed as part of Google's Summer of Code
* 2005 program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* tmpfs vnode interface for named pipes.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tmpfs_fifoops.c,v 1.15 2021/07/19 01:30:25 dholland Exp $");
#include <sys/param.h>
#include <sys/vnode.h>
#include <fs/tmpfs/tmpfs.h>
#include <fs/tmpfs/tmpfs_fifoops.h>
/*
* vnode operations vector used for fifos stored in a tmpfs file system.
*/
int (**tmpfs_fifoop_p)(void *);
const struct vnodeopv_entry_desc tmpfs_fifoop_entries[] = {
{ &vop_default_desc, vn_default_error },
GENFS_FIFOOP_ENTRIES,
{ &vop_close_desc, tmpfs_fifo_close },
{ &vop_access_desc, tmpfs_access },
{ &vop_accessx_desc, genfs_accessx },
{ &vop_getattr_desc, tmpfs_getattr },
{ &vop_setattr_desc, tmpfs_setattr },
{ &vop_read_desc, tmpfs_fifo_read },
{ &vop_write_desc, tmpfs_fifo_write },
{ &vop_fcntl_desc, genfs_fcntl },
{ &vop_fsync_desc, vn_fifo_bypass },
{ &vop_inactive_desc, tmpfs_inactive },
{ &vop_reclaim_desc, tmpfs_reclaim },
{ &vop_lock_desc, genfs_lock },
{ &vop_unlock_desc, genfs_unlock },
{ &vop_strategy_desc, vn_fifo_bypass },
{ &vop_print_desc, tmpfs_print },
{ &vop_islocked_desc, genfs_islocked },
{ &vop_bwrite_desc, genfs_nullop },
{ NULL, NULL }
};
const struct vnodeopv_desc tmpfs_fifoop_opv_desc = {
&tmpfs_fifoop_p, tmpfs_fifoop_entries
};
int
tmpfs_fifo_close(void *v)
{
struct vop_close_args /* {
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap __unused = v;
return VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), v);
}
int
tmpfs_fifo_read(void *v)
{
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
tmpfs_update(vp, TMPFS_UPDATE_ATIME);
return VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), v);
}
int
tmpfs_fifo_write(void *v)
{
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
tmpfs_update(vp, TMPFS_UPDATE_MTIME);
return VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), v);
}
/* $NetBSD: kern_cpu.c,v 1.97 2023/09/02 17:44:59 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2010, 2012, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c)2007 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* CPU related routines not shared with rump.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.97 2023/09/02 17:44:59 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_cpu_ucode.h"
#include "opt_heartbeat.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/idle.h>
#include <sys/sched.h>
#include <sys/intr.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/cpuio.h>
#include <sys/proc.h>
#include <sys/percpu.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/xcall.h>
#include <sys/pool.h>
#include <sys/kmem.h>
#include <sys/select.h>
#include <sys/namei.h>
#include <sys/callout.h>
#include <sys/pcu.h>
#include <sys/heartbeat.h>
#include <uvm/uvm_extern.h>
#include "ioconf.h"
/*
* If the port has stated that cpu_data is the first thing in cpu_info,
* verify that the claim is true. This will prevent them from getting out
* of sync.
*/
#ifdef __HAVE_CPU_DATA_FIRST
CTASSERT(offsetof(struct cpu_info, ci_data) == 0);
#else
CTASSERT(offsetof(struct cpu_info, ci_data) != 0);
#endif
int (*compat_cpuctl_ioctl)(struct lwp *, u_long, void *) = (void *)enosys;
static void cpu_xc_online(struct cpu_info *, void *);
static void cpu_xc_offline(struct cpu_info *, void *);
dev_type_ioctl(cpuctl_ioctl);
const struct cdevsw cpuctl_cdevsw = {
.d_open = nullopen,
.d_close = nullclose,
.d_read = nullread,
.d_write = nullwrite,
.d_ioctl = cpuctl_ioctl,
.d_stop = nullstop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
int
mi_cpu_attach(struct cpu_info *ci)
{
int error;
KASSERT(maxcpus > 0);
if ((ci->ci_index = ncpu) >= maxcpus)
panic("Too many CPUs. Increase MAXCPUS?");
kcpuset_set(kcpuset_attached, cpu_index(ci));
/*
* Create a convenience cpuset of just ourselves.
*/
kcpuset_create(&ci->ci_kcpuset, true);
kcpuset_set(ci->ci_kcpuset, cpu_index(ci));
TAILQ_INIT(&ci->ci_data.cpu_ld_locks);
__cpu_simple_lock_init(&ci->ci_data.cpu_ld_lock);
/* This is useful for, e.g., per-CPU evcnt. */
snprintf(ci->ci_data.cpu_name, sizeof(ci->ci_data.cpu_name), "cpu%d",
cpu_index(ci));
if (__predict_false(cpu_infos == NULL)) {
size_t ci_bufsize = (maxcpus + 1) * sizeof(struct cpu_info *);
cpu_infos = kmem_zalloc(ci_bufsize, KM_SLEEP);
}
cpu_infos[cpu_index(ci)] = ci;
sched_cpuattach(ci);
error = create_idle_lwp(ci);
if (error != 0) {
/* XXX revert sched_cpuattach */
return error;
}
if (ci == curcpu())
ci->ci_onproc = curlwp;
else
ci->ci_onproc = ci->ci_data.cpu_idlelwp;
percpu_init_cpu(ci);
softint_init(ci);
callout_init_cpu(ci);
xc_init_cpu(ci);
pool_cache_cpu_init(ci);
selsysinit(ci);
cache_cpu_init(ci);
TAILQ_INIT(&ci->ci_data.cpu_biodone);
ncpu++;
ncpuonline++;
return 0;
}
void
cpuctlattach(int dummy __unused)
{
KASSERT(cpu_infos != NULL);
}
int
cpuctl_ioctl(dev_t dev, u_long cmd, void *data, int flag, lwp_t *l)
{
CPU_INFO_ITERATOR cii;
cpustate_t *cs;
struct cpu_info *ci;
int error, i;
u_int id;
error = 0;
mutex_enter(&cpu_lock);
switch (cmd) {
case IOC_CPU_SETSTATE:
cs = data;
error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_CPU, KAUTH_REQ_SYSTEM_CPU_SETSTATE, cs, NULL,
NULL);
if (error != 0)
break;
if (cs->cs_id >= maxcpus ||
(ci = cpu_lookup(cs->cs_id)) == NULL) {
error = ESRCH;
break;
}
cpu_setintr(ci, cs->cs_intr); /* XXX neglect errors */
error = cpu_setstate(ci, cs->cs_online);
break;
case IOC_CPU_GETSTATE:
cs = data;
id = cs->cs_id;
memset(cs, 0, sizeof(*cs));
cs->cs_id = id;
if (cs->cs_id >= maxcpus ||
(ci = cpu_lookup(id)) == NULL) {
error = ESRCH;
break;
}
if ((ci->ci_schedstate.spc_flags & SPCF_OFFLINE) != 0)
cs->cs_online = false;
else
cs->cs_online = true;
if ((ci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0)
cs->cs_intr = false;
else
cs->cs_intr = true;
cs->cs_lastmod = (int32_t)ci->ci_schedstate.spc_lastmod;
cs->cs_lastmodhi = (int32_t)
(ci->ci_schedstate.spc_lastmod >> 32);
cs->cs_intrcnt = cpu_intr_count(ci) + 1;
cs->cs_hwid = ci->ci_cpuid;
break;
case IOC_CPU_MAPID:
i = 0;
for (CPU_INFO_FOREACH(cii, ci)) {
if (i++ == *(int *)data)
break;
}
if (ci == NULL)
error = ESRCH;
else
*(int *)data = cpu_index(ci);
break;
case IOC_CPU_GETCOUNT:
*(int *)data = ncpu;
break;
#ifdef CPU_UCODE
case IOC_CPU_UCODE_GET_VERSION:
error = cpu_ucode_get_version((struct cpu_ucode_version *)data);
break;
case IOC_CPU_UCODE_APPLY:
error = kauth_authorize_machdep(l->l_cred,
KAUTH_MACHDEP_CPU_UCODE_APPLY,
NULL, NULL, NULL, NULL);
if (error != 0)
break;
error = cpu_ucode_apply((const struct cpu_ucode *)data);
break;
#endif
default:
error = (*compat_cpuctl_ioctl)(l, cmd, data);
break;
}
mutex_exit(&cpu_lock);
return error;
}
struct cpu_info *
cpu_lookup(u_int idx)
{
struct cpu_info *ci;
/*
* cpu_infos is a NULL terminated array of MAXCPUS + 1 entries,
* so an index of MAXCPUS here is ok. See mi_cpu_attach.
*/
KASSERT(idx <= maxcpus);
if (__predict_false(cpu_infos == NULL)) {
KASSERT(idx == 0);
return curcpu();
}
ci = cpu_infos[idx];
KASSERT(ci == NULL || cpu_index(ci) == idx);
KASSERTMSG(idx < maxcpus || ci == NULL, "idx %d ci %p", idx, ci);
return ci;
}
static void
cpu_xc_offline(struct cpu_info *ci, void *unused)
{
struct schedstate_percpu *spc, *mspc = NULL;
struct cpu_info *target_ci;
struct lwp *l;
CPU_INFO_ITERATOR cii;
int s;
/*
* Thread that made the cross call (separate context) holds
* cpu_lock on our behalf.
*/
spc = &ci->ci_schedstate;
s = splsched();
spc->spc_flags |= SPCF_OFFLINE;
splx(s);
/* Take the first available CPU for the migration. */
for (CPU_INFO_FOREACH(cii, target_ci)) {
mspc = &target_ci->ci_schedstate;
if ((mspc->spc_flags & SPCF_OFFLINE) == 0)
break;
}
KASSERT(target_ci != NULL);
/*
* Migrate all non-bound threads to the other CPU. Note that this
* runs from the xcall thread, thus handling of LSONPROC is not needed.
*/
mutex_enter(&proc_lock);
LIST_FOREACH(l, &alllwp, l_list) {
struct cpu_info *mci;
lwp_lock(l);
if (l->l_cpu != ci || (l->l_pflag & (LP_BOUND | LP_INTR))) {
lwp_unlock(l);
continue;
}
/* Regular case - no affinity. */
if (l->l_affinity == NULL) {
lwp_migrate(l, target_ci);
continue;
}
/* Affinity is set, find an online CPU in the set. */
for (CPU_INFO_FOREACH(cii, mci)) {
mspc = &mci->ci_schedstate;
if ((mspc->spc_flags & SPCF_OFFLINE) == 0 &&
kcpuset_isset(l->l_affinity, cpu_index(mci)))
break;
}
if (mci == NULL) {
lwp_unlock(l);
mutex_exit(&proc_lock);
goto fail;
}
lwp_migrate(l, mci);
}
mutex_exit(&proc_lock);
#if PCU_UNIT_COUNT > 0
pcu_save_all_on_cpu();
#endif
heartbeat_suspend();
#ifdef __HAVE_MD_CPU_OFFLINE
cpu_offline_md();
#endif
return;
fail:
/* Just unset the SPCF_OFFLINE flag, caller will check */
s = splsched();
spc->spc_flags &= ~SPCF_OFFLINE;
splx(s);
}
static void
cpu_xc_online(struct cpu_info *ci, void *unused)
{
struct schedstate_percpu *spc;
int s;
heartbeat_resume();
spc = &ci->ci_schedstate;
s = splsched();
spc->spc_flags &= ~SPCF_OFFLINE;
splx(s);
}
int
cpu_setstate(struct cpu_info *ci, bool online)
{
struct schedstate_percpu *spc;
CPU_INFO_ITERATOR cii;
struct cpu_info *ci2;
uint64_t where;
xcfunc_t func;
int nonline;
spc = &ci->ci_schedstate;
KASSERT(mutex_owned(&cpu_lock));
if (online) {
if ((spc->spc_flags & SPCF_OFFLINE) == 0)
return 0;
func = (xcfunc_t)cpu_xc_online;
} else {
if ((spc->spc_flags & SPCF_OFFLINE) != 0)
return 0;
nonline = 0;
/*
* Ensure that at least one CPU within the processor set
* stays online. Revisit this later.
*/
for (CPU_INFO_FOREACH(cii, ci2)) {
if ((ci2->ci_schedstate.spc_flags & SPCF_OFFLINE) != 0)
continue;
if (ci2->ci_schedstate.spc_psid != spc->spc_psid)
continue;
nonline++;
}
if (nonline == 1)
return EBUSY;
func = (xcfunc_t)cpu_xc_offline;
}
where = xc_unicast(0, func, ci, NULL, ci);
xc_wait(where);
if (online) {
KASSERT((spc->spc_flags & SPCF_OFFLINE) == 0);
ncpuonline++;
} else {
if ((spc->spc_flags & SPCF_OFFLINE) == 0) {
/* If it was not set offline, then it is busy. */
return EBUSY;
}
ncpuonline--;
}
spc->spc_lastmod = time_second;
return 0;
}
#if defined(__HAVE_INTR_CONTROL)
static void
cpu_xc_intr(struct cpu_info *ci, void *unused)
{
struct schedstate_percpu *spc;
int s;
spc = &ci->ci_schedstate;
s = splsched();
spc->spc_flags &= ~SPCF_NOINTR;
splx(s);
}
static void
cpu_xc_nointr(struct cpu_info *ci, void *unused)
{
struct schedstate_percpu *spc;
int s;
spc = &ci->ci_schedstate;
s = splsched();
spc->spc_flags |= SPCF_NOINTR;
splx(s);
}
int
cpu_setintr(struct cpu_info *ci, bool intr)
{
struct schedstate_percpu *spc;
CPU_INFO_ITERATOR cii;
struct cpu_info *ci2;
uint64_t where;
xcfunc_t func;
int nintr;
spc = &ci->ci_schedstate;
KASSERT(mutex_owned(&cpu_lock));
if (intr) {
if ((spc->spc_flags & SPCF_NOINTR) == 0)
return 0;
func = (xcfunc_t)cpu_xc_intr;
} else {
if (CPU_IS_PRIMARY(ci)) /* XXX kern/45117 */
return EINVAL;
if ((spc->spc_flags & SPCF_NOINTR) != 0)
return 0;
/*
* Ensure that at least one CPU within the system
* is handling device interrupts.
*/
nintr = 0;
for (CPU_INFO_FOREACH(cii, ci2)) {
if ((ci2->ci_schedstate.spc_flags & SPCF_NOINTR) != 0)
continue;
if (ci2 == ci)
continue;
nintr++;
}
if (nintr == 0)
return EBUSY;
func = (xcfunc_t)cpu_xc_nointr;
}
where = xc_unicast(0, func, ci, NULL, ci);
xc_wait(where);
if (intr) {
KASSERT((spc->spc_flags & SPCF_NOINTR) == 0);
} else if ((spc->spc_flags & SPCF_NOINTR) == 0) {
/* If interrupts were not shut off, then it is busy. */
return EBUSY;
}
/* Direct interrupts away from the CPU and record the change. */
cpu_intr_redistribute();
spc->spc_lastmod = time_second;
return 0;
}
#else /* __HAVE_INTR_CONTROL */
int
cpu_setintr(struct cpu_info *ci, bool intr)
{
return EOPNOTSUPP;
}
u_int
cpu_intr_count(struct cpu_info *ci)
{
return 0; /* 0 == "don't know" */
}
#endif /* __HAVE_INTR_CONTROL */
#ifdef CPU_UCODE
int
cpu_ucode_load(struct cpu_ucode_softc *sc, const char *fwname)
{
firmware_handle_t fwh;
int error;
if (sc->sc_blob != NULL) {
firmware_free(sc->sc_blob, sc->sc_blobsize);
sc->sc_blob = NULL;
sc->sc_blobsize = 0;
}
error = cpu_ucode_md_open(&fwh, sc->loader_version, fwname);
if (error != 0) {
#ifdef DEBUG
printf("ucode: firmware_open(%s) failed: %i\n", fwname, error);
#endif
goto err0;
}
sc->sc_blobsize = firmware_get_size(fwh);
if (sc->sc_blobsize == 0) {
error = EFTYPE;
firmware_close(fwh);
goto err0;
}
sc->sc_blob = firmware_malloc(sc->sc_blobsize);
if (sc->sc_blob == NULL) {
error = ENOMEM;
firmware_close(fwh);
goto err0;
}
error = firmware_read(fwh, 0, sc->sc_blob, sc->sc_blobsize);
firmware_close(fwh);
if (error != 0)
goto err1;
return 0;
err1:
firmware_free(sc->sc_blob, sc->sc_blobsize);
sc->sc_blob = NULL;
sc->sc_blobsize = 0;
err0:
return error;
}
#endif
/* $NetBSD: st.c,v 1.243 2022/02/23 21:54:41 andvar Exp $ */
/*-
* Copyright (c) 1998, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Originally written by Julian Elischer (julian@tfs.com)
* for TRW Financial Systems for use under the MACH(2.5) operating system.
*
* TRW Financial Systems, in accordance with their agreement with Carnegie
* Mellon University, makes this software available to CMU to distribute
* or use in any manner that they see fit as long as this message is kept with
* the software. For this reason TFS also grants any other persons or
* organisations permission to use or modify this software.
*
* TFS supplies this software to be publicly redistributed
* on the understanding that TFS is not responsible for the correct
* functioning of this software in any circumstances.
*
* Ported to run under 386BSD by Julian Elischer (julian@tfs.com) Sept 1992
* major changes by Julian Elischer (julian@jules.dialix.oz.au) May 1993
*
* A lot of rewhacking done by mjacob (mjacob@nas.nasa.gov).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: st.c,v 1.243 2022/02/23 21:54:41 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_scsi.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/malloc.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/proc.h>
#include <sys/mtio.h>
#include <sys/device.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/iostat.h>
#include <sys/sysctl.h>
#include <dev/scsipi/scsi_spc.h>
#include <dev/scsipi/scsipi_all.h>
#include <dev/scsipi/scsi_all.h>
#include <dev/scsipi/scsi_tape.h>
#include <dev/scsipi/scsipiconf.h>
#include <dev/scsipi/scsipi_base.h>
#include <dev/scsipi/stvar.h>
/* Defines for device specific stuff */
#define DEF_FIXED_BSIZE 512
#define STMODE(z) ( minor(z) & 0x03)
#define STDSTY(z) ((minor(z) >> 2) & 0x03)
#define STUNIT(z) ((minor(z) >> 4) )
#define STNMINOR 16
#define NORMAL_MODE 0
#define NOREW_MODE 1
#define EJECT_MODE 2
#define CTRL_MODE 3
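/*
* Illustrative example (not from the original source): a hypothetical
* minor number 0x17 (binary 1 0111) decodes with the macros above as
* STMODE = 3 (CTRL_MODE), STDSTY = 1 and STUNIT = 1, i.e. the
* control-mode node for density slot 1 of unit 1.
*/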
#ifndef ST_MOUNT_DELAY
#define ST_MOUNT_DELAY 0
#endif
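/*
* Note: ST_MOUNT_DELAY defaults to 0 (no retries).  It is meant to be
* overridden at kernel configuration time, e.g. (assuming the usual
* options syntax) "options ST_MOUNT_DELAY=5" to retry TEST UNIT READY
* at open time up to 5 times, about one second apart.
*/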
static dev_type_open(stopen);
static dev_type_close(stclose);
static dev_type_read(stread);
static dev_type_write(stwrite);
static dev_type_ioctl(stioctl);
static dev_type_strategy(ststrategy);
static dev_type_dump(stdump);
const struct bdevsw st_bdevsw = {
.d_open = stopen,
.d_close = stclose,
.d_strategy = ststrategy,
.d_ioctl = stioctl,
.d_dump = stdump,
.d_psize = nosize,
.d_discard = nodiscard,
.d_flag = D_TAPE | D_MPSAFE
};
const struct cdevsw st_cdevsw = {
.d_open = stopen,
.d_close = stclose,
.d_read = stread,
.d_write = stwrite,
.d_ioctl = stioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_TAPE | D_MPSAFE
};
/*
* Define various devices that we know mis-behave in some way,
* and note how they are bad, so we can correct for them
*/
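/*
* Illustrative note (layout inferred from st_identify_drive() and
* st_loadquirks() below): each entry pairs a SCSI inquiry pattern with
* drive-wide quirks, a vendor-unique mode page 0 size, and four
* per-density override slots, one per group of minors (0-3, 4-7, 8-11,
* 12-15).  Each slot gives {quirks, forced block size, density code};
* zero means "no override".
*/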
static const struct st_quirk_inquiry_pattern st_quirk_patterns[] = {
{{T_SEQUENTIAL, T_REMOV,
" ", " ", " "}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, QIC_24}, /* minor 4-7 */
{ST_Q_FORCE_BLKSIZE, 0, HALFINCH_1600}, /* minor 8-11 */
{ST_Q_FORCE_BLKSIZE, 0, HALFINCH_6250} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"TANDBERG", " TDC 3600 ", ""}, {0, 12, {
{0, 0, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"TANDBERG", " TDC 3800 ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{0, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"TANDBERG", " SLR5 4/8GB ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 1024, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
/*
* Lacking a manual for the 4200, it's not clear what the
* specific density codes should be; the device is a 2.5GB-
* capable QIC drive, and those density codes aren't readily
* available. The 'default' will just have to do.
*/
{{T_SEQUENTIAL, T_REMOV,
"TANDBERG", " TDC 4200 ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{0, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
/*
* At least -005 and -007 need this. I'll assume they all do unless I
* hear otherwise. - mycroft, 31MAR1994
*/
{{T_SEQUENTIAL, T_REMOV,
"ARCHIVE ", "VIPER 2525 25462", ""}, {0, 0, {
{ST_Q_SENSE_HELP, 0, 0}, /* minor 0-3 */
{ST_Q_SENSE_HELP, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
/*
* One user reports that this works for his tape drive. It probably
* needs more work. - mycroft, 09APR1994
*/
{{T_SEQUENTIAL, T_REMOV,
"SANKYO ", "CP525 ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"ANRITSU ", "DMT780 ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"ARCHIVE ", "VIPER 150 21247", ""}, {ST_Q_ERASE_NOIMM, 12, {
{ST_Q_SENSE_HELP, 0, 0}, /* minor 0-3 */
{0, 0, QIC_150}, /* minor 4-7 */
{0, 0, QIC_120}, /* minor 8-11 */
{0, 0, QIC_24} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"ARCHIVE ", "VIPER 150 21531", ""}, {ST_Q_ERASE_NOIMM, 12, {
{ST_Q_SENSE_HELP, 0, 0}, /* minor 0-3 */
{0, 0, QIC_150}, /* minor 4-7 */
{0, 0, QIC_120}, /* minor 8-11 */
{0, 0, QIC_24} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"WANGTEK ", "5099ES SCSI", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{0, 0, QIC_11}, /* minor 4-7 */
{0, 0, QIC_24}, /* minor 8-11 */
{0, 0, QIC_24} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"WANGTEK ", "5150ES SCSI", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{0, 0, QIC_24}, /* minor 4-7 */
{0, 0, QIC_120}, /* minor 8-11 */
{0, 0, QIC_150} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"WANGTEK ", "5525ES SCSI REV7", ""}, {0, 0, {
{0, 0, 0}, /* minor 0-3 */
{ST_Q_BLKSIZE, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"WangDAT ", "Model 1300 ", ""}, {0, 0, {
{0, 0, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, DDS}, /* minor 4-7 */
{ST_Q_FORCE_BLKSIZE, 1024, DDS}, /* minor 8-11 */
{ST_Q_FORCE_BLKSIZE, 0, DDS} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"EXABYTE ", "EXB-8200 ", "263H"}, {0, 5, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"STK", "9490", ""},
{ST_Q_FORCE_BLKSIZE, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"STK", "SD-3", ""},
{ST_Q_FORCE_BLKSIZE, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"IBM", "03590", ""}, {ST_Q_IGNORE_LOADS, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"HP ", "T4000s ", ""}, {ST_Q_UNIMODAL, 0, {
{0, 0, QIC_3095}, /* minor 0-3 */
{0, 0, QIC_3095}, /* minor 4-7 */
{0, 0, QIC_3095}, /* minor 8-11 */
{0, 0, QIC_3095}, /* minor 12-15 */
}}},
#if 0
{{T_SEQUENTIAL, T_REMOV,
"EXABYTE ", "EXB-8200 ", ""}, {0, 12, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
#endif
{{T_SEQUENTIAL, T_REMOV,
"TEAC ", "MT-2ST/N50 ", ""}, {ST_Q_IGNORE_LOADS, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"OnStream", "ADR50 Drive", ""}, {ST_Q_UNIMODAL, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 4-7 */
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 8-11 */
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"OnStream DI-30", "", "1.0"}, {ST_Q_NOFILEMARKS, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"NCR H621", "0-STD-03-46F880 ", ""}, {ST_Q_NOPREVENT, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"Seagate STT3401A", "hp0atxa", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 1024, 0}, /* minor 4-7 */
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 8-11 */
{ST_Q_FORCE_BLKSIZE, 512, 0} /* minor 12-15 */
}}},
};
#define NOEJECT 0
#define EJECT 1
static void st_identify_drive(struct st_softc *,
struct scsipi_inquiry_pattern *);
static void st_loadquirks(struct st_softc *);
static int st_mount_tape(dev_t, int);
static void st_unmount(struct st_softc *, boolean);
static int st_decide_mode(struct st_softc *, boolean);
static void ststart(struct scsipi_periph *);
static int ststart1(struct scsipi_periph *, struct buf *, int *);
static void strestart(void *);
static void stdone(struct scsipi_xfer *, int);
static int st_read(struct st_softc *, char *, int, int);
static int st_space(struct st_softc *, int, u_int, int);
static int st_write_filemarks(struct st_softc *, int, int);
static int st_check_eod(struct st_softc *, boolean, int *, int);
static int st_load(struct st_softc *, u_int, int);
static int st_rewind(struct st_softc *, u_int, int);
static int st_interpret_sense(struct scsipi_xfer *);
static int st_touch_tape(struct st_softc *);
static int st_erase(struct st_softc *, int full, int flags);
static void st_updatefilepos(struct st_softc *);
static int st_rdpos(struct st_softc *, int, uint32_t *);
static int st_setpos(struct st_softc *, int, uint32_t *);
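/*
* Periph switch hooks consumed by scsipi(9); in order (an assumption
* about the field order of struct scsipi_periphsw): sense interpretation,
* queue (re)start, async event notification (unused here) and transfer
* completion.
*/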
static const struct scsipi_periphsw st_switch = {
st_interpret_sense,
ststart,
NULL,
stdone
};
#if defined(ST_ENABLE_EARLYWARN)
#define ST_INIT_FLAGS ST_EARLYWARN
#else
#define ST_INIT_FLAGS 0
#endif
/*
* The routine called by the low level scsi routine when it discovers
* a device suitable for this driver.
*/
void
stattach(device_t parent, device_t self, void *aux)
{
struct st_softc *st = device_private(self);
struct scsipibus_attach_args *sa = aux;
struct scsipi_periph *periph = sa->sa_periph;
SC_DEBUG(periph, SCSIPI_DB2, ("stattach: "));
st->sc_dev = self;
/* Store information needed to contact our base driver */
st->sc_periph = periph;
periph->periph_dev = st->sc_dev;
periph->periph_switch = &st_switch;
/* Set initial flags */
st->flags = ST_INIT_FLAGS;
/* Set up the buf queues for this device */
bufq_alloc(&st->buf_queue, "fcfs", 0);
bufq_alloc(&st->buf_defer, "fcfs", 0);
callout_init(&st->sc_callout, 0);
mutex_init(&st->sc_iolock, MUTEX_DEFAULT, IPL_VM);
/*
* Check if the drive is a known criminal and take
* any steps needed to bring it into line.
*/
st_identify_drive(st, &sa->sa_inqbuf);
aprint_naive("\n");
aprint_normal("\n");
/* Use the subdriver to request information regarding the drive. */
aprint_normal_dev(self, "%s", st->quirkdata ? "quirks apply, " : "");
if (scsipi_test_unit_ready(periph,
XS_CTL_DISCOVERY | XS_CTL_SILENT | XS_CTL_IGNORE_MEDIA_CHANGE) ||
st->ops(st, ST_OPS_MODESENSE,
XS_CTL_DISCOVERY | XS_CTL_SILENT | XS_CTL_IGNORE_MEDIA_CHANGE))
aprint_normal("drive empty\n");
else {
aprint_normal("density code %d, ", st->media_density);
if (st->media_blksize > 0)
aprint_normal("%d-byte", st->media_blksize);
else
aprint_normal("variable");
aprint_normal(" blocks, write-%s\n",
(st->flags & ST_READONLY) ? "protected" : "enabled");
}
st->stats = iostat_alloc(IOSTAT_TAPE, parent,
device_xname(st->sc_dev));
rnd_attach_source(&st->rnd_source, device_xname(st->sc_dev),
RND_TYPE_TAPE, RND_FLAG_DEFAULT);
}
int
stdetach(device_t self, int flags)
{
struct st_softc *st = device_private(self);
struct scsipi_periph *periph = st->sc_periph;
struct scsipi_channel *chan = periph->periph_channel;
int bmaj, cmaj, mn;
/* locate the major number */
bmaj = bdevsw_lookup_major(&st_bdevsw);
cmaj = cdevsw_lookup_major(&st_cdevsw);
/* kill any pending restart */
callout_halt(&st->sc_callout, NULL);
mutex_enter(chan_mtx(chan));
/* Kill off any queued buffers. */
bufq_drain(st->buf_defer);
bufq_drain(st->buf_queue);
/* Kill off any pending commands. */
scsipi_kill_pending(st->sc_periph);
mutex_exit(chan_mtx(chan));
bufq_free(st->buf_defer);
bufq_free(st->buf_queue);
mutex_destroy(&st->sc_iolock);
/* Nuke the vnodes for any open instances */
mn = STUNIT(device_unit(self));
vdevgone(bmaj, mn, mn+STNMINOR-1, VBLK);
vdevgone(cmaj, mn, mn+STNMINOR-1, VCHR);
iostat_free(st->stats);
/* Unhook the entropy source. */
rnd_detach_source(&st->rnd_source);
return 0;
}
/*
* Use the inquiry routine in 'scsi_base' to get drive info so we can
* further tailor our behaviour.
*/
static void
st_identify_drive(struct st_softc *st, struct scsipi_inquiry_pattern *inqbuf)
{
const struct st_quirk_inquiry_pattern *finger;
int priority;
finger = scsipi_inqmatch(inqbuf,
st_quirk_patterns,
sizeof(st_quirk_patterns) / sizeof(st_quirk_patterns[0]),
sizeof(st_quirk_patterns[0]), &priority);
if (priority != 0) {
st->quirkdata = &finger->quirkdata;
st->drive_quirks = finger->quirkdata.quirks;
st->quirks = finger->quirkdata.quirks; /* start value */
st->page_0_size = finger->quirkdata.page_0_size;
KASSERT(st->page_0_size <= MAX_PAGE_0_SIZE);
st_loadquirks(st);
}
}
/*
* initialise the subdevices to the default (QUIRK) state.
* this will remove any setting made by the system operator or previous
* operations.
*/
static void
st_loadquirks(struct st_softc *st)
{
const struct modes *mode;
struct modes *mode2;
int i;
mode = st->quirkdata->modes;
mode2 = st->modes;
for (i = 0; i < 4; i++) {
memset(mode2, 0, sizeof(struct modes));
st->modeflags[i] &= ~(BLKSIZE_SET_BY_QUIRK |
DENSITY_SET_BY_QUIRK | BLKSIZE_SET_BY_USER |
DENSITY_SET_BY_USER);
if ((mode->quirks | st->drive_quirks) & ST_Q_FORCE_BLKSIZE) {
mode2->blksize = mode->blksize;
st->modeflags[i] |= BLKSIZE_SET_BY_QUIRK;
}
if (mode->density) {
mode2->density = mode->density;
st->modeflags[i] |= DENSITY_SET_BY_QUIRK;
}
mode2->quirks |= mode->quirks;
mode++;
mode2++;
}
}
/* open the device. */
static int
stopen(dev_t dev, int flags, int mode, struct lwp *l)
{
u_int stmode, dsty;
int error, sflags, unit, tries, ntries;
struct st_softc *st;
struct scsipi_periph *periph;
struct scsipi_adapter *adapt;
unit = STUNIT(dev);
st = device_lookup_private(&st_cd, unit);
if (st == NULL)
return ENXIO;
stmode = STMODE(dev);
dsty = STDSTY(dev);
periph = st->sc_periph;
adapt = periph->periph_channel->chan_adapter;
SC_DEBUG(periph, SCSIPI_DB1,
("open: dev=0x%"PRIx64" (unit %d (of %d))\n", dev, unit,
st_cd.cd_ndevs));
/* Only allow one at a time */
if (periph->periph_flags & PERIPH_OPEN) {
aprint_error_dev(st->sc_dev, "already open\n");
return EBUSY;
}
if ((error = scsipi_adapter_addref(adapt)) != 0)
return error;
/* clear any latched errors. */
st->mt_resid = 0;
st->mt_erreg = 0;
st->asc = 0;
st->ascq = 0;
/*
* Catch any unit attention errors. Be silent about this
* unless we're already mounted. We ignore media change
* if we're in control mode or not mounted yet.
*/
if ((st->flags & ST_MOUNTED) == 0 || stmode == CTRL_MODE) {
#ifdef SCSIDEBUG
sflags = XS_CTL_IGNORE_MEDIA_CHANGE;
#else
sflags = XS_CTL_SILENT|XS_CTL_IGNORE_MEDIA_CHANGE;
#endif
} else
sflags = 0;
/*
* If we're already mounted or we aren't configured for
* a mount delay, only try a test unit ready once. Otherwise,
* try up to ST_MOUNT_DELAY times with a rest interval of
* one second between each try.
*/
if ((st->flags & ST_MOUNTED) || ST_MOUNT_DELAY == 0)
ntries = 1;
else
ntries = ST_MOUNT_DELAY;
for (error = tries = 0; tries < ntries; tries++) {
int slpintr, oflags;
/*
* If we had no error, or we're opening the control mode
* device, we jump out right away.
*/
error = scsipi_test_unit_ready(periph, sflags);
if (error == 0 || stmode == CTRL_MODE)
break;
/*
* We had an error.
*
* If we're already mounted or we aren't configured for
* a mount delay, or the error isn't a NOT READY error,
* skip to the error exit now.
*/
if ((st->flags & ST_MOUNTED) || ST_MOUNT_DELAY == 0 ||
(st->mt_key != SKEY_NOT_READY)) {
device_printf(st->sc_dev,
"mount error (sense key=%d) - "
"terminating mount session\n",
st->mt_key);
/*
* the following should not trigger unless
* something serious happened while the device
* was open (PREVENT MEDIUM REMOVAL in effect)
*/
if (st->flags & ST_WRITTEN &&
st->mt_key == SKEY_UNIT_ATTENTION) {
/*
* device / media state may have changed
* refrain from writing missing file marks
* onto potentially newly inserted/formatted
* media (e. g. emergency EJECT/RESET/etc.)
*/
st->flags &= ~(ST_WRITTEN|ST_FM_WRITTEN);
device_printf(st->sc_dev,
"CAUTION: file marks/data may be missing"
" - ASC = 0x%02x, ASCQ = 0x%02x\n",
st->asc, st->ascq);
}
goto bad;
}
/* clear any latched errors. */
st->mt_resid = 0;
st->mt_erreg = 0;
st->asc = 0;
st->ascq = 0;
/*
* Fake that we have the device open so
* we block other apps from getting in.
*/
oflags = periph->periph_flags;
periph->periph_flags |= PERIPH_OPEN;
slpintr = kpause("stload", true, hz, NULL);
periph->periph_flags = oflags; /* restore flags */
if (slpintr != 0 && slpintr != EWOULDBLOCK) {
device_printf(st->sc_dev, "load interrupted\n");
goto bad;
}
}
/*
* If the mode is 3 (e.g. minor = 3,7,11,15) then the device has
* been opened to set defaults and perform other, usually non-I/O
* related, operations. In this case, do a quick check to see
* whether the unit actually had a tape loaded (this is known from
* whether or not we got a NOT READY for the above
* unit attention). If a tape is there, go do a mount sequence.
*/
if (stmode == CTRL_MODE && st->mt_key != SKEY_NO_SENSE &&
st->mt_key != SKEY_UNIT_ATTENTION) {
periph->periph_flags |= PERIPH_OPEN;
return 0;
}
/*
* If we get this far and had an error set, that means we failed
* to pass the 'test unit ready' test for the non-controlmode device,
* so we bounce the open.
*/
if (error)
return error;
/* Else, we're now committed to saying we're open. */
periph->periph_flags |= PERIPH_OPEN; /* unit attn are now errors */
/*
* If it's a different mode, or if the media has been
* invalidated, unmount the tape from the previous
* session but continue with open processing
*/
if (st->last_dsty != dsty ||
(periph->periph_flags & PERIPH_MEDIA_LOADED) == 0)
st_unmount(st, NOEJECT);
/*
* If we are not mounted, then we should start a new
* mount session.
*/
if (!(st->flags & ST_MOUNTED)) {
if ((error = st_mount_tape(dev, flags)) != 0)
goto bad;
st->last_dsty = dsty;
}
if (!(st->quirks & ST_Q_NOPREVENT)) {
scsipi_prevent(periph, SPAMR_PREVENT_DT,
XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_NOT_READY);
}
SC_DEBUG(periph, SCSIPI_DB2, ("open complete\n"));
return 0;
bad:
st_unmount(st, NOEJECT);
scsipi_adapter_delref(adapt);
periph->periph_flags &= ~PERIPH_OPEN;
return error;
}
static int
stclose(dev_t dev, int flags, int mode, struct lwp *l)
{
int stxx, error = 0;
struct st_softc *st = device_lookup_private(&st_cd, STUNIT(dev));
struct scsipi_periph *periph = st->sc_periph;
struct scsipi_adapter *adapt = periph->periph_channel->chan_adapter;
SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("closing\n"));
/*
* Make sure that a tape opened in write-only mode will have
* file marks written on it when closed, even if not written to.
*
* This is for SUN compatibility. Actually, the Sun way of
* things is to:
*
* only write filemarks if there are fmks to be written and
* - open for write (possibly read/write)
* - the last operation was a write
* or:
* - opened for wronly
* - no data was written (including filemarks)
*/
stxx = st->flags & (ST_WRITTEN | ST_FM_WRITTEN);
if ((flags & FWRITE) != 0) {
int nm = 0;
#ifdef ST_SUNCOMPAT
/*
* on request only
* original compat code has not been working
* since ~1998
*/
if ((flags & O_ACCMODE) == FWRITE && (stxx == 0)) {
st->flags |= ST_WRITTEN;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("SUN compatibility: write FM(s) at close\n"));
}
#endif
error = st_check_eod(st, FALSE, &nm, 0);
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("wrote %d FM(s) at close error=%d\n", nm, error));
}
/* Allow robots to eject tape if needed. */
if (!(st->quirks & ST_Q_NOPREVENT)) {
scsipi_prevent(periph, SPAMR_ALLOW,
XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_NOT_READY);
}
switch (STMODE(dev)) {
case NORMAL_MODE:
st_unmount(st, NOEJECT);
break;
case NOREW_MODE:
case CTRL_MODE:
/*
* Leave mounted unless media seems to have been removed.
*
* Otherwise, if we're to terminate a tape with more than one
* filemark [ and because we're not rewinding here ], backspace
* one filemark so that later appends will see an unbroken
* sequence of:
*
* file - FMK - file - FMK ... file - FMK FMK (EOM)
*/
if ((periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) {
st_unmount(st, NOEJECT);
} else if (error == 0) {
/*
* ST_WRITTEN was preserved from above.
*
* All we need to know here is:
*
* Were we writing this tape and was the last
* operation a write?
*
* Are there supposed to be 2FM at EOD?
*
* If both statements are true, then we backspace
* one filemark.
*/
stxx &= ~ST_FM_WRITTEN;
stxx |= (st->flags & ST_2FM_AT_EOD);
if ((flags & FWRITE) != 0 &&
(stxx == (ST_2FM_AT_EOD|ST_WRITTEN))) {
error = st_space(st, -1, SP_FILEMARKS, 0);
SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("st_space(-1) error=%d\n", error));
} else {
SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("no backspacing - flags = 0x%x, stxx=0x%x, st->flags=0x%x\n", flags, stxx, st->flags));
}
} else {
SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("error %d from st_check_eod\n", error));
}
break;
case EJECT_MODE:
st_unmount(st, EJECT);
break;
}
KASSERTMSG((st->flags & ST_WRITTEN) == 0,
"pending ST_WRITTEN flag NOT cleared (flags=0x%x)", st->flags);
scsipi_wait_drain(periph);
scsipi_adapter_delref(adapt);
periph->periph_flags &= ~PERIPH_OPEN;
return error;
}
/*
* Start a new mount session.
* Copy in all the default parameters from the selected device mode.
* and try to guess any that seem to be defaulted.
*/
static int
st_mount_tape(dev_t dev, int flags)
{
int unit;
u_int dsty;
struct st_softc *st;
struct scsipi_periph *periph;
int error = 0;
unit = STUNIT(dev);
dsty = STDSTY(dev);
st = device_lookup_private(&st_cd, unit);
periph = st->sc_periph;
if (st->flags & ST_MOUNTED)
return 0;
SC_DEBUG(periph, SCSIPI_DB1, ("mounting\n "));
st->flags |= ST_NEW_MOUNT;
st->quirks = st->drive_quirks | st->modes[dsty].quirks;
/*
* If the media is new, then make sure we give it a chance
* to do a 'load' instruction. (We assume it is new.)
*/
if ((error = st_load(st, LD_LOAD, XS_CTL_SILENT)) != 0)
return error;
/*
* Throw another dummy instruction to catch
* 'Unit attention' errors. Many drives give
* these after doing a Load instruction (with
* the MEDIUM MAY HAVE CHANGED asc/ascq).
*/
scsipi_test_unit_ready(periph, XS_CTL_SILENT); /* XXX */
/*
* Some devices can't tell you much until they have been
* asked to look at the media. This quirk does this.
*/
if (st->quirks & ST_Q_SENSE_HELP)
if ((error = st_touch_tape(st)) != 0)
return error;
/*
* Load the physical device parameters
* loads: blkmin, blkmax
*/
if ((error = st->ops(st, ST_OPS_RBL, 0)) != 0)
return error;
/*
* Load the media dependent parameters
* includes: media_blksize, media_density, numblks
* As we have a tape in, it should be reflected here.
* If not, you may need the "quirk" above.
*/
if ((error = st->ops(st, ST_OPS_MODESENSE, 0)) != 0)
return error;
/*
* If we have gained a permanent density from somewhere,
* then use it in preference to the one supplied by
* default by the driver.
*/
if (st->modeflags[dsty] & (DENSITY_SET_BY_QUIRK | DENSITY_SET_BY_USER))
st->density = st->modes[dsty].density;
else
st->density = st->media_density;
/*
* If we have gained a permanent blocksize
* then use it in preference to the one supplied by
* default by the driver.
*/
st->flags &= ~ST_FIXEDBLOCKS;
if (st->modeflags[dsty] &
(BLKSIZE_SET_BY_QUIRK | BLKSIZE_SET_BY_USER)) {
st->blksize = st->modes[dsty].blksize;
if (st->blksize)
st->flags |= ST_FIXEDBLOCKS;
} else {
if ((error = st_decide_mode(st, FALSE)) != 0)
return error;
}
if ((error = st->ops(st, ST_OPS_MODESELECT, 0)) != 0) {
/* ATAPI will return ENODEV for this, and this may be OK */
if (error != ENODEV) {
aprint_error_dev(st->sc_dev,
"cannot set selected mode\n");
return error;
}
}
st->flags &= ~ST_NEW_MOUNT;
st->flags |= ST_MOUNTED;
periph->periph_flags |= PERIPH_MEDIA_LOADED; /* move earlier? */
st->blkno = st->fileno = (daddr_t) 0;
return 0;
}
/*
* End the present mount session.
* Rewind, and optionally eject the tape.
* Reset various flags to indicate that all new
* operations require another mount operation
*/
static void
st_unmount(struct st_softc *st, boolean eject)
{
struct scsipi_periph *periph = st->sc_periph;
int nmarks;
if ((st->flags & ST_MOUNTED) == 0)
return;
SC_DEBUG(periph, SCSIPI_DB1, ("unmounting\n"));
st_check_eod(st, FALSE, &nmarks, XS_CTL_IGNORE_NOT_READY);
st_rewind(st, 0, XS_CTL_IGNORE_NOT_READY);
/*
* Section 9.3.3 of the SCSI specs states that a device shall return
* the density value specified in the last successful MODE SELECT
* after an unload operation, in case it is not able to
* automatically determine the density of the new medium.
*
* So we instruct the device to use the default density, which will
* prevent the use of stale density values (in particular,
* in st_touch_tape()).
*/
st->density = 0;
if (st->ops(st, ST_OPS_MODESELECT, 0) != 0) {
aprint_error_dev(st->sc_dev,
"WARNING: cannot revert to default density\n");
}
if (eject) {
if (!(st->quirks & ST_Q_NOPREVENT)) {
scsipi_prevent(periph, SPAMR_ALLOW,
XS_CTL_IGNORE_ILLEGAL_REQUEST |
XS_CTL_IGNORE_NOT_READY);
}
st_load(st, LD_UNLOAD, XS_CTL_IGNORE_NOT_READY);
st->blkno = st->fileno = (daddr_t) -1;
} else {
st->blkno = st->fileno = (daddr_t) 0;
}
st->flags &= ~(ST_MOUNTED | ST_NEW_MOUNT);
periph->periph_flags &= ~PERIPH_MEDIA_LOADED;
}
/*
* Given all we know about the device, media, mode, 'quirks' and
* initial operation, make a decision as to how we should be set
* to run (regarding blocking and EOD marks)
*/
int
st_decide_mode(struct st_softc *st, boolean first_read)
{
SC_DEBUG(st->sc_periph, SCSIPI_DB2, ("starting block mode decision\n"));
/*
* If the drive can only handle fixed-length blocks and only at
* one size, perhaps we should just do that.
*/
if (st->blkmin && (st->blkmin == st->blkmax)) {
st->flags |= ST_FIXEDBLOCKS;
st->blksize = st->blkmin;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("blkmin == blkmax of %d\n", st->blkmin));
goto done;
}
/*
* If the tape density mandates (or even suggests) use of fixed
* or variable-length blocks, comply.
*/
switch (st->density) {
case HALFINCH_800:
case HALFINCH_1600:
case HALFINCH_6250:
case DDS:
st->flags &= ~ST_FIXEDBLOCKS;
st->blksize = 0;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("density specified variable\n"));
goto done;
case QIC_11:
case QIC_24:
case QIC_120:
case QIC_150:
case QIC_525:
case QIC_1320:
case QIC_3095:
case QIC_3220:
st->flags |= ST_FIXEDBLOCKS;
if (st->media_blksize > 0)
st->blksize = st->media_blksize;
else
st->blksize = DEF_FIXED_BSIZE;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("density specified fixed\n"));
goto done;
}
/*
* If we're about to read the tape, perhaps we should choose
* fixed or variable-length blocks and block size according to
* what the drive found on the tape.
*/
if (first_read &&
(!(st->quirks & ST_Q_BLKSIZE) || (st->media_blksize == 0) ||
(st->media_blksize == DEF_FIXED_BSIZE) ||
(st->media_blksize == 1024))) {
if (st->media_blksize > 0)
st->flags |= ST_FIXEDBLOCKS;
else
st->flags &= ~ST_FIXEDBLOCKS;
st->blksize = st->media_blksize;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("Used media_blksize of %d\n", st->media_blksize));
goto done;
}
/*
* We're getting no hints from any direction. Choose variable-
* length blocks arbitrarily.
*/
st->flags &= ~ST_FIXEDBLOCKS;
st->blksize = 0;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("Give up and default to variable mode\n"));
done:
/*
* Decide whether or not to write two file marks to signify end-
* of-data. Make the decision as a function of density. If
* the decision is not to use a second file mark, the SCSI BLANK
* CHECK condition code will be recognized as end-of-data when
* first read.
* (I think this should be a by-product of fixed/variable..julian)
*/
switch (st->density) {
/* case 8 mm: What is the SCSI density code for 8 mm, anyway? */
case QIC_11:
case QIC_24:
case QIC_120:
case QIC_150:
case QIC_525:
case QIC_1320:
case QIC_3095:
case QIC_3220:
st->flags &= ~ST_2FM_AT_EOD;
break;
default:
st->flags |= ST_2FM_AT_EOD;
}
return 0;
}
/*
* Actually translate the requested transfer into
* one that the physical driver can understand.
* The transfer is described by a buf and will include
* only one physical transfer.
*/
static void
ststrategy(struct buf *bp)
{
struct st_softc *st = device_lookup_private(&st_cd, STUNIT(bp->b_dev));
struct scsipi_periph *periph = st->sc_periph;
struct scsipi_channel *chan = periph->periph_channel;
SC_DEBUG(periph, SCSIPI_DB1,
("ststrategy %d bytes @ blk %" PRId64 "\n", bp->b_bcount,
bp->b_blkno));
/* If it's a null transfer, return immediately */
if (bp->b_bcount == 0)
goto abort;
/* If offset is negative, error */
if (bp->b_blkno < 0) {
SC_DEBUG(periph, SCSIPI_DB3,
("EINVAL: ststrategy negative blockcount %" PRId64 "\n", bp->b_blkno));
bp->b_error = EINVAL;
goto abort;
}
/* Odd-sized requests on fixed drives are verboten */
if (st->flags & ST_FIXEDBLOCKS) {
if (bp->b_bcount % st->blksize) {
aprint_error_dev(st->sc_dev, "bad request, must be multiple of %d\n",
st->blksize);
bp->b_error = EIO;
goto abort;
}
}
/* as are out-of-range requests on variable drives. */
else if (bp->b_bcount < st->blkmin ||
(st->blkmax && bp->b_bcount > st->blkmax)) {
aprint_error_dev(st->sc_dev, "bad request, must be between %d and %d\n",
st->blkmin, st->blkmax);
bp->b_error = EIO;
goto abort;
}
mutex_enter(chan_mtx(chan));
/*
* Place it in the queue of activities for this tape
* at the end (a bit silly because we only have one user..
* (but it could fork()))
*/
bufq_put(st->buf_queue, bp);
/*
* Tell the device to get going on the transfer if it's
* not doing anything, otherwise just wait for completion
* (All a bit silly if we're only allowing 1 open but..)
*/
ststart(periph);
mutex_exit(chan_mtx(chan));
return;
abort:
/*
* Reset the residue because we didn't do anything,
* and send the buffer back as done.
*/
bp->b_resid = bp->b_bcount;
biodone(bp);
return;
}
/*
* ststart looks to see if there is a buf waiting for the device
* and that the device is not already busy. If the device is busy,
* the request is deferred and retried on the next attempt.
* If both are true, ststart creates a scsi command to perform
* the transfer required.
*
* The transfer request will call scsipi_done on completion,
* which will in turn call this routine again so that the next
* queued transfer is performed. The bufs are queued by the
* strategy routine (ststrategy)
*
* This routine is also called after other non-queued requests
* have been made of the scsi driver, to ensure that the queue
* continues to be drained.
* ststart() is called with channel lock held
*/
static int
ststart1(struct scsipi_periph *periph, struct buf *bp, int *errnop)
{
struct st_softc *st = device_private(periph->periph_dev);
struct scsipi_channel *chan = periph->periph_channel;
struct scsi_rw_tape cmd;
struct scsipi_xfer *xs;
int flags, error, complete = 1;
SC_DEBUG(periph, SCSIPI_DB2, ("ststart1 "));
mutex_enter(chan_mtx(chan));
if (periph->periph_active >= periph->periph_openings) {
error = EAGAIN;
goto out;
}
/* if a special awaits, let it proceed first */
if (periph->periph_flags & PERIPH_WAITING) {
periph->periph_flags &= ~PERIPH_WAITING;
cv_broadcast(periph_cv_periph(periph));
error = EAGAIN;
goto out;
}
/*
* If the device has been unmounted by the user
* then throw away all requests until done.
*/
if (__predict_false((st->flags & ST_MOUNTED) == 0 ||
(periph->periph_flags & PERIPH_MEDIA_LOADED) == 0)) {
error = EIO;
goto out;
}
/*
* only FIXEDBLOCK devices have pending I/O or space operations.
*/
if (st->flags & ST_FIXEDBLOCKS) {
/*
* If we are at a filemark but have not reported it yet
* then we should report it now
*/
if (st->flags & ST_AT_FILEMARK) {
if ((bp->b_flags & B_READ) == B_WRITE) {
/*
* Handling of ST_AT_FILEMARK in
* st_space will fill in the right file
* mark count.
* Back up over filemark
*/
if (st_space(st, 0, SP_FILEMARKS, 0)) {
error = EIO;
goto out;
}
} else {
error = 0;
st->flags &= ~ST_AT_FILEMARK;
goto out;
}
}
}
/*
* If we are at EOM but have not reported it
* yet then we should report it now.
*/
if (st->flags & (ST_EOM_PENDING|ST_EIO_PENDING)) {
error = 0;
if (st->flags & ST_EIO_PENDING)
error = EIO;
st->flags &= ~(ST_EOM_PENDING|ST_EIO_PENDING);
goto out;
}
/* Fill out the scsi command */
memset(&cmd, 0, sizeof(cmd));
flags = XS_CTL_NOSLEEP | XS_CTL_ASYNC;
if ((bp->b_flags & B_READ) == B_WRITE) {
cmd.opcode = WRITE;
st->flags &= ~ST_FM_WRITTEN;
flags |= XS_CTL_DATA_OUT;
} else {
cmd.opcode = READ;
flags |= XS_CTL_DATA_IN;
}
/*
* Handle "fixed-block-mode" tape drives by using the
* block count instead of the length.
*/
if (st->flags & ST_FIXEDBLOCKS) {
cmd.byte2 |= SRW_FIXED;
_lto3b(bp->b_bcount / st->blksize, cmd.len);
} else
_lto3b(bp->b_bcount, cmd.len);
/* Clear 'position updated' indicator */
st->flags &= ~ST_POSUPDATED;
/* go ask the adapter to do all this for us */
xs = scsipi_make_xs_locked(periph,
(struct scsipi_generic *)&cmd, sizeof(cmd),
(u_char *)bp->b_data, bp->b_bcount,
0, ST_IO_TIME, bp, flags);
if (__predict_false(xs == NULL)) {
/*
* out of memory. Keep this buffer in the queue, and
* retry later.
*/
callout_reset(&st->sc_callout, hz / 2, strestart,
periph);
error = EAGAIN;
goto out;
}
error = scsipi_execute_xs(xs);
/* with a scsipi_xfer preallocated, scsipi_command can't fail */
KASSERT(error == 0);
if (error == 0)
complete = 0;
out:
mutex_exit(chan_mtx(chan));
*errnop = error;
return complete;
}
static void
ststart(struct scsipi_periph *periph)
{
struct st_softc *st = device_private(periph->periph_dev);
struct scsipi_channel *chan = periph->periph_channel;
struct buf *bp;
int error, complete;
SC_DEBUG(periph, SCSIPI_DB2, ("ststart "));
mutex_exit(chan_mtx(chan));
mutex_enter(&st->sc_iolock);
while ((bp = bufq_get(st->buf_defer)) != NULL
|| (bp = bufq_get(st->buf_queue)) != NULL) {
iostat_busy(st->stats);
mutex_exit(&st->sc_iolock);
complete = ststart1(periph, bp, &error);
mutex_enter(&st->sc_iolock);
if (complete) {
iostat_unbusy(st->stats, 0,
((bp->b_flags & B_READ) == B_READ));
if (error == EAGAIN) {
bufq_put(st->buf_defer, bp);
break;
}
}
mutex_exit(&st->sc_iolock);
if (complete) {
bp->b_error = error;
bp->b_resid = bp->b_bcount;
biodone(bp);
}
mutex_enter(&st->sc_iolock);
}
mutex_exit(&st->sc_iolock);
mutex_enter(chan_mtx(chan));
}
static void
strestart(void *v)
{
struct scsipi_periph *periph = (struct scsipi_periph *)v;
struct scsipi_channel *chan = periph->periph_channel;
mutex_enter(chan_mtx(chan));
ststart((struct scsipi_periph *)v);
mutex_exit(chan_mtx(chan));
}
static void
stdone(struct scsipi_xfer *xs, int error)
{
struct st_softc *st = device_private(xs->xs_periph->periph_dev);
struct buf *bp = xs->bp;
if (bp) {
bp->b_error = error;
bp->b_resid = xs->resid;
/*
* Buggy device? An SDLT320 can report an info
* field of 0x3de8000 on a Media Error/Write Error
* for this CDB: 0x0a 00 00 80 00 00
*/
if (bp->b_resid > bp->b_bcount || bp->b_resid < 0)
bp->b_resid = bp->b_bcount;
mutex_enter(&st->sc_iolock);
if ((bp->b_flags & B_READ) == B_WRITE)
st->flags |= ST_WRITTEN;
else
st->flags &= ~ST_WRITTEN;
iostat_unbusy(st->stats, bp->b_bcount,
((bp->b_flags & B_READ) == B_READ));
if ((st->flags & ST_POSUPDATED) == 0) {
if (error) {
st->fileno = st->blkno = -1;
} else if (st->blkno != -1) {
if (st->flags & ST_FIXEDBLOCKS)
st->blkno +=
(bp->b_bcount / st->blksize);
else
st->blkno++;
}
}
mutex_exit(&st->sc_iolock);
rnd_add_uint32(&st->rnd_source, bp->b_blkno);
biodone(bp);
}
}
static int
stread(dev_t dev, struct uio *uio, int iomode)
{
struct st_softc *st = device_lookup_private(&st_cd, STUNIT(dev));
int r = physio(ststrategy, NULL, dev, B_READ,
st->sc_periph->periph_channel->chan_adapter->adapt_minphys, uio);
SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[stread: result=%d]\n", r));
return r;
}
static int
stwrite(dev_t dev, struct uio *uio, int iomode)
{
struct st_softc *st = device_lookup_private(&st_cd, STUNIT(dev));
int r = physio(ststrategy, NULL, dev, B_WRITE,
st->sc_periph->periph_channel->chan_adapter->adapt_minphys, uio);
SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[stwrite: result=%d]\n", r));
return r;
}
/*
* Perform special action on behalf of the user;
* knows about the internals of this device
*/
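/*
* Illustrative example (not from the original source): userland normally
* gets here through mt(1) or a direct ioctl such as
*
*	struct mtop op = { .mt_op = MTREW, .mt_count = 1 };
*	ioctl(fd, MTIOCTOP, &op);
*
* which lands in the MTIOCTOP/MTREW case below.
*/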
static int
stioctl(dev_t dev, u_long cmd, void *arg, int flag, struct lwp *l)
{
int error = 0;
int unit;
int number, nmarks, dsty;
int flags;
struct st_softc *st;
int hold_blksize;
uint8_t hold_density;
struct mtop *mt = (struct mtop *) arg;
/* Find the device that the user is talking about */
flags = 0; /* give error messages, act on errors etc. */
unit = STUNIT(dev);
dsty = STDSTY(dev);
st = device_lookup_private(&st_cd, unit);
hold_blksize = st->blksize;
hold_density = st->density;
switch ((u_int)cmd) {
case MTIOCGET: {
struct mtget *g = (struct mtget *) arg;
/*
* (to get the current state of READONLY)
*/
error = st->ops(st, ST_OPS_MODESENSE, XS_CTL_SILENT);
if (error) {
/*
* Ignore the error if in control mode;
* this is mandated by st(4).
*/
if (STMODE(dev) != CTRL_MODE)
break;
error = 0;
}
SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[ioctl: get status]\n"));
memset(g, 0, sizeof(struct mtget));
g->mt_type = MT_ISAR; /* Ultrix compat *//*? */
g->mt_blksiz = st->blksize;
g->mt_density = st->density;
g->mt_mblksiz[0] = st->modes[0].blksize;
g->mt_mblksiz[1] = st->modes[1].blksize;
g->mt_mblksiz[2] = st->modes[2].blksize;
g->mt_mblksiz[3] = st->modes[3].blksize;
g->mt_mdensity[0] = st->modes[0].density;
g->mt_mdensity[1] = st->modes[1].density;
g->mt_mdensity[2] = st->modes[2].density;
g->mt_mdensity[3] = st->modes[3].density;
g->mt_fileno = st->fileno;
g->mt_blkno = st->blkno;
if (st->flags & ST_READONLY)
g->mt_dsreg |= MT_DS_RDONLY;
if (st->flags & ST_MOUNTED)
g->mt_dsreg |= MT_DS_MOUNTED;
g->mt_resid = st->mt_resid;
g->mt_erreg = st->mt_erreg;
/*
* clear latched errors.
*/
st->mt_resid = 0;
st->mt_erreg = 0;
st->asc = 0;
st->ascq = 0;
break;
}
case MTIOCTOP: {
SC_DEBUG(st->sc_periph, SCSIPI_DB1,
("[ioctl: op=0x%x count=0x%x]\n", mt->mt_op,
mt->mt_count));
/* compat: in U*x it is a short */
number = mt->mt_count;
switch ((short) (mt->mt_op)) {
case MTWEOF: /* write an end-of-file record */
error = st_write_filemarks(st, number, flags);
break;
case MTBSF: /* backward space file */
number = -number;
/* FALLTHROUGH */
case MTFSF: /* forward space file */
error = st_check_eod(st, FALSE, &nmarks, flags);
if (!error)
error = st_space(st, number - nmarks,
SP_FILEMARKS, flags);
break;
case MTBSR: /* backward space record */
number = -number;
/* FALLTHROUGH */
case MTFSR: /* forward space record */
error = st_check_eod(st, true, &nmarks, flags);
if (!error)
error = st_space(st, number, SP_BLKS, flags);
break;
case MTREW: /* rewind */
error = st_rewind(st, 0, flags);
break;
case MTOFFL: /* rewind and put the drive offline */
st_unmount(st, EJECT);
break;
case MTNOP: /* no operation, sets status only */
break;
case MTRETEN: /* retension the tape */
error = st_load(st, LD_RETENSION, flags);
if (!error)
error = st_load(st, LD_LOAD, flags);
break;
case MTEOM: /* forward space to end of media */
error = st_check_eod(st, FALSE, &nmarks, flags);
if (!error)
error = st_space(st, 1, SP_EOM, flags);
break;
case MTCACHE: /* enable controller cache */
st->flags &= ~ST_DONTBUFFER;
goto try_new_value;
case MTNOCACHE: /* disable controller cache */
st->flags |= ST_DONTBUFFER;
goto try_new_value;
case MTERASE: /* erase volume */
error = st_erase(st, number, flags);
break;
case MTSETBSIZ: /* Set block size for device */
#ifdef NOTYET
if (!(st->flags & ST_NEW_MOUNT)) {
uprintf("re-mount tape before changing "
"blocksize");
error = EINVAL;
break;
}
#endif
if (number == 0)
st->flags &= ~ST_FIXEDBLOCKS;
else {
if ((st->blkmin || st->blkmax) &&
(number < st->blkmin ||
number > st->blkmax)) {
error = EINVAL;
break;
}
st->flags |= ST_FIXEDBLOCKS;
}
st->blksize = number;
st->flags |= ST_BLOCK_SET; /*XXX */
goto try_new_value;
case MTSETDNSTY: /* Set density for device and mode */
/*
* Any number >= 0 and <= 0xff is legal. Numbers
* above 0x80 are 'vendor unique'.
*/
if (number < 0 || number > 255) {
error = EINVAL;
break;
} else
st->density = number;
goto try_new_value;
case MTCMPRESS:
error = st->ops(st, (number == 0) ?
ST_OPS_CMPRSS_OFF : ST_OPS_CMPRSS_ON,
XS_CTL_SILENT);
break;
case MTEWARN:
if (number)
st->flags |= ST_EARLYWARN;
else
st->flags &= ~ST_EARLYWARN;
break;
default:
error = EINVAL;
}
break;
}
case MTIOCIEOT:
case MTIOCEEOT:
break;
case MTIOCRDSPOS:
error = st_rdpos(st, 0, (uint32_t *)arg);
break;
case MTIOCRDHPOS:
error = st_rdpos(st, 1, (uint32_t *)arg);
break;
case MTIOCSLOCATE:
error = st_setpos(st, 0, (uint32_t *)arg);
break;
case MTIOCHLOCATE:
error = st_setpos(st, 1, (uint32_t *)arg);
break;
default:
error = scsipi_do_ioctl(st->sc_periph, dev, cmd, arg, flag, l);
break;
}
return error;
try_new_value:
/*
* Check that the mode being asked for is agreeable to the
* drive. If not, put it back the way it was.
*
* If in control mode, we can make (persistent) mode changes
* even if no medium is loaded (see st(4)).
*/
if ((STMODE(dev) != CTRL_MODE || (st->flags & ST_MOUNTED) != 0) &&
(error = st->ops(st, ST_OPS_MODESELECT, 0)) != 0) {
/* put it back as it was */
aprint_error_dev(st->sc_dev, "cannot set selected mode\n");
st->density = hold_density;
st->blksize = hold_blksize;
if (st->blksize)
st->flags |= ST_FIXEDBLOCKS;
else
st->flags &= ~ST_FIXEDBLOCKS;
return error;
}
/*
* As the drive liked it, if we are setting a new default,
* set it into the structures as such.
*
* The means for deciding this are not finalised yet, but
* if the device was opened in Control Mode, the values
* are persistent now across mounts.
*/
if (STMODE(dev) == CTRL_MODE) {
switch ((short) (mt->mt_op)) {
case MTSETBSIZ:
st->modes[dsty].blksize = st->blksize;
st->modeflags[dsty] |= BLKSIZE_SET_BY_USER;
break;
case MTSETDNSTY:
st->modes[dsty].density = st->density;
st->modeflags[dsty] |= DENSITY_SET_BY_USER;
break;
}
}
return 0;
}
/* Do a synchronous read. */
static int
st_read(struct st_softc *st, char *bf, int size, int flags)
{
struct scsi_rw_tape cmd;
/* If it's a null transfer, return immediately */
if (size == 0)
return 0;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = READ;
if (st->flags & ST_FIXEDBLOCKS) {
cmd.byte2 |= SRW_FIXED;
_lto3b(size / (st->blksize ? st->blksize : DEF_FIXED_BSIZE),
cmd.len);
} else
_lto3b(size, cmd.len);
return scsipi_command(st->sc_periph,
(void *)&cmd, sizeof(cmd), (void *)bf, size, 0, ST_IO_TIME, NULL,
flags | XS_CTL_DATA_IN);
}
/* issue an erase command */
static int
st_erase(struct st_softc *st, int full, int flags)
{
int tmo;
struct scsi_erase cmd;
/*
* Full erase means set LONG bit in erase command, which asks
* the drive to erase the entire unit. Without this bit, we're
* asking the drive to write an erase gap.
*/
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = ERASE;
if (full) {
cmd.byte2 = SE_LONG;
tmo = ST_SPC_TIME;
} else
tmo = ST_IO_TIME;
/*
* XXX We always do this asynchronously, for now, unless the device
* has the ST_Q_ERASE_NOIMM quirk. How long should we wait if we
* want to (eventually) do it synchronously?
*/
if ((st->quirks & ST_Q_ERASE_NOIMM) == 0)
cmd.byte2 |= SE_IMMED;
return scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
ST_RETRIES, tmo, NULL, flags);
}
/* skip N blocks/filemarks/seq filemarks/eom */
static int
st_space(struct st_softc *st, int number, u_int what, int flags)
{
struct scsi_space cmd;
int error;
switch (what) {
case SP_BLKS:
if (st->flags & ST_PER_ACTION) {
if (number > 0) {
st->flags &= ~ST_PER_ACTION;
return EIO;
} else if (number < 0) {
if (st->flags & ST_AT_FILEMARK) {
/*
* Handling of ST_AT_FILEMARK
* in st_space will fill in the
* right file mark count.
*/
error = st_space(st, 0, SP_FILEMARKS,
flags);
if (error)
return error;
}
if (st->flags & ST_BLANK_READ) {
st->flags &= ~ST_BLANK_READ;
return EIO;
}
st->flags &= ~(ST_EIO_PENDING|ST_EOM_PENDING);
}
}
break;
case SP_FILEMARKS:
if (st->flags & ST_EIO_PENDING) {
if (number > 0) {
/* pretend we just discovered the error */
st->flags &= ~ST_EIO_PENDING;
return EIO;
} else if (number < 0) {
/* back away from the error */
st->flags &= ~ST_EIO_PENDING;
}
}
if (st->flags & ST_AT_FILEMARK) {
st->flags &= ~ST_AT_FILEMARK;
number--;
}
if ((st->flags & ST_BLANK_READ) && (number < 0)) {
/* back away from unwritten tape */
st->flags &= ~ST_BLANK_READ;
number++; /* XXX dubious */
}
break;
case SP_EOM:
if (st->flags & ST_EOM_PENDING) {
/* we're already there */
st->flags &= ~ST_EOM_PENDING;
return 0;
}
if (st->flags & ST_EIO_PENDING) {
/* pretend we just discovered the error */
st->flags &= ~ST_EIO_PENDING;
return EIO;
}
if (st->flags & ST_AT_FILEMARK)
st->flags &= ~ST_AT_FILEMARK;
break;
}
if (number == 0)
return 0;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SPACE;
cmd.byte2 = what;
_lto3b(number, cmd.number);
st->flags &= ~ST_POSUPDATED;
st->last_ctl_resid = 0;
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
0, ST_SPC_TIME, NULL, flags);
if (error == 0 && (st->flags & ST_POSUPDATED) == 0) {
number = number - st->last_ctl_resid;
if (what == SP_BLKS) {
if (st->blkno != -1)
st->blkno += number;
} else if (what == SP_FILEMARKS) {
if (st->fileno != -1) {
st->fileno += number;
if (number > 0)
st->blkno = 0;
else if (number < 0)
st->blkno = -1;
}
} else if (what == SP_EOM) {
st_updatefilepos(st);
}
}
return error;
}
/*
* write N filemarks
*/
static int
st_write_filemarks(struct st_softc *st, int number, int flags)
{
int error;
struct scsi_write_filemarks cmd;
/*
* It's hard to write a negative number of file marks.
* Don't try.
*/
if (number < 0) {
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("EINVAL: st_write_filemarks not writing %d file marks\n", number));
return EINVAL;
}
switch (number) {
case 0: /* really a command to sync the drive's buffers */
break;
case 1:
if (st->flags & ST_FM_WRITTEN) /* already have one down */
st->flags &= ~ST_WRITTEN;
else
st->flags |= ST_FM_WRITTEN;
st->flags &= ~ST_PER_ACTION;
break;
default:
st->flags &= ~(ST_PER_ACTION | ST_WRITTEN);
}
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = WRITE_FILEMARKS;
if (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(st->sc_periph)) ==
SCSIPI_BUSTYPE_ATAPI)
cmd.byte2 = SR_IMMED;
/*
* The ATAPI Onstream DI-30 doesn't support writing filemarks, but
* WRITE_FILEMARKS is still used to flush the buffer
*/
if ((st->quirks & ST_Q_NOFILEMARKS) == 0)
_lto3b(number, cmd.number);
/* XXX WE NEED TO BE ABLE TO GET A RESIDUAL XXX */
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
0, ST_IO_TIME * 4, NULL, flags);
if (error == 0 && st->fileno != -1)
st->fileno += number;
return error;
}
/*
* Make sure the right number of file marks is on tape if the
* tape has been written. If the position argument is true,
* leave the tape positioned where it was originally.
*
* nmarks returns the number of marks to skip (or, if position is
* true, which were skipped) to get back to the original position.
*/
static int
st_check_eod(struct st_softc *st, boolean position, int *nmarks, int flags)
{
int error;
switch (st->flags & (ST_WRITTEN | ST_FM_WRITTEN | ST_2FM_AT_EOD)) {
default:
*nmarks = 0;
return 0;
case ST_WRITTEN:
case ST_WRITTEN | ST_FM_WRITTEN | ST_2FM_AT_EOD:
*nmarks = 1;
break;
case ST_WRITTEN | ST_2FM_AT_EOD:
*nmarks = 2;
}
error = st_write_filemarks(st, *nmarks, flags);
if (position && !error)
error = st_space(st, -*nmarks, SP_FILEMARKS, flags);
return error;
}
/* load/unload/retension */
static int
st_load(struct st_softc *st, u_int type, int flags)
{
int error;
struct scsi_load cmd;
if (type != LD_LOAD) {
int nmarks;
error = st_check_eod(st, FALSE, &nmarks, flags);
if (error) {
aprint_error_dev(st->sc_dev,
"failed to write closing filemarks at "
"unload, errno=%d\n", error);
return error;
}
}
if (st->quirks & ST_Q_IGNORE_LOADS) {
if (type == LD_LOAD)
/*
* If we ignore loads, at least we should try a rewind.
*/
return st_rewind(st, 0, flags);
/* otherwise, we should do what's asked of us */
}
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = LOAD;
if (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(st->sc_periph)) ==
SCSIPI_BUSTYPE_ATAPI)
cmd.byte2 = SR_IMMED;
cmd.how = type;
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
ST_RETRIES, ST_SPC_TIME, NULL, flags);
if (error) {
aprint_error_dev(st->sc_dev, "error %d in st_load (op %d)\n",
error, type);
}
return error;
}
/* Rewind the device */
static int
st_rewind(struct st_softc *st, u_int immediate, int flags)
{
struct scsi_rewind cmd;
int error;
int nmarks;
int timeout;
error = st_check_eod(st, FALSE, &nmarks, flags);
if (error) {
aprint_error_dev(st->sc_dev,
"failed to write closing filemarks at "
"rewind, errno=%d\n", error);
return error;
}
st->flags &= ~ST_PER_ACTION;
/* If requestor asked for immediate response, set a short timeout */
timeout = immediate ? ST_CTL_TIME : ST_SPC_TIME;
/* ATAPI tapes always need immediate to be set */
if (scsipi_periph_bustype(st->sc_periph) == SCSIPI_BUSTYPE_ATAPI)
immediate = SR_IMMED;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = REWIND;
cmd.byte2 = immediate;
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
ST_RETRIES, timeout, NULL, flags);
if (error) {
aprint_error_dev(st->sc_dev, "error %d trying to rewind\n",
error);
/* lost position */
st->fileno = st->blkno = -1;
} else
st->fileno = st->blkno = 0;
return error;
}
static void
st_updatefilepos(struct st_softc *st)
{
int error;
uint8_t posdata[32];
struct scsi_tape_read_position cmd;
memset(&cmd, 0, sizeof(cmd));
memset(&posdata, 0, sizeof(posdata));
cmd.opcode = READ_POSITION;
cmd.byte1 = 6; /* service action: LONG FORM */
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd),
(void *)&posdata, sizeof(posdata), ST_RETRIES, ST_CTL_TIME, NULL,
XS_CTL_SILENT | XS_CTL_DATA_IN);
if (error == 0) {
#ifdef SCSIPI_DEBUG
if (st->sc_periph->periph_dbflags & SCSIPI_DB3) {
int hard;
printf("posdata: ");
for (hard = 0; hard < sizeof(posdata); hard++)
printf("%02x ", posdata[hard] & 0xff);
printf("\n");
}
#endif
if (posdata[0] & 0xC) { /* Block|Mark Position Unknown */
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("st_updatefilepos block/mark position unknown (0x%02x)\n",
posdata[0]));
} else {
st->fileno = _8btol(&posdata[16]);
st->blkno = 0;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("st_updatefilepos file position %"PRId64"\n",
st->fileno));
return;
}
} else {
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("st_updatefilepos READ POSITION(LONG_FORM) failed (error=%d)\n",
error));
}
st->fileno = -1;
st->blkno = -1;
}
static int
st_rdpos(struct st_softc *st, int hard, uint32_t *blkptr)
{
int error;
uint8_t posdata[20];
struct scsi_tape_read_position cmd;
/*
* We try and flush any buffered writes here if we were writing
* and we're trying to get hardware block position. It eats
* up performance substantially, but I'm wary of drive firmware.
*
* I think that *logical* block position is probably okay,
* but hardware block position might have to wait for data
* to hit media to be valid. Caveat Emptor.
*/
if (hard && (st->flags & ST_WRITTEN)) {
/* First flush any pending writes... */
error = st_write_filemarks(st, 0, XS_CTL_SILENT);
/*
* The latter case is for 'write protected' tapes
* which are too stupid to recognize a zero count
* for writing filemarks as a no-op.
*/
if (error != 0 && error != EACCES && error != EROFS)
return error;
}
memset(&cmd, 0, sizeof(cmd));
memset(&posdata, 0, sizeof(posdata));
cmd.opcode = READ_POSITION;
if (hard)
cmd.byte1 = 1;
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd),
(void *)&posdata, sizeof(posdata), ST_RETRIES, ST_CTL_TIME, NULL,
XS_CTL_SILENT | XS_CTL_DATA_IN);
if (error == 0) {
#if 0
printf("posdata:");
for (hard = 0; hard < sizeof(posdata); hard++)
printf("%02x ", posdata[hard] & 0xff);
printf("\n");
#endif
if (posdata[0] & 0x4) { /* Block Position Unknown */
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("EINVAL: strdpos block position unknown\n"));
error = EINVAL;
}
else
*blkptr = _4btol(&posdata[4]);
}
return error;
}
static int
st_setpos(struct st_softc *st, int hard, uint32_t *blkptr)
{
int error;
struct scsi_tape_locate cmd;
/*
* We used to try and flush any buffered writes here.
* Now we push this onto user applications to either
* flush the pending writes themselves (via a zero count
* WRITE FILEMARKS command) or they can trust their tape
* drive to do this correctly for them.
*
* There are very ugly performance limitations otherwise.
*/
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = LOCATE;
if (hard)
cmd.byte2 = 1 << 2;
_lto4b(*blkptr, cmd.blkaddr);
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
ST_RETRIES, ST_SPC_TIME, NULL, 0);
/*
* Note file && block number position now unknown (if
* these things ever start being maintained in this driver)
*/
st->fileno = st->blkno = -1;
return error;
}
/*
* Look at the returned sense, act on the error and determine
* the unix error number to pass back: 0 (report no error),
* ERESTART (retry the operation), EJUSTRETURN (continue with
* generic sense processing), or an errno value.
*/
static int
st_interpret_sense(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
struct scsi_sense_data *sense = &xs->sense.scsi_sense;
struct buf *bp = xs->bp;
struct st_softc *st = device_private(periph->periph_dev);
int retval = EJUSTRETURN;
int doprint = ((xs->xs_control & XS_CTL_SILENT) == 0);
uint8_t key;
int32_t info;
/*
* If it isn't an extended or extended/deferred error, let
* the generic code handle it.
*/
if (SSD_RCODE(sense->response_code) != SSD_RCODE_CURRENT &&
SSD_RCODE(sense->response_code) != SSD_RCODE_DEFERRED)
return retval;
if (sense->response_code & SSD_RCODE_VALID)
info = _4btol(sense->info);
else
info = (st->flags & ST_FIXEDBLOCKS) ?
xs->datalen / st->blksize : xs->datalen;
key = SSD_SENSE_KEY(sense->flags);
st->mt_erreg = key;
st->asc = sense->asc;
st->ascq = sense->ascq;
st->mt_resid = (short) info;
if (key == SKEY_NOT_READY && st->asc == 0x4 && st->ascq == 0x1) {
/* Not Ready, Logical Unit Is in Process Of Becoming Ready */
if (!callout_pending(&periph->periph_callout))
scsipi_periph_freeze(periph, 1);
callout_reset(&periph->periph_callout,
hz, scsipi_periph_timed_thaw, periph);
return ERESTART;
}
/* If the device is not open yet, let generic handle */
if ((periph->periph_flags & PERIPH_OPEN) == 0)
return retval;
xs->resid = info;
if (st->flags & ST_FIXEDBLOCKS) {
if (bp) {
xs->resid *= st->blksize;
st->last_io_resid = xs->resid;
} else
st->last_ctl_resid = xs->resid;
if (key == SKEY_VOLUME_OVERFLOW) {
st->flags |= ST_EIO_PENDING;
if (bp)
bp->b_resid = xs->resid;
} else if (sense->flags & SSD_EOM) {
if ((st->flags & ST_EARLYWARN) == 0)
st->flags |= ST_EIO_PENDING;
st->flags |= ST_EOM_PENDING;
if (bp) {
#if 0
bp->b_resid = xs->resid;
#else
/*
* Grotesque as it seems, the few times
* I've actually seen a non-zero resid,
* the tape drive actually lied and had
* written all the data!
*/
bp->b_resid = 0;
#endif
}
}
if (sense->flags & SSD_FILEMARK) {
st->flags |= ST_AT_FILEMARK;
if (bp)
bp->b_resid = xs->resid;
if (st->fileno != (daddr_t) -1) {
st->fileno++;
st->blkno = 0;
st->flags |= ST_POSUPDATED;
}
}
if (sense->flags & SSD_ILI) {
st->flags |= ST_EIO_PENDING;
if (bp)
bp->b_resid = xs->resid;
if (sense->response_code & SSD_RCODE_VALID &&
(xs->xs_control & XS_CTL_SILENT) == 0)
aprint_error_dev(st->sc_dev,
"block wrong size, %d blocks residual\n",
info);
/*
* This quirk code helps the drive read
* the first tape block, regardless of
* format. That is required for these
* drives to return proper MODE SENSE
* information.
*/
if ((st->quirks & ST_Q_SENSE_HELP) &&
(periph->periph_flags & PERIPH_MEDIA_LOADED) == 0)
st->blksize -= 512;
else if ((st->flags & ST_POSUPDATED) == 0) {
if (st->blkno != (daddr_t) -1) {
st->blkno +=
(xs->datalen / st->blksize);
st->flags |= ST_POSUPDATED;
}
}
}
/*
* If data wanted and no data was transferred, do it immediately
*/
if (xs->datalen && xs->resid >= xs->datalen) {
if (st->flags & ST_EIO_PENDING)
return EIO;
if (st->flags & ST_AT_FILEMARK) {
if (bp)
bp->b_resid = xs->resid;
return 0;
}
}
} else { /* must be variable mode */
if (bp)
st->last_io_resid = xs->resid;
else
st->last_ctl_resid = xs->resid;
if (sense->flags & SSD_EOM) {
/*
* The current semantics of this
* driver requires EOM detection
* to return EIO unless early
* warning detection is enabled
* for variable mode (this is always
* on for fixed block mode).
*/
if (st->flags & ST_EARLYWARN) {
st->flags |= ST_EOM_PENDING;
retval = 0;
} else {
retval = EIO;
/*
* If we return an error we can't claim to
* have transferred all data.
*/
if (xs->resid == 0)
xs->resid = xs->datalen;
}
/*
* If it's an unadorned EOM detection,
* suppress printing an error.
*/
if (key == SKEY_NO_SENSE) {
doprint = 0;
}
} else if (sense->flags & SSD_FILEMARK) {
retval = 0;
if (st->fileno != (daddr_t) -1) {
st->fileno++;
st->blkno = 0;
st->flags |= ST_POSUPDATED;
}
} else if (sense->flags & SSD_ILI) {
if (info < 0) {
/*
* The tape record was bigger than the read
* we issued.
*/
if ((xs->xs_control & XS_CTL_SILENT) == 0) {
aprint_error_dev(st->sc_dev,
"%d-byte tape record too big"
" for %d-byte user buffer\n",
xs->datalen - info, xs->datalen);
}
retval = EIO;
} else {
retval = 0;
if (st->blkno != (daddr_t) -1) {
st->blkno++;
st->flags |= ST_POSUPDATED;
}
}
}
if (bp)
bp->b_resid = xs->resid;
}
#ifndef SCSIPI_DEBUG
if (retval == 0 && key == SKEY_NO_SENSE)
doprint = 0;
#endif
if (key == SKEY_BLANK_CHECK) {
/*
* This quirk code helps the drive read the
* first tape block, regardless of format. That
* is required for these drives to return proper
* MODE SENSE information.
*/
if ((st->quirks & ST_Q_SENSE_HELP) &&
(periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) {
/* still starting */
st->blksize -= 512;
} else if (!(st->flags & (ST_2FM_AT_EOD | ST_BLANK_READ))) {
st->flags |= ST_BLANK_READ;
xs->resid = xs->datalen;
if (bp) {
bp->b_resid = xs->resid;
/* return an EOF */
}
retval = 0;
/* lost position */
st->fileno = st->blkno = -1;
}
}
/*
* If generic sense processing will continue, we should not
* print sense info here.
*/
if (retval == EJUSTRETURN)
doprint = 0;
if (doprint) {
/* Print verbose sense info if possible */
if (scsipi_print_sense(xs, 0) != 0)
return retval;
/* Print less-verbose sense info */
scsipi_printaddr(periph);
printf("Sense Key 0x%02x", key);
if ((sense->response_code & SSD_RCODE_VALID) != 0) {
switch (key) {
case SKEY_NOT_READY:
case SKEY_ILLEGAL_REQUEST:
case SKEY_UNIT_ATTENTION:
case SKEY_DATA_PROTECT:
break;
case SKEY_VOLUME_OVERFLOW:
case SKEY_BLANK_CHECK:
printf(", requested size: %d (decimal)", info);
break;
case SKEY_ABORTED_COMMAND:
if (xs->xs_retries)
printf(", retrying");
printf(", cmd 0x%x, info 0x%x",
xs->cmd->opcode, info);
break;
default:
printf(", info = %d (decimal)", info);
}
}
if (sense->extra_len != 0) {
int n;
printf(", data =");
for (n = 0; n < sense->extra_len; n++)
printf(" %02x", sense->csi[n]);
}
printf("\n");
}
return retval;
}
/*
* The quirk here is that the drive returns some value to st_mode_sense
* incorrectly until the tape has actually passed by the head.
*
* The method is to set the drive to large fixed-block state (user-specified
* density and 1024-byte blocks), then read and rewind to get it to sense the
* tape. If that doesn't work, try 512-byte fixed blocks. If that doesn't
* work, as a last resort, try variable-length blocks. The result will be
* the ability to do an accurate st_mode_sense.
*
* We know we can do a rewind because we just did a load, which implies rewind.
* Rewind seems preferable to space backward if we have a virgin tape.
*
* The rest of the code for this quirk is in ILI processing and BLANK CHECK
* error processing, both part of st_interpret_sense.
*/
static int
st_touch_tape(struct st_softc *st)
{
char *bf;
int readsize;
int error;
bf = malloc(1024, M_TEMP, M_WAITOK);
if ((error = st->ops(st, ST_OPS_MODESENSE, 0)) != 0)
goto bad;
/*
* If the block size is already known from the
* sense data, use it. Else start probing at 1024.
*/
if (st->media_blksize > 0)
st->blksize = st->media_blksize;
else
st->blksize = 1024;
do {
switch (st->blksize) {
case 512:
case 1024:
readsize = st->blksize;
st->flags |= ST_FIXEDBLOCKS;
break;
default:
readsize = 1;
st->flags &= ~ST_FIXEDBLOCKS;
}
if ((error = st->ops(st, ST_OPS_MODESELECT, XS_CTL_SILENT))
!= 0) {
/*
* The device did not agree with the proposed
* block size. If we exhausted our options,
* return failure, else try another.
*/
if (readsize == 1)
goto bad;
st->blksize -= 512;
continue;
}
st_read(st, bf, readsize, XS_CTL_SILENT); /* XXX */
if ((error = st_rewind(st, 0, 0)) != 0) {
bad:
free(bf, M_TEMP);
return error;
}
} while (readsize != 1 && readsize > st->blksize);
free(bf, M_TEMP);
return 0;
}
static int
stdump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
/* Not implemented. */
return ENXIO;
}
/*
* Send a filled out parameter structure to the drive to
* set it into the desired modes, etc.
*/
int
st_mode_select(struct st_softc *st, int flags)
{
u_int select_len;
struct select {
struct scsi_mode_parameter_header_6 header;
struct scsi_general_block_descriptor blk_desc;
u_char sense_data[MAX_PAGE_0_SIZE];
} select;
struct scsipi_periph *periph = st->sc_periph;
select_len = sizeof(select.header) + sizeof(select.blk_desc) +
st->page_0_size;
/*
* This quirk deals with drives that have only one valid mode
* and think this gives them license to reject all mode selects,
* even if the selected mode is the one that is supported.
*/
if (st->quirks & ST_Q_UNIMODAL) {
SC_DEBUG(periph, SCSIPI_DB3,
("not setting density 0x%x blksize 0x%x\n",
st->density, st->blksize));
return 0;
}
/* Set up for a mode select */
memset(&select, 0, sizeof(select));
select.header.blk_desc_len = sizeof(struct
scsi_general_block_descriptor);
select.header.dev_spec &= ~SMH_DSP_BUFF_MODE;
select.blk_desc.density = st->density;
if (st->flags & ST_DONTBUFFER)
select.header.dev_spec |= SMH_DSP_BUFF_MODE_OFF;
else
select.header.dev_spec |= SMH_DSP_BUFF_MODE_ON;
if (st->flags & ST_FIXEDBLOCKS)
_lto3b(st->blksize, select.blk_desc.blklen);
if (st->page_0_size)
memcpy(select.sense_data, st->sense_data, st->page_0_size);
/* do the command */
return scsipi_mode_select(periph, 0, &select.header, select_len,
flags, ST_RETRIES, ST_CTL_TIME);
}
/* $NetBSD: exec_subr.c,v 1.88 2023/11/21 14:35:36 riastradh Exp $ */
/*
* Copyright (c) 1993, 1994, 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_subr.c,v 1.88 2023/11/21 14:35:36 riastradh Exp $");
#include "opt_pax.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/filedesc.h>
#include <sys/exec.h>
#include <sys/mman.h>
#include <sys/resourcevar.h>
#include <sys/device.h>
#include <sys/pax.h>
#include <uvm/uvm_extern.h>
#define VMCMD_EVCNT_DECL(name) \
static struct evcnt vmcmd_ev_##name = \
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "vmcmd", #name); \
EVCNT_ATTACH_STATIC(vmcmd_ev_##name)
#define VMCMD_EVCNT_INCR(name) \
vmcmd_ev_##name.ev_count++
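/*
 * The event counters declared below are attached statically and can be
 * inspected at run time (e.g. with vmstat -e).
 */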
VMCMD_EVCNT_DECL(calls);
VMCMD_EVCNT_DECL(extends);
VMCMD_EVCNT_DECL(kills);
#ifdef DEBUG_STACK
#define DPRINTF(a) uprintf a
#else
#define DPRINTF(a)
#endif
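/*
 * Sizes, in bytes, of the inaccessible guard regions placed beyond the
 * main process stack and beyond per-thread stacks, respectively.
 */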
unsigned int user_stack_guard_size = 1024 * 1024;
unsigned int user_thread_stack_guard_size = 64 * 1024;
/*
* new_vmcmd():
* create a new vmcmd structure and fill in its fields based
* on function call arguments. make sure objects ref'd by
* the vmcmd are 'held'.
*/
void
new_vmcmd(struct exec_vmcmd_set *evsp,
int (*proc)(struct lwp * l, struct exec_vmcmd *),
vsize_t len, vaddr_t addr, struct vnode *vp, u_long offset,
u_int prot, int flags)
{
struct exec_vmcmd *vcp;
VMCMD_EVCNT_INCR(calls);
KASSERT(proc != vmcmd_map_pagedvn || (vp->v_iflag & VI_TEXT));
KASSERT(vp == NULL || vrefcnt(vp) > 0);
if (evsp->evs_used >= evsp->evs_cnt)
vmcmdset_extend(evsp);
vcp = &evsp->evs_cmds[evsp->evs_used++];
vcp->ev_proc = proc;
vcp->ev_len = len;
vcp->ev_addr = addr;
if ((vcp->ev_vp = vp) != NULL)
vref(vp);
vcp->ev_offset = offset;
vcp->ev_prot = prot;
vcp->ev_flags = flags;
}
void
vmcmdset_extend(struct exec_vmcmd_set *evsp)
{
struct exec_vmcmd *nvcp;
u_int ocnt;
#ifdef DIAGNOSTIC
if (evsp->evs_used < evsp->evs_cnt)
panic("vmcmdset_extend: not necessary");
#endif
/* figure out number of entries in new set */
if ((ocnt = evsp->evs_cnt) != 0) {
evsp->evs_cnt += ocnt;
VMCMD_EVCNT_INCR(extends);
} else
evsp->evs_cnt = EXEC_DEFAULT_VMCMD_SETSIZE;
/* allocate it */
nvcp = kmem_alloc(evsp->evs_cnt * sizeof(struct exec_vmcmd), KM_SLEEP);
/* free the old struct, if there was one, and record the new one */
if (ocnt) {
memcpy(nvcp, evsp->evs_cmds,
(ocnt * sizeof(struct exec_vmcmd)));
kmem_free(evsp->evs_cmds, ocnt * sizeof(struct exec_vmcmd));
}
evsp->evs_cmds = nvcp;
}
void
kill_vmcmds(struct exec_vmcmd_set *evsp)
{
struct exec_vmcmd *vcp;
u_int i;
VMCMD_EVCNT_INCR(kills);
if (evsp->evs_cnt == 0)
return;
for (i = 0; i < evsp->evs_used; i++) {
vcp = &evsp->evs_cmds[i];
if (vcp->ev_vp != NULL)
vrele(vcp->ev_vp);
}
kmem_free(evsp->evs_cmds, evsp->evs_cnt * sizeof(struct exec_vmcmd));
evsp->evs_used = evsp->evs_cnt = 0;
}
/*
* vmcmd_map_pagedvn():
* handle vmcmd which specifies that a vnode should be mmap'd.
* appropriate for handling demand-paged text and data segments.
*/
static int
vmcmd_get_prot(struct lwp *l, const struct exec_vmcmd *cmd, vm_prot_t *prot,
vm_prot_t *maxprot)
{
vm_prot_t extraprot = PROT_MPROTECT_EXTRACT(cmd->ev_prot);
*prot = cmd->ev_prot & UVM_PROT_ALL;
*maxprot = PAX_MPROTECT_MAXPROTECT(l, *prot, extraprot, UVM_PROT_ALL);
if ((*prot & *maxprot) != *prot)
return EACCES;
return PAX_MPROTECT_VALIDATE(l, *prot);
}
int
vmcmd_map_pagedvn(struct lwp *l, struct exec_vmcmd *cmd)
{
struct uvm_object *uobj;
struct vnode *vp = cmd->ev_vp;
struct proc *p = l->l_proc;
int error;
vm_prot_t prot, maxprot;
KASSERT(vp->v_iflag & VI_TEXT);
/*
* map the vnode in using uvm_map.
*/
if (cmd->ev_len == 0)
return 0;
if (cmd->ev_offset & PAGE_MASK)
return EINVAL;
if (cmd->ev_addr & PAGE_MASK)
return EINVAL;
if (cmd->ev_len & PAGE_MASK)
return EINVAL;
if ((error = vmcmd_get_prot(l, cmd, &prot, &maxprot)) != 0)
return error;
/*
* check the file system's opinion about mmapping the file
*/
error = VOP_MMAP(vp, prot, l->l_cred);
if (error)
return error;
if ((vp->v_vflag & VV_MAPPED) == 0) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_MAPPED;
VOP_UNLOCK(vp);
}
/*
* do the map, reference the object for this map entry
*/
uobj = &vp->v_uobj;
vref(vp);
error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr, cmd->ev_len,
uobj, cmd->ev_offset, 0,
UVM_MAPFLAG(prot, maxprot, UVM_INH_COPY,
UVM_ADV_NORMAL, UVM_FLAG_COPYONW|UVM_FLAG_FIXED));
if (error) {
uobj->pgops->pgo_detach(uobj);
}
return error;
}
/*
* vmcmd_map_readvn():
* handle vmcmd which specifies that a vnode should be read from.
* appropriate for non-demand-paged text/data segments, i.e. impure
* objects (a la OMAGIC and NMAGIC).
*/
int
vmcmd_map_readvn(struct lwp *l, struct exec_vmcmd *cmd)
{
struct proc *p = l->l_proc;
int error;
long diff;
if (cmd->ev_len == 0)
return 0;
diff = cmd->ev_addr - trunc_page(cmd->ev_addr);
cmd->ev_addr -= diff; /* required by uvm_map */
cmd->ev_offset -= diff;
cmd->ev_len += diff;
error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr,
round_page(cmd->ev_len), NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_COPY,
UVM_ADV_NORMAL,
UVM_FLAG_FIXED|UVM_FLAG_OVERLAY|UVM_FLAG_COPYONW));
if (error)
return error;
return vmcmd_readvn(l, cmd);
}
int
vmcmd_readvn(struct lwp *l, struct exec_vmcmd *cmd)
{
struct proc *p = l->l_proc;
int error;
vm_prot_t prot, maxprot;
error = vn_rdwr(UIO_READ, cmd->ev_vp, (void *)cmd->ev_addr,
cmd->ev_len, cmd->ev_offset, UIO_USERSPACE, IO_UNIT,
l->l_cred, NULL, l);
if (error)
return error;
if ((error = vmcmd_get_prot(l, cmd, &prot, &maxprot)) != 0)
return error;
#ifdef PMAP_NEED_PROCWR
/*
* we had to write the process, make sure the pages are synched
* with the instruction cache.
*/
if (prot & VM_PROT_EXECUTE)
pmap_procwr(p, cmd->ev_addr, cmd->ev_len);
#endif
/*
* we had to map in the area at PROT_ALL so that vn_rdwr()
* could write to it. however, the caller seems to want
* it mapped read-only, so now we are going to have to call
* uvm_map_protect() to fix up the protection. ICK.
*/
if (maxprot != VM_PROT_ALL) {
error = uvm_map_protect(&p->p_vmspace->vm_map,
trunc_page(cmd->ev_addr),
round_page(cmd->ev_addr + cmd->ev_len),
maxprot, true);
if (error)
return error;
}
if (prot != maxprot) {
error = uvm_map_protect(&p->p_vmspace->vm_map,
trunc_page(cmd->ev_addr),
round_page(cmd->ev_addr + cmd->ev_len),
prot, false);
if (error)
return error;
}
return 0;
}
/*
* vmcmd_map_zero():
* handle vmcmd which specifies a zero-filled address space region. The
* address range must be first allocated, then protected appropriately.
*/
int
vmcmd_map_zero(struct lwp *l, struct exec_vmcmd *cmd)
{
struct proc *p = l->l_proc;
int error;
long diff;
vm_prot_t prot, maxprot;
diff = cmd->ev_addr - trunc_page(cmd->ev_addr);
cmd->ev_addr -= diff; /* required by uvm_map */
cmd->ev_len += diff;
if ((error = vmcmd_get_prot(l, cmd, &prot, &maxprot)) != 0)
return error;
error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr,
round_page(cmd->ev_len), NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(prot, maxprot, UVM_INH_COPY,
UVM_ADV_NORMAL,
UVM_FLAG_FIXED|UVM_FLAG_COPYONW));
if (cmd->ev_flags & VMCMD_STACK)
curproc->p_vmspace->vm_issize += atop(round_page(cmd->ev_len));
return error;
}
/*
* exec_read():
*
* Read from vnode into buffer at offset.
*/
int
exec_read(struct lwp *l, struct vnode *vp, u_long off, void *bf, size_t size,
int ioflg)
{
int error;
size_t resid;
KASSERT((ioflg & IO_NODELOCKED) == 0 || VOP_ISLOCKED(vp) != LK_NONE);
if ((error = vn_rdwr(UIO_READ, vp, bf, size, off, UIO_SYSSPACE,
ioflg, l->l_cred, &resid, NULL)) != 0)
return error;
/*
* See if we got all of it
*/
if (resid != 0)
return ENOEXEC;
return 0;
}
/*
* exec_setup_stack(): Set up the stack segment for an elf
* executable.
*
* Note that the ep_ssize parameter must be set to be the current stack
* limit; this is adjusted in the body of execve() to yield the
* appropriate stack segment usage once the argument length is
* calculated.
*
* This function returns an int for uniformity with other (future) formats'
* stack setup functions. They might have errors to return.
*/
int
exec_setup_stack(struct lwp *l, struct exec_package *epp)
{
vsize_t max_stack_size;
vaddr_t access_linear_min;
vsize_t access_size;
vaddr_t noaccess_linear_min;
vsize_t noaccess_size;
#ifndef USRSTACK32
#define USRSTACK32 (0x00000000ffffffffL&~PGOFSET)
#endif
#ifndef MAXSSIZ32
#define MAXSSIZ32 (MAXSSIZ >> 2)
#endif
if (epp->ep_flags & EXEC_32) {
epp->ep_minsaddr = USRSTACK32;
max_stack_size = MAXSSIZ32;
} else {
epp->ep_minsaddr = USRSTACK;
max_stack_size = MAXSSIZ;
}
DPRINTF(("ep_minsaddr=%#jx max_stack_size=%#jx\n",
(uintmax_t)epp->ep_minsaddr, (uintmax_t)max_stack_size));
pax_aslr_stack(epp, &max_stack_size);
DPRINTF(("[RLIMIT_STACK].lim_cur=%#jx max_stack_size=%#jx\n",
(uintmax_t)l->l_proc->p_rlimit[RLIMIT_STACK].rlim_cur,
(uintmax_t)max_stack_size));
epp->ep_ssize = MIN(l->l_proc->p_rlimit[RLIMIT_STACK].rlim_cur,
max_stack_size);
l->l_proc->p_stackbase = epp->ep_minsaddr;
epp->ep_maxsaddr = (vaddr_t)STACK_GROW(epp->ep_minsaddr,
max_stack_size);
DPRINTF(("ep_ssize=%#jx ep_minsaddr=%#jx ep_maxsaddr=%#jx\n",
(uintmax_t)epp->ep_ssize, (uintmax_t)epp->ep_minsaddr,
(uintmax_t)epp->ep_maxsaddr));
/*
* set up commands for stack. note that this takes *two*, one to
* map the part of the stack which we can access, and one to map
* the part which we can't.
*
* arguably, it could be made into one, but that would require the
* addition of another mapping proc, which is unnecessary
*/
access_size = epp->ep_ssize;
access_linear_min = (vaddr_t)STACK_ALLOC(epp->ep_minsaddr, access_size);
noaccess_size = max_stack_size - access_size;
noaccess_linear_min = (vaddr_t)STACK_ALLOC(STACK_GROW(epp->ep_minsaddr,
access_size), noaccess_size);
DPRINTF(("access_size=%#jx, access_linear_min=%#jx, "
"noaccess_size=%#jx, noaccess_linear_min=%#jx\n",
(uintmax_t)access_size, (uintmax_t)access_linear_min,
(uintmax_t)noaccess_size, (uintmax_t)noaccess_linear_min));
if (user_stack_guard_size > 0) {
#ifdef __MACHINE_STACK_GROWS_UP
vsize_t guard_size = MIN(VM_MAXUSER_ADDRESS - epp->ep_maxsaddr, user_stack_guard_size);
if (guard_size > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, guard_size,
epp->ep_maxsaddr, NULL, 0, VM_PROT_NONE);
#else
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, user_stack_guard_size,
epp->ep_maxsaddr - user_stack_guard_size, NULL, 0, VM_PROT_NONE);
#endif
}
if (noaccess_size > 0 && noaccess_size <= MAXSSIZ) {
NEW_VMCMD2(&epp->ep_vmcmds, vmcmd_map_zero, noaccess_size,
noaccess_linear_min, NULL, 0,
VM_PROT_NONE | PROT_MPROTECT(VM_PROT_READ | VM_PROT_WRITE),
VMCMD_STACK);
}
KASSERT(access_size > 0);
KASSERT(access_size <= MAXSSIZ);
NEW_VMCMD2(&epp->ep_vmcmds, vmcmd_map_zero, access_size,
access_linear_min, NULL, 0, VM_PROT_READ | VM_PROT_WRITE,
VMCMD_STACK);
return 0;
}
/* $NetBSD: kern_turnstile.c,v 1.55 2023/10/15 10:30:20 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Turnstiles are described in detail in:
*
* Solaris Internals: Core Kernel Architecture, Jim Mauro and
* Richard McDougall.
*
* Turnstiles are kept in a hash table. There are likely to be many more
* synchronisation objects than there are threads. Since a thread can block
* on only one lock at a time, we only need one turnstile per thread, and
* so they are allocated at thread creation time.
*
* When a thread decides it needs to block on a lock, it looks up the
* active turnstile for that lock. If no active turnstile exists, then
* the process lends its turnstile to the lock. If there is already an
* active turnstile for the lock, the thread places its turnstile on a
* list of free turnstiles, and references the active one instead.
*
* The act of looking up the turnstile acquires an interlock on the sleep
* queue. If a thread decides it doesn't need to block after all, then this
* interlock must be released by explicitly aborting the turnstile
* operation.
*
* When a thread is awakened, it needs to get its turnstile back. If there
* are still other threads waiting in the active turnstile, the thread
* grabs a free turnstile off the free list. Otherwise, it can take back
* the active turnstile from the lock (thus deactivating the turnstile).
*
* Turnstiles are where we do priority inheritance.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.55 2023/10/15 10:30:20 riastradh Exp $");
#include <sys/param.h>
#include <sys/lockdebug.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/sleeptab.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
/*
* Shift of 6 aligns to typical cache line size of 64 bytes; there's no
* point having two turnstile locks to back two lock objects that share one
* cache line.
*/
#define TS_HASH_SIZE 128
#define TS_HASH_MASK (TS_HASH_SIZE - 1)
#define TS_HASH(obj) (((uintptr_t)(obj) >> 6) & TS_HASH_MASK)
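/*
 * Illustrative example: with the shift of 6 above, two lock objects in
 * the same 64-byte cache line (say, addresses ending in 0x40 and 0x78)
 * hash to the same bucket and therefore share one turnstile chain lock.
 */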
static tschain_t turnstile_chains[TS_HASH_SIZE] __cacheline_aligned;
static union {
kmutex_t lock;
uint8_t pad[COHERENCY_UNIT];
} turnstile_locks[TS_HASH_SIZE] __cacheline_aligned;
/*
* turnstile_init:
*
* Initialize the turnstile mechanism.
*/
void
turnstile_init(void)
{
int i;
for (i = 0; i < TS_HASH_SIZE; i++) {
LIST_INIT(&turnstile_chains[i]);
mutex_init(&turnstile_locks[i].lock, MUTEX_DEFAULT, IPL_SCHED);
}
turnstile_ctor(&turnstile0);
}
/*
* turnstile_ctor:
*
* Constructor for turnstiles.
*/
void
turnstile_ctor(turnstile_t *ts)
{
memset(ts, 0, sizeof(*ts));
sleepq_init(&ts->ts_sleepq[TS_READER_Q]);
sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]);
}
/*
* turnstile_remove:
*
* Remove an LWP from a turnstile sleep queue and wake it.
*/
static inline void
turnstile_remove(turnstile_t *ts, lwp_t *l, int q)
{
turnstile_t *nts;
KASSERT(l->l_ts == ts);
/*
* This process is no longer using the active turnstile.
* Find an inactive one on the free list to give to it.
*/
if ((nts = ts->ts_free) != NULL) {
KASSERT(TS_ALL_WAITERS(ts) > 1);
l->l_ts = nts;
ts->ts_free = nts->ts_free;
nts->ts_free = NULL;
} else {
/*
* If the free list is empty, this is the last
* waiter.
*/
KASSERT(TS_ALL_WAITERS(ts) == 1);
LIST_REMOVE(ts, ts_chain);
}
ts->ts_waiters[q]--;
sleepq_remove(&ts->ts_sleepq[q], l, true);
}
/*
* turnstile_lookup:
*
* Look up the turnstile for the specified lock. This acquires and
* holds the turnstile chain lock (sleep queue interlock).
*/
turnstile_t *
turnstile_lookup(wchan_t obj)
{
turnstile_t *ts;
tschain_t *tc;
u_int hash;
hash = TS_HASH(obj);
tc = &turnstile_chains[hash];
mutex_spin_enter(&turnstile_locks[hash].lock);
LIST_FOREACH(ts, tc, ts_chain)
if (ts->ts_obj == obj)
return (ts);
/*
* No turnstile yet for this lock. No problem, turnstile_block()
* handles this by fetching the turnstile from the blocking thread.
*/
return (NULL);
}
/*
* turnstile_exit:
*
* Abort a turnstile operation.
*/
void
turnstile_exit(wchan_t obj)
{
mutex_spin_exit(&turnstile_locks[TS_HASH(obj)].lock);
}
/*
* turnstile_lendpri:
*
* Lend our priority to lwps on the blocking chain.
*
* If the current owner of the lock (l->l_wchan, set by sleepq_enqueue)
* has a priority lower than ours (lwp_eprio(l)), lend our priority to
* it to avoid priority inversion.
*/
static void
turnstile_lendpri(lwp_t *cur)
{
lwp_t * l = cur;
pri_t prio;
/*
* NOTE: if you get a panic in this code block, it is likely that
* a lock has been destroyed or corrupted while still in use. Try
* compiling a kernel with LOCKDEBUG to pinpoint the problem.
*/
LOCKDEBUG_BARRIER(l->l_mutex, 1);
KASSERT(l == curlwp);
prio = lwp_eprio(l);
for (;;) {
lwp_t *owner;
turnstile_t *ts;
bool dolock;
if (l->l_wchan == NULL)
break;
/*
* Ask the syncobj for the owner of the lock.
*/
owner = (*l->l_syncobj->sobj_owner)(l->l_wchan);
if (owner == NULL)
break;
/*
* The owner may have changed as we have dropped the tc lock.
*/
if (cur == owner) {
/*
* We own the lock: stop here, sleepq_block()
* should wake up immediately.
*/
break;
}
/*
* Acquire owner->l_mutex if we don't have it yet.
* Because we already have another LWP lock (l->l_mutex) held,
* we need to play a try lock dance to avoid deadlock.
*/
dolock = l->l_mutex != atomic_load_relaxed(&owner->l_mutex);
if (l == owner || (dolock && !lwp_trylock(owner))) {
/*
* The owner was changed behind us or trylock failed.
* Restart from curlwp.
*
* Note that there may be a livelock here:
* the owner may try grabbing cur's lock (which is the
* tc lock) while we're trying to grab the owner's lock.
*/
lwp_unlock(l);
l = cur;
lwp_lock(l);
prio = lwp_eprio(l);
continue;
}
/*
* If the owner's priority is already higher than ours,
* there's nothing to do anymore.
*/
if (prio <= lwp_eprio(owner)) {
if (dolock)
lwp_unlock(owner);
break;
}
/*
* Lend our priority to the 'owner' LWP.
*
* Update lenders info for turnstile_unlendpri.
*/
ts = l->l_ts;
KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL);
if (ts->ts_inheritor == NULL) {
ts->ts_inheritor = owner;
ts->ts_eprio = prio;
SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain);
lwp_lendpri(owner, prio);
} else if (prio > ts->ts_eprio) {
ts->ts_eprio = prio;
lwp_lendpri(owner, prio);
}
if (dolock)
lwp_unlock(l);
LOCKDEBUG_BARRIER(owner->l_mutex, 1);
l = owner;
}
LOCKDEBUG_BARRIER(l->l_mutex, 1);
if (cur->l_mutex != atomic_load_relaxed(&l->l_mutex)) {
lwp_unlock(l);
lwp_lock(cur);
}
LOCKDEBUG_BARRIER(cur->l_mutex, 1);
}
/*
* turnstile_unlendpri: undo turnstile_lendpri
*/
static void
turnstile_unlendpri(turnstile_t *ts)
{
lwp_t * const l = curlwp;
turnstile_t *iter;
turnstile_t *next;
turnstile_t *prev = NULL;
pri_t prio;
bool dolock;
KASSERT(ts->ts_inheritor != NULL);
ts->ts_inheritor = NULL;
dolock = (atomic_load_relaxed(&l->l_mutex) ==
l->l_cpu->ci_schedstate.spc_lwplock);
if (dolock) {
lwp_lock(l);
}
/*
* the following loop does two things.
*
* - remove ts from the list.
*
* - from the rest of the list, find the highest priority.
*/
prio = -1;
KASSERT(!SLIST_EMPTY(&l->l_pi_lenders));
for (iter = SLIST_FIRST(&l->l_pi_lenders);
iter != NULL; iter = next) {
KASSERT(lwp_eprio(l) >= ts->ts_eprio);
next = SLIST_NEXT(iter, ts_pichain);
if (iter == ts) {
if (prev == NULL) {
SLIST_REMOVE_HEAD(&l->l_pi_lenders,
ts_pichain);
} else {
SLIST_REMOVE_AFTER(prev, ts_pichain);
}
} else if (prio < iter->ts_eprio) {
prio = iter->ts_eprio;
}
prev = iter;
}
lwp_lendpri(l, prio);
if (dolock) {
lwp_unlock(l);
}
}
/*
* turnstile_block:
*
* Enter an object into the turnstile chain and prepare the current
* LWP for sleep.
*/
void
turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj)
{
lwp_t * const l = curlwp; /* cached curlwp */
turnstile_t *ots;
tschain_t *tc;
kmutex_t *lock;
sleepq_t *sq;
u_int hash;
int nlocks;
hash = TS_HASH(obj);
tc = &turnstile_chains[hash];
lock = &turnstile_locks[hash].lock;
KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
KASSERT(mutex_owned(lock));
KASSERT(l != NULL);
KASSERT(l->l_ts != NULL);
if (ts == NULL) {
/*
* We are the first thread to wait for this object;
* lend our turnstile to it.
*/
ts = l->l_ts;
KASSERT(TS_ALL_WAITERS(ts) == 0);
KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]));
KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
ts->ts_obj = obj;
ts->ts_inheritor = NULL;
LIST_INSERT_HEAD(tc, ts, ts_chain);
} else {
/*
* Object already has a turnstile. Put our turnstile
* onto the free list, and reference the existing
* turnstile instead.
*/
ots = l->l_ts;
KASSERT(ots->ts_free == NULL);
ots->ts_free = ts->ts_free;
ts->ts_free = ots;
l->l_ts = ts;
KASSERT(ts->ts_obj == obj);
KASSERT(TS_ALL_WAITERS(ts) != 0);
KASSERT(!LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) ||
!LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
}
sq = &ts->ts_sleepq[q];
ts->ts_waiters[q]++;
nlocks = sleepq_enter(sq, l, lock);
LOCKDEBUG_BARRIER(lock, 1);
sleepq_enqueue(sq, obj, sobj->sobj_name, sobj, false);
/*
* Disable preemption across this entire block, as we may drop
* scheduler locks (allowing preemption), and would prefer not
* to be interrupted while in a state of flux.
*/
KPREEMPT_DISABLE(l);
KASSERT(lock == l->l_mutex);
turnstile_lendpri(l);
sleepq_block(0, false, sobj, nlocks);
KPREEMPT_ENABLE(l);
}
/*
* turnstile_wakeup:
*
* Wake up the specified number of threads that are blocked
* in a turnstile.
*/
void
turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl)
{
sleepq_t *sq;
kmutex_t *lock;
u_int hash;
lwp_t *l;
hash = TS_HASH(ts->ts_obj);
lock = &turnstile_locks[hash].lock;
sq = &ts->ts_sleepq[q];
KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
KASSERT(count > 0);
KASSERT(count <= TS_WAITERS(ts, q));
KASSERT(mutex_owned(lock));
KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL);
/*
* restore inherited priority if necessary.
*/
if (ts->ts_inheritor != NULL) {
turnstile_unlendpri(ts);
}
if (nl != NULL) {
#if defined(DEBUG) || defined(LOCKDEBUG)
LIST_FOREACH(l, sq, l_sleepchain) {
if (l == nl)
break;
}
if (l == NULL)
panic("turnstile_wakeup: nl not on sleepq");
#endif
turnstile_remove(ts, nl, q);
} else {
while (count-- > 0) {
l = LIST_FIRST(sq);
KASSERT(l != NULL);
turnstile_remove(ts, l, q);
}
}
mutex_spin_exit(lock);
}
/*
* turnstile_unsleep:
*
* Remove an LWP from the turnstile. This is called when the LWP has
* not been awoken normally but instead interrupted: for example, if it
* has received a signal. It's not a valid action for turnstiles,
* since LWPs blocking on a turnstile are not interruptible.
*/
void
turnstile_unsleep(lwp_t *l, bool cleanup)
{
lwp_unlock(l);
panic("turnstile_unsleep");
}
/*
* turnstile_changepri:
*
* Adjust the priority of an LWP residing on a turnstile.
*/
void
turnstile_changepri(lwp_t *l, pri_t pri)
{
/* XXX priority inheritance */
sleepq_changepri(l, pri);
}
#if defined(LOCKDEBUG)
/*
* turnstile_print:
*
* Given the address of a lock object, print the contents of a
* turnstile.
*/
void
turnstile_print(volatile void *obj, void (*pr)(const char *, ...))
{
turnstile_t *ts;
tschain_t *tc;
sleepq_t *rsq, *wsq;
u_int hash;
lwp_t *l;
hash = TS_HASH(obj);
tc = &turnstile_chains[hash];
LIST_FOREACH(ts, tc, ts_chain)
if (ts->ts_obj == obj)
break;
if (ts == NULL) {
(*pr)("Turnstile: no active turnstile for this lock.\n");
return;
}
rsq = &ts->ts_sleepq[TS_READER_Q];
wsq = &ts->ts_sleepq[TS_WRITER_Q];
(*pr)("Turnstile:\n");
(*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q));
LIST_FOREACH(l, rsq, l_sleepchain) {
(*pr)(" %p", l);
}
(*pr)("\n");
(*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q));
LIST_FOREACH(l, wsq, l_sleepchain) {
(*pr)(" %p", l);
}
(*pr)("\n");
}
#endif /* LOCKDEBUG */
/* $NetBSD: genfs_io.c,v 1.104 2024/04/05 13:05:40 riastradh Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.104 2024/04/05 13:05:40 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/buf.h>
#include <sys/atomic.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pager.h>
#include <uvm/uvm_page_array.h>
static int genfs_do_directio(struct vmspace *, vaddr_t, size_t, struct vnode *,
off_t, enum uio_rw);
static void genfs_dio_iodone(struct buf *);
static int genfs_getpages_read(struct vnode *, struct vm_page **, int, off_t,
off_t, bool, bool, bool, bool);
static int genfs_do_io(struct vnode *, off_t, vaddr_t, size_t, int, enum uio_rw,
void (*)(struct buf *));
static void genfs_rel_pages(struct vm_page **, unsigned int);
int genfs_maxdio = MAXPHYS;
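/* Upper bound, in bytes, on a single direct I/O transfer. */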
static void
genfs_rel_pages(struct vm_page **pgs, unsigned int npages)
{
unsigned int i;
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[i];
if (pg == NULL || pg == PGO_DONTCARE)
continue;
KASSERT(uvm_page_owner_locked_p(pg, true));
if (pg->flags & PG_FAKE) {
pg->flags |= PG_RELEASED;
}
}
uvm_page_unbusy(pgs, npages);
}
/*
* generic VM getpages routine.
* Return PG_BUSY pages for the given range,
* reading from backing store if necessary.
*/
int
genfs_getpages(void *v)
{
struct vop_getpages_args /* {
struct vnode *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ * const ap = v;
off_t diskeof, memeof;
int i, error, npages, iflag;
const int flags = ap->a_flags;
struct vnode * const vp = ap->a_vp;
struct uvm_object * const uobj = &vp->v_uobj;
const bool async = (flags & PGO_SYNCIO) == 0;
const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
const bool overwrite = (flags & PGO_OVERWRITE) != 0;
const bool blockalloc = memwrite && (flags & PGO_NOBLOCKALLOC) == 0;
const bool need_wapbl = (vp->v_mount->mnt_wapbl &&
(flags & PGO_JOURNALLOCKED) == 0);
const bool glocked = (flags & PGO_GLOCKHELD) != 0;
bool holds_wapbl = false;
struct mount *trans_mount = NULL;
UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx off 0x%jx/%jx count %jd",
(uintptr_t)vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count);
KASSERT(memwrite >= overwrite);
KASSERT(vp->v_type == VREG || vp->v_type == VDIR ||
vp->v_type == VLNK || vp->v_type == VBLK);
/*
* the object must be locked. it can only be a read lock when
* processing a read fault with PGO_LOCKED.
*/
KASSERT(rw_lock_held(uobj->vmobjlock));
KASSERT(rw_write_held(uobj->vmobjlock) ||
((flags & PGO_LOCKED) != 0 && !memwrite));
#ifdef DIAGNOSTIC
if ((flags & PGO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
WAPBL_JLOCK_ASSERT(vp->v_mount);
#endif
/*
* check for reclaimed vnode. v_interlock is not held here, but
* VI_DEADCHECK is set with vmobjlock held.
*/
iflag = atomic_load_relaxed(&vp->v_iflag);
if (__predict_false((iflag & VI_DEADCHECK) != 0)) {
mutex_enter(vp->v_interlock);
error = vdead_check(vp, VDEAD_NOWAIT);
mutex_exit(vp->v_interlock);
if (error) {
if ((flags & PGO_LOCKED) == 0)
rw_exit(uobj->vmobjlock);
return error;
}
}
startover:
error = 0;
const voff_t origvsize = vp->v_size;
const off_t origoffset = ap->a_offset;
const int orignpages = *ap->a_count;
GOP_SIZE(vp, origvsize, &diskeof, 0);
if (flags & PGO_PASTEOF) {
off_t newsize;
#if defined(DIAGNOSTIC)
off_t writeeof;
#endif /* defined(DIAGNOSTIC) */
newsize = MAX(origvsize,
origoffset + (orignpages << PAGE_SHIFT));
GOP_SIZE(vp, newsize, &memeof, GOP_SIZE_MEM);
#if defined(DIAGNOSTIC)
GOP_SIZE(vp, vp->v_writesize, &writeeof, GOP_SIZE_MEM);
if (newsize > round_page(writeeof)) {
panic("%s: past eof: %" PRId64 " vs. %" PRId64,
__func__, newsize, round_page(writeeof));
}
#endif /* defined(DIAGNOSTIC) */
} else {
GOP_SIZE(vp, origvsize, &memeof, GOP_SIZE_MEM);
}
KASSERT(ap->a_centeridx >= 0 || ap->a_centeridx <= orignpages);
KASSERT((origoffset & (PAGE_SIZE - 1)) == 0);
KASSERT(origoffset >= 0);
KASSERT(orignpages > 0);
/*
* Bounds-check the request.
*/
if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) {
if ((flags & PGO_LOCKED) == 0) {
rw_exit(uobj->vmobjlock);
}
UVMHIST_LOG(ubchist, "off 0x%jx count %jd goes past EOF 0x%jx",
origoffset, *ap->a_count, memeof,0);
error = EINVAL;
goto out_err;
}
/* uobj is locked */
if ((flags & PGO_NOTIMESTAMP) == 0 && (vp->v_type != VBLK ||
(vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) {
int updflags = 0;
if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
updflags = GOP_UPDATE_ACCESSED;
}
if (memwrite) {
updflags |= GOP_UPDATE_MODIFIED;
}
if (updflags != 0) {
GOP_MARKUPDATE(vp, updflags);
}
}
/*
* For PGO_LOCKED requests, just return whatever's in memory.
*/
if (flags & PGO_LOCKED) {
int nfound;
struct vm_page *pg;
KASSERT(!glocked);
npages = *ap->a_count;
#if defined(DEBUG)
for (i = 0; i < npages; i++) {
pg = ap->a_m[i];
KASSERT(pg == NULL || pg == PGO_DONTCARE);
}
#endif /* defined(DEBUG) */
nfound = uvn_findpages(uobj, origoffset, &npages,
ap->a_m, NULL,
UFP_NOWAIT | UFP_NOALLOC | UFP_NOBUSY |
(memwrite ? UFP_NORDONLY : 0));
KASSERT(npages == *ap->a_count);
if (nfound == 0) {
error = EBUSY;
goto out_err;
}
/*
* lock and unlock g_glock to ensure that no one is truncating
* the file behind us.
*/
if (!genfs_node_rdtrylock(vp)) {
/*
* restore the array.
*/
for (i = 0; i < npages; i++) {
pg = ap->a_m[i];
if (pg != NULL && pg != PGO_DONTCARE) {
ap->a_m[i] = NULL;
}
KASSERT(ap->a_m[i] == NULL ||
ap->a_m[i] == PGO_DONTCARE);
}
} else {
genfs_node_unlock(vp);
}
error = (ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0);
if (error == 0 && memwrite) {
for (i = 0; i < npages; i++) {
pg = ap->a_m[i];
if (pg == NULL || pg == PGO_DONTCARE) {
continue;
}
if (uvm_pagegetdirty(pg) ==
UVM_PAGE_STATUS_CLEAN) {
uvm_pagemarkdirty(pg,
UVM_PAGE_STATUS_UNKNOWN);
}
}
}
goto out_err;
}
rw_exit(uobj->vmobjlock);
/*
* find the requested pages and make some simple checks.
* leave space in the page array for a whole block.
*/
const int fs_bshift = (vp->v_type != VBLK) ? vp->v_mount->mnt_fs_bshift : DEV_BSHIFT;
const int fs_bsize = 1 << fs_bshift;
#define blk_mask (fs_bsize - 1)
#define trunc_blk(x) ((x) & ~blk_mask)
#define round_blk(x) (((x) + blk_mask) & ~blk_mask)
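/*
 * Example: with fs_bsize == 8192, trunc_blk(0x2345) == 0x2000 and
 * round_blk(0x2345) == 0x4000.
 */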
const int orignmempages = MIN(orignpages,
round_page(memeof - origoffset) >> PAGE_SHIFT);
npages = orignmempages;
const off_t startoffset = trunc_blk(origoffset);
const off_t endoffset = MIN(
round_page(round_blk(origoffset + (npages << PAGE_SHIFT))),
round_page(memeof));
const int ridx = (origoffset - startoffset) >> PAGE_SHIFT;
const int pgs_size = sizeof(struct vm_page *) *
((endoffset - startoffset) >> PAGE_SHIFT);
struct vm_page **pgs, *pgs_onstack[UBC_MAX_PAGES];
if (pgs_size > sizeof(pgs_onstack)) {
pgs = kmem_zalloc(pgs_size, async ? KM_NOSLEEP : KM_SLEEP);
if (pgs == NULL) {
pgs = pgs_onstack;
error = ENOMEM;
goto out_err;
}
} else {
pgs = pgs_onstack;
(void)memset(pgs, 0, pgs_size);
}
UVMHIST_LOG(ubchist, "ridx %jd npages %jd startoff %#jx endoff %#jx",
ridx, npages, startoffset, endoffset);
if (trans_mount == NULL) {
trans_mount = vp->v_mount;
fstrans_start(trans_mount);
/*
* check if this vnode is still valid.
*/
mutex_enter(vp->v_interlock);
error = vdead_check(vp, 0);
mutex_exit(vp->v_interlock);
if (error)
goto out_err_free;
/*
* XXX: This assumes that we come here only via
* the mmio path
*/
if (blockalloc && need_wapbl) {
error = WAPBL_BEGIN(trans_mount);
if (error)
goto out_err_free;
holds_wapbl = true;
}
}
/*
* hold g_glock to prevent a race with truncate.
*
* check if our idea of v_size is still valid.
*/
KASSERT(!glocked || genfs_node_wrlocked(vp));
if (!glocked) {
if (blockalloc) {
genfs_node_wrlock(vp);
} else {
genfs_node_rdlock(vp);
}
}
rw_enter(uobj->vmobjlock, RW_WRITER);
if (vp->v_size < origvsize) {
if (!glocked) {
genfs_node_unlock(vp);
}
if (pgs != pgs_onstack)
kmem_free(pgs, pgs_size);
goto startover;
}
if (uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], NULL,
async ? UFP_NOWAIT : UFP_ALL) != orignmempages) {
if (!glocked) {
genfs_node_unlock(vp);
}
KASSERT(async != 0);
genfs_rel_pages(&pgs[ridx], orignmempages);
rw_exit(uobj->vmobjlock);
error = EBUSY;
goto out_err_free;
}
/*
* if PGO_OVERWRITE is set, don't bother reading the pages.
*/
if (overwrite) {
if (!glocked) {
genfs_node_unlock(vp);
}
UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0);
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[ridx + i];
/*
* it's caller's responsibility to allocate blocks
* beforehand for the overwrite case.
*/
KASSERT((pg->flags & PG_RDONLY) == 0 || !blockalloc);
pg->flags &= ~PG_RDONLY;
/*
* mark the page DIRTY.
* otherwise another thread can do putpages and pull
* our vnode from syncer's queue before our caller does
* ubc_release. note that putpages won't see CLEAN
* pages even if they are BUSY.
*/
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
}
npages += ridx;
goto out;
}
/*
* if the pages are already resident, just return them.
*/
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[ridx + i];
if ((pg->flags & PG_FAKE) || (blockalloc && (pg->flags & PG_RDONLY) != 0)) {
break;
}
}
if (i == npages) {
if (!glocked) {
genfs_node_unlock(vp);
}
UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0);
npages += ridx;
goto out;
}
/*
* the page wasn't resident and we're not overwriting,
* so we're going to have to do some i/o.
* find any additional pages needed to cover the expanded range.
*/
npages = (endoffset - startoffset) >> PAGE_SHIFT;
if (startoffset != origoffset || npages != orignmempages) {
int npgs;
/*
* we need to avoid deadlocks caused by locking
* additional pages at lower offsets than pages we
* already have locked. unlock them all and start over.
*/
genfs_rel_pages(&pgs[ridx], orignmempages);
memset(pgs, 0, pgs_size);
UVMHIST_LOG(ubchist, "reset npages start 0x%jx end 0x%jx",
startoffset, endoffset, 0,0);
npgs = npages;
if (uvn_findpages(uobj, startoffset, &npgs, pgs, NULL,
async ? UFP_NOWAIT : UFP_ALL) != npages) {
if (!glocked) {
genfs_node_unlock(vp);
}
KASSERT(async != 0);
genfs_rel_pages(pgs, npages);
rw_exit(uobj->vmobjlock);
error = EBUSY;
goto out_err_free;
}
}
rw_exit(uobj->vmobjlock);
error = genfs_getpages_read(vp, pgs, npages, startoffset, diskeof,
async, memwrite, blockalloc, glocked);
if (!glocked) {
genfs_node_unlock(vp);
}
if (error == 0 && async)
goto out_err_free;
rw_enter(uobj->vmobjlock, RW_WRITER);
/*
* we're almost done! release the pages...
* for errors, we free the pages.
* otherwise we activate them and mark them as valid and clean.
* also, unbusy pages that were not actually requested.
*/
if (error) {
genfs_rel_pages(pgs, npages);
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(ubchist, "returning error %jd", error,0,0,0);
goto out_err_free;
}
out:
UVMHIST_LOG(ubchist, "succeeding, npages %jd", npages,0,0,0);
error = 0;
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[i];
if (pg == NULL) {
continue;
}
UVMHIST_LOG(ubchist, "examining pg %#jx flags 0x%jx",
(uintptr_t)pg, pg->flags, 0,0);
if (pg->flags & PG_FAKE && !overwrite) {
/*
* we've read page's contents from the backing storage.
*
* for a read fault, we keep them CLEAN; if we
* encountered a hole while reading, the pages may
* already have been dirtied with zeros.
*/
KASSERTMSG(blockalloc || uvm_pagegetdirty(pg) ==
UVM_PAGE_STATUS_CLEAN, "page %p not clean", pg);
pg->flags &= ~PG_FAKE;
}
KASSERT(!memwrite || !blockalloc || (pg->flags & PG_RDONLY) == 0);
if (i < ridx || i >= ridx + orignmempages || async) {
UVMHIST_LOG(ubchist, "unbusy pg %#jx offset 0x%jx",
(uintptr_t)pg, pg->offset,0,0);
if (pg->flags & PG_FAKE) {
KASSERT(overwrite);
uvm_pagezero(pg);
}
if (pg->flags & PG_RELEASED) {
uvm_pagefree(pg);
continue;
}
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~(PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(pg, NULL);
} else if (memwrite && !overwrite &&
uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
/*
* for a write fault, start dirtiness tracking of
* requested pages.
*/
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
}
}
rw_exit(uobj->vmobjlock);
if (ap->a_m != NULL) {
memcpy(ap->a_m, &pgs[ridx],
orignmempages * sizeof(struct vm_page *));
}
out_err_free:
if (pgs != NULL && pgs != pgs_onstack)
kmem_free(pgs, pgs_size);
out_err:
if (trans_mount != NULL) {
if (holds_wapbl)
WAPBL_END(trans_mount);
fstrans_done(trans_mount);
}
return error;
}
/*
* genfs_getpages_read: Read the pages in with VOP_BMAP/VOP_STRATEGY.
*
* "glocked" (which is currently not actually used) tells us not whether
* the genfs_node is locked on entry (it always is) but whether it was
* locked on entry to genfs_getpages.
*/
static int
genfs_getpages_read(struct vnode *vp, struct vm_page **pgs, int npages,
off_t startoffset, off_t diskeof,
bool async, bool memwrite, bool blockalloc, bool glocked)
{
struct uvm_object * const uobj = &vp->v_uobj;
const int fs_bshift = (vp->v_type != VBLK) ? vp->v_mount->mnt_fs_bshift : DEV_BSHIFT;
const int dev_bshift = (vp->v_type != VBLK) ?
vp->v_mount->mnt_dev_bshift : DEV_BSHIFT;
kauth_cred_t const cred = curlwp->l_cred; /* XXXUBC curlwp */
size_t bytes, iobytes, tailstart, tailbytes, totalbytes, skipbytes;
vaddr_t kva;
struct buf *bp, *mbp;
bool sawhole = false;
int i;
int error = 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
/*
* read the desired page(s).
*/
totalbytes = npages << PAGE_SHIFT;
bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0));
tailbytes = totalbytes - bytes;
skipbytes = 0;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_READ | (async ? 0 : UVMPAGER_MAPIN_WAITOK));
if (kva == 0)
return EBUSY;
mbp = getiobuf(vp, true);
mbp->b_bufsize = totalbytes;
mbp->b_data = (void *)kva;
mbp->b_resid = mbp->b_bcount = bytes;
mbp->b_cflags |= BC_BUSY;
if (async) {
mbp->b_flags = B_READ | B_ASYNC;
mbp->b_iodone = uvm_aio_aiodone;
} else {
mbp->b_flags = B_READ;
mbp->b_iodone = NULL;
}
if (async)
BIO_SETPRIO(mbp, BPRIO_TIMELIMITED);
else
BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL);
/*
* if EOF is in the middle of the range, zero the part past EOF.
* skip over pages which are not PG_FAKE since in that case they have
* valid data that we need to preserve.
*/
tailstart = bytes;
while (tailbytes > 0) {
const int len = PAGE_SIZE - (tailstart & PAGE_MASK);
KASSERT(len <= tailbytes);
if ((pgs[tailstart >> PAGE_SHIFT]->flags & PG_FAKE) != 0) {
memset((void *)(kva + tailstart), 0, len);
UVMHIST_LOG(ubchist, "tailbytes %#jx 0x%jx 0x%jx",
(uintptr_t)kva, tailstart, len, 0);
}
tailstart += len;
tailbytes -= len;
}
/*
* now loop over the pages, reading as needed.
*/
bp = NULL;
off_t offset;
for (offset = startoffset;
bytes > 0;
offset += iobytes, bytes -= iobytes) {
int run;
daddr_t lbn, blkno;
int pidx;
struct vnode *devvp;
/*
* skip pages which don't need to be read.
*/
pidx = (offset - startoffset) >> PAGE_SHIFT;
while ((pgs[pidx]->flags & PG_FAKE) == 0) {
size_t b;
KASSERT((offset & (PAGE_SIZE - 1)) == 0);
if ((pgs[pidx]->flags & PG_RDONLY)) {
sawhole = true;
}
b = MIN(PAGE_SIZE, bytes);
offset += b;
bytes -= b;
skipbytes += b;
pidx++;
UVMHIST_LOG(ubchist, "skipping, new offset 0x%jx",
offset, 0,0,0);
if (bytes == 0) {
goto loopdone;
}
}
/*
* bmap the file to find out the blkno to read from and
* how much we can read in one i/o. if bmap returns an error,
* skip the rest of the top-level i/o.
*/
lbn = offset >> fs_bshift;
error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
if (error) {
UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%jx -> %jd",
lbn,error,0,0);
skipbytes += bytes;
bytes = 0;
goto loopdone;
}
/*
* see how many pages can be read with this i/o.
* reduce the i/o size if necessary to avoid
* overwriting pages with valid data.
*/
iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
bytes);
if (offset + iobytes > round_page(offset)) {
int pcount;
pcount = 1;
while (pidx + pcount < npages &&
pgs[pidx + pcount]->flags & PG_FAKE) {
pcount++;
}
iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) -
(offset - trunc_page(offset)));
}
/*
* if this block isn't allocated, zero it instead of
* reading it. unless we are going to allocate blocks,
* mark the pages we zeroed PG_RDONLY.
*/
if (blkno == (daddr_t)-1) {
int holepages = (round_page(offset + iobytes) -
trunc_page(offset)) >> PAGE_SHIFT;
UVMHIST_LOG(ubchist, "lbn 0x%jx -> HOLE", lbn,0,0,0);
sawhole = true;
memset((char *)kva + (offset - startoffset), 0,
iobytes);
skipbytes += iobytes;
if (!blockalloc) {
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < holepages; i++) {
pgs[pidx + i]->flags |= PG_RDONLY;
}
rw_exit(uobj->vmobjlock);
}
continue;
}
/*
* allocate a sub-buf for this piece of the i/o
* (or just use mbp if there's only 1 piece),
* and start it going.
*/
if (offset == startoffset && iobytes == bytes) {
bp = mbp;
} else {
UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
(uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
bp = getiobuf(vp, true);
nestiobuf_setup(mbp, bp, offset - startoffset, iobytes);
}
bp->b_lblkno = 0;
/* adjust physical blkno for partial blocks */
bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
dev_bshift);
UVMHIST_LOG(ubchist,
"bp %#jx offset 0x%x bcount 0x%x blkno 0x%x",
(uintptr_t)bp, offset, bp->b_bcount, bp->b_blkno);
VOP_STRATEGY(devvp, bp);
}
loopdone:
nestiobuf_done(mbp, skipbytes, error);
if (async) {
UVMHIST_LOG(ubchist, "returning 0 (async)",0,0,0,0);
return 0;
}
if (bp != NULL) {
error = biowait(mbp);
}
/* Remove the mapping (make KVA available as soon as possible) */
uvm_pagermapout(kva, npages);
/*
* if we encountered a hole then we have to do a little more work.
* for read faults, we marked the page PG_RDONLY so that future
* write accesses to the page will fault again.
* for write faults, we must make sure that the backing store for
* the page is completely allocated while the pages are locked.
*/
if (!error && sawhole && blockalloc) {
error = GOP_ALLOC(vp, startoffset,
npages << PAGE_SHIFT, 0, cred);
UVMHIST_LOG(ubchist, "gop_alloc off 0x%jx/0x%jx -> %jd",
startoffset, npages << PAGE_SHIFT, error,0);
if (!error) {
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[i];
if (pg == NULL) {
continue;
}
pg->flags &= ~PG_RDONLY;
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
UVMHIST_LOG(ubchist, "mark dirty pg %#jx",
(uintptr_t)pg, 0, 0, 0);
}
rw_exit(uobj->vmobjlock);
}
}
putiobuf(mbp);
return error;
}
/*
* generic VM putpages routine.
* Write the given range of pages to backing store.
*
* => "offhi == 0" means flush all pages at or after "offlo".
* => object should be locked by caller. we return with the
* object unlocked.
* => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O).
* thus, a caller might want to unlock higher level resources
* (e.g. vm_map) before calling flush.
* => if neither PGO_CLEANIT nor PGO_SYNCIO is set, we will not block
* => if PGO_ALLPAGES is set, then all pages in the object will be processed.
*
* note on "cleaning" object and PG_BUSY pages:
* this routine is holding the lock on the object. the only time
* that it can run into a PG_BUSY page that it does not own is if
* some other process has started I/O on the page (e.g. either
* a pagein, or a pageout). if the PG_BUSY page is being paged
* in, then it can not be dirty (!UVM_PAGE_STATUS_CLEAN) because no
* one has had a chance to modify it yet. if the PG_BUSY page is
* being paged out then it means that someone else has already started
* cleaning the page for us (how nice!). in this case, if we
* have syncio specified, then after we make our pass through the
* object we need to wait for the other PG_BUSY pages to clear
* off (i.e. we need to do an iosync). also note that once a
* page is PG_BUSY it must stay in its object until it is un-busyed.
*/
int
genfs_putpages(void *v)
{
struct vop_putpages_args /* {
struct vnode *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ * const ap = v;
return genfs_do_putpages(ap->a_vp, ap->a_offlo, ap->a_offhi,
ap->a_flags, NULL);
}
int
genfs_do_putpages(struct vnode *vp, off_t startoff, off_t endoff,
int origflags, struct vm_page **busypg)
{
struct uvm_object * const uobj = &vp->v_uobj;
krwlock_t * const slock = uobj->vmobjlock;
off_t nextoff;
int i, error, npages, nback;
int freeflag;
/*
* This array is larger than it should be so that its size is constant.
* The right size is MAXPAGES.
*/
struct vm_page *pgs[MAXPHYS / MIN_PAGE_SIZE];
#define MAXPAGES (MAXPHYS / PAGE_SIZE)
struct vm_page *pg, *tpg;
struct uvm_page_array a;
bool wasclean, needs_clean;
bool async = (origflags & PGO_SYNCIO) == 0;
bool pagedaemon = curlwp == uvm.pagedaemon_lwp;
struct mount *trans_mp;
int flags;
bool modified; /* if we write out any pages */
bool holds_wapbl;
bool cleanall; /* try to pull off from the syncer's list */
bool onworklst;
bool nodirty;
const bool dirtyonly = (origflags & (PGO_DEACTIVATE|PGO_FREE)) == 0;
UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);
KASSERT(origflags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE));
KASSERT((startoff & PAGE_MASK) == 0);
KASSERT((endoff & PAGE_MASK) == 0);
KASSERT(startoff < endoff || endoff == 0);
KASSERT(rw_write_held(slock));
UVMHIST_LOG(ubchist, "vp %#jx pages %jd off 0x%jx len 0x%jx",
(uintptr_t)vp, uobj->uo_npages, startoff, endoff - startoff);
#ifdef DIAGNOSTIC
if ((origflags & PGO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
WAPBL_JLOCK_ASSERT(vp->v_mount);
#endif
trans_mp = NULL;
holds_wapbl = false;
retry:
modified = false;
flags = origflags;
/*
* shortcut if we have no pages to process.
*/
nodirty = uvm_obj_clean_p(uobj);
#ifdef DIAGNOSTIC
mutex_enter(vp->v_interlock);
KASSERT((vp->v_iflag & VI_ONWORKLST) != 0 || nodirty);
mutex_exit(vp->v_interlock);
#endif
if (uobj->uo_npages == 0 || (dirtyonly && nodirty)) {
mutex_enter(vp->v_interlock);
if (vp->v_iflag & VI_ONWORKLST &&
LIST_EMPTY(&vp->v_dirtyblkhd)) {
vn_syncer_remove_from_worklist(vp);
}
mutex_exit(vp->v_interlock);
if (trans_mp) {
if (holds_wapbl)
WAPBL_END(trans_mp);
fstrans_done(trans_mp);
}
rw_exit(slock);
return (0);
}
/*
* the vnode has pages, set up to process the request.
*/
if (trans_mp == NULL && (flags & PGO_CLEANIT) != 0) {
if (pagedaemon) {
/* Pagedaemon must not sleep here. */
trans_mp = vp->v_mount;
error = fstrans_start_nowait(trans_mp);
if (error) {
rw_exit(slock);
return error;
}
} else {
/*
* Cannot use vdeadcheck() here as this operation
* usually gets used from VOP_RECLAIM(). Test for
* change of v_mount instead and retry on change.
*/
rw_exit(slock);
trans_mp = vp->v_mount;
fstrans_start(trans_mp);
if (vp->v_mount != trans_mp) {
fstrans_done(trans_mp);
trans_mp = NULL;
} else {
holds_wapbl = (trans_mp->mnt_wapbl &&
(origflags & PGO_JOURNALLOCKED) == 0);
if (holds_wapbl) {
error = WAPBL_BEGIN(trans_mp);
if (error) {
fstrans_done(trans_mp);
return error;
}
}
}
rw_enter(slock, RW_WRITER);
goto retry;
}
}
error = 0;
wasclean = uvm_obj_nowriteback_p(uobj);
nextoff = startoff;
if (endoff == 0 || flags & PGO_ALLPAGES) {
endoff = trunc_page(LLONG_MAX);
}
/*
* if this vnode is known not to have dirty pages,
* don't bother to clean it out.
*/
if (nodirty) {
/* We handled the dirtyonly && nodirty case above. */
KASSERT(!dirtyonly);
flags &= ~PGO_CLEANIT;
}
/*
* start the loop to scan pages.
*/
cleanall = true;
freeflag = pagedaemon ? PG_PAGEOUT : PG_RELEASED;
uvm_page_array_init(&a, uobj, dirtyonly ? (UVM_PAGE_ARRAY_FILL_DIRTY |
(!async ? UVM_PAGE_ARRAY_FILL_WRITEBACK : 0)) : 0);
for (;;) {
bool pgprotected;
/*
* if !dirtyonly, iterate over all resident pages in the range.
*
* if dirtyonly, only possibly dirty pages are interesting.
* however, if we are asked to sync for integrity, we should
* wait on pages being written back by other threads as well.
*/
pg = uvm_page_array_fill_and_peek(&a, nextoff, 0);
if (pg == NULL) {
break;
}
KASSERT(pg->uobject == uobj);
KASSERT((pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
(pg->flags & (PG_BUSY)) != 0);
KASSERT(pg->offset >= startoff);
KASSERT(pg->offset >= nextoff);
KASSERT(!dirtyonly ||
uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN ||
uvm_obj_page_writeback_p(pg));
if (pg->offset >= endoff) {
break;
}
/*
* a preempt point.
*/
if (preempt_needed()) {
nextoff = pg->offset; /* visit this page again */
rw_exit(slock);
preempt();
/*
* as we dropped the object lock, our cached pages can
* be stale.
*/
uvm_page_array_clear(&a);
rw_enter(slock, RW_WRITER);
continue;
}
/*
* if the current page is busy, wait for it to become unbusy.
*/
if ((pg->flags & PG_BUSY) != 0) {
UVMHIST_LOG(ubchist, "busy %#jx", (uintptr_t)pg,
0, 0, 0);
if ((pg->flags & (PG_RELEASED|PG_PAGEOUT)) != 0
&& (flags & PGO_BUSYFAIL) != 0) {
UVMHIST_LOG(ubchist, "busyfail %#jx",
(uintptr_t)pg, 0, 0, 0);
error = EDEADLK;
if (busypg != NULL)
*busypg = pg;
break;
}
if (pagedaemon) {
/*
* someone has taken the page while we
* dropped the lock for fstrans_start.
*/
break;
}
/*
* don't bother to wait on other's activities
* unless we are asked to sync for integrity.
*/
if (!async && (flags & PGO_RECLAIM) == 0) {
wasclean = false;
nextoff = pg->offset + PAGE_SIZE;
uvm_page_array_advance(&a);
continue;
}
nextoff = pg->offset; /* visit this page again */
uvm_pagewait(pg, slock, "genput");
/*
* as we dropped the object lock, our cached pages can
* be stale.
*/
uvm_page_array_clear(&a);
rw_enter(slock, RW_WRITER);
continue;
}
nextoff = pg->offset + PAGE_SIZE;
uvm_page_array_advance(&a);
/*
* if we're freeing, remove all mappings of the page now.
* if we're cleaning, check if the page needs to be cleaned.
*/
pgprotected = false;
if (flags & PGO_FREE) {
pmap_page_protect(pg, VM_PROT_NONE);
pgprotected = true;
} else if (flags & PGO_CLEANIT) {
/*
* if we still have some hope to pull this vnode off
* from the syncer queue, write-protect the page.
*/
if (cleanall && wasclean) {
/*
* uobj pages get wired only by uvm_fault
* where uobj is locked.
*/
if (pg->wire_count == 0) {
pmap_page_protect(pg,
VM_PROT_READ|VM_PROT_EXECUTE);
pgprotected = true;
} else {
cleanall = false;
}
}
}
if (flags & PGO_CLEANIT) {
needs_clean = uvm_pagecheckdirty(pg, pgprotected);
} else {
needs_clean = false;
}
/*
* if we're cleaning, build a cluster.
* the cluster will consist of pages which are currently dirty.
* if not cleaning, just operate on the one page.
*/
if (needs_clean) {
wasclean = false;
memset(pgs, 0, sizeof(pgs));
pg->flags |= PG_BUSY;
UVM_PAGE_OWN(pg, "genfs_putpages");
/*
* let the fs constrain the offset range of the cluster.
* we additionally constrain the range here such that
* it fits in the "pgs" pages array.
*/
off_t fslo, fshi, genlo, lo, off = pg->offset;
GOP_PUTRANGE(vp, off, &fslo, &fshi);
KASSERT(fslo == trunc_page(fslo));
KASSERT(fslo <= off);
KASSERT(fshi == trunc_page(fshi));
KASSERT(fshi == 0 || off < fshi);
if (off > MAXPHYS / 2)
genlo = trunc_page(off - (MAXPHYS / 2));
else
genlo = 0;
lo = MAX(fslo, genlo);
/*
* first look backward.
*/
npages = (off - lo) >> PAGE_SHIFT;
nback = npages;
uvn_findpages(uobj, off - PAGE_SIZE, &nback,
&pgs[0], NULL,
UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY|UFP_BACKWARD);
if (nback) {
memmove(&pgs[0], &pgs[npages - nback],
nback * sizeof(pgs[0]));
if (npages - nback < nback)
memset(&pgs[nback], 0,
(npages - nback) * sizeof(pgs[0]));
else
memset(&pgs[npages - nback], 0,
nback * sizeof(pgs[0]));
}
/*
* then plug in our page of interest.
*/
pgs[nback] = pg;
/*
* then look forward to fill in the remaining space in
* the array of pages.
*
* pass our cached array of pages so that hopefully
* uvn_findpages can find some good pages in it.
* the array a was filled above with one of the
* following sets of flags:
* 0
* UVM_PAGE_ARRAY_FILL_DIRTY
* UVM_PAGE_ARRAY_FILL_DIRTY|WRITEBACK
*
* XXX this is fragile but it'll work: the array
* was earlier filled sparsely, but UFP_DIRTYONLY
* implies dense. see corresponding comment in
* uvn_findpages().
*/
npages = MAXPAGES - nback - 1;
if (fshi)
npages = MIN(npages,
(fshi - off - 1) >> PAGE_SHIFT);
uvn_findpages(uobj, off + PAGE_SIZE, &npages,
&pgs[nback + 1], &a,
UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY);
npages += nback + 1;
} else {
pgs[0] = pg;
npages = 1;
nback = 0;
}
/*
* apply FREE or DEACTIVATE options if requested.
*/
for (i = 0; i < npages; i++) {
tpg = pgs[i];
KASSERT(tpg->uobject == uobj);
KASSERT(i == 0 ||
pgs[i-1]->offset + PAGE_SIZE == tpg->offset);
KASSERT(!needs_clean || uvm_pagegetdirty(pgs[i]) !=
UVM_PAGE_STATUS_DIRTY);
if (needs_clean) {
/*
* mark pages as WRITEBACK so that concurrent
* fsync can find and wait for our activities.
*/
uvm_obj_page_set_writeback(pgs[i]);
}
if (tpg->offset < startoff || tpg->offset >= endoff)
continue;
if (flags & PGO_DEACTIVATE && tpg->wire_count == 0) {
uvm_pagelock(tpg);
uvm_pagedeactivate(tpg);
uvm_pageunlock(tpg);
} else if (flags & PGO_FREE) {
pmap_page_protect(tpg, VM_PROT_NONE);
if (tpg->flags & PG_BUSY) {
tpg->flags |= freeflag;
if (pagedaemon) {
uvm_pageout_start(1);
uvm_pagelock(tpg);
uvm_pagedequeue(tpg);
uvm_pageunlock(tpg);
}
} else {
/*
* ``page is not busy''
* implies that npages is 1
* and needs_clean is false.
*/
KASSERT(npages == 1);
KASSERT(!needs_clean);
KASSERT(pg == tpg);
KASSERT(nextoff ==
tpg->offset + PAGE_SIZE);
uvm_pagefree(tpg);
if (pagedaemon)
uvmexp.pdfreed++;
}
}
}
if (needs_clean) {
modified = true;
KASSERT(nextoff == pg->offset + PAGE_SIZE);
KASSERT(nback < npages);
nextoff = pg->offset + ((npages - nback) << PAGE_SHIFT);
KASSERT(pgs[nback] == pg);
KASSERT(nextoff == pgs[npages - 1]->offset + PAGE_SIZE);
/*
* start the i/o.
*/
rw_exit(slock);
error = GOP_WRITE(vp, pgs, npages, flags);
/*
* as we dropped the object lock, our cached pages can
* be stale.
*/
uvm_page_array_clear(&a);
rw_enter(slock, RW_WRITER);
if (error) {
break;
}
}
}
uvm_page_array_fini(&a);
/*
* update ctime/mtime if the modification we started writing out might
* be from mmap'ed write.
*
* this is necessary when an application keeps a file mmaped and
* repeatedly modifies it via the window. note that, because we
* don't always write-protect pages when cleaning, such modifications
* might not involve any page faults.
*/
mutex_enter(vp->v_interlock);
if (modified && (vp->v_iflag & VI_WRMAP) != 0 && (vp->v_type != VBLK ||
(vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) {
GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
}
/*
* if we no longer have any possibly dirty pages, take us off the
* syncer list.
*/
if ((vp->v_iflag & VI_ONWORKLST) != 0 && uvm_obj_clean_p(uobj) &&
LIST_EMPTY(&vp->v_dirtyblkhd)) {
vn_syncer_remove_from_worklist(vp);
}
/* Wait for output to complete. */
rw_exit(slock);
if (!wasclean && !async && vp->v_numoutput != 0) {
while (vp->v_numoutput != 0)
cv_wait(&vp->v_cv, vp->v_interlock);
}
onworklst = (vp->v_iflag & VI_ONWORKLST) != 0;
mutex_exit(vp->v_interlock);
if ((flags & PGO_RECLAIM) != 0 && onworklst) {
/*
* in the case of PGO_RECLAIM, make sure the vnode ends up clean.
* retrying is not a big deal because, in many cases,
* uobj->uo_npages is already 0 here.
*/
rw_enter(slock, RW_WRITER);
goto retry;
}
if (trans_mp) {
if (holds_wapbl)
WAPBL_END(trans_mp);
fstrans_done(trans_mp);
}
return (error);
}
/*
* Default putrange method for file systems that do not care
* how many pages are given to one GOP_WRITE() call.
*/
void
genfs_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
{
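/*
 * Leaving both bounds at zero imposes no file-system-specific limit:
 * genfs_do_putpages() treats a zero upper bound as unbounded, so only
 * its own MAXPHYS-sized clustering constrains a single GOP_WRITE().
 */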
*lop = 0;
*hip = 0;
}
int
genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
{
off_t off;
vaddr_t kva;
size_t len;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
(uintptr_t)vp, (uintptr_t)pgs, npages, flags);
off = pgs[0]->offset;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
len = npages << PAGE_SHIFT;
error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE,
uvm_aio_aiodone);
return error;
}
/*
* genfs_gop_write_rwmap:
*
* a variant of genfs_gop_write. it's used by UDF for its directory buffers.
* this maps pages with PROT_WRITE so that VOP_STRATEGY can modify
* the contents before writing them out to the underlying storage.
*/
int
genfs_gop_write_rwmap(struct vnode *vp, struct vm_page **pgs, int npages,
int flags)
{
off_t off;
vaddr_t kva;
size_t len;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
(uintptr_t)vp, (uintptr_t)pgs, npages, flags);
off = pgs[0]->offset;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK);
len = npages << PAGE_SHIFT;
error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE,
uvm_aio_aiodone);
return error;
}
/*
* Backend routine for doing I/O to vnode pages. Pages are already locked
* and mapped into kernel memory. Here we just look up the underlying
* device block addresses and call the strategy routine.
*/
static int
genfs_do_io(struct vnode *vp, off_t off, vaddr_t kva, size_t len, int flags,
enum uio_rw rw, void (*iodone)(struct buf *))
{
int s, error;
int fs_bshift, dev_bshift;
off_t eof, offset, startoffset;
size_t bytes, iobytes, skipbytes;
struct buf *mbp, *bp;
const bool async = (flags & PGO_SYNCIO) == 0;
const bool lazy = (flags & PGO_LAZY) == 0;
const bool iowrite = rw == UIO_WRITE;
const int brw = iowrite ? B_WRITE : B_READ;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx kva %#jx len 0x%jx flags 0x%jx",
(uintptr_t)vp, (uintptr_t)kva, len, flags);
KASSERT(vp->v_size != VSIZENOTSET);
KASSERT(vp->v_writesize != VSIZENOTSET);
KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p"
" v_size=0x%llx v_writesize=0x%llx", vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize);
GOP_SIZE(vp, vp->v_writesize, &eof, 0);
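/*
 * Pick the block-size shifts: mounted file systems supply their own
 * fs and device block sizes, while raw block devices use DEV_BSHIFT.
 */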
if (vp->v_type != VBLK) {
fs_bshift = vp->v_mount->mnt_fs_bshift;
dev_bshift = vp->v_mount->mnt_dev_bshift;
} else {
fs_bshift = DEV_BSHIFT;
dev_bshift = DEV_BSHIFT;
}
error = 0;
startoffset = off;
bytes = MIN(len, eof - startoffset);
skipbytes = 0;
KASSERT(bytes != 0);
if (iowrite) {
/*
* why += 2?
* 1 for biodone, 1 for uvm_aio_aiodone.
*/
mutex_enter(vp->v_interlock);
vp->v_numoutput += 2;
mutex_exit(vp->v_interlock);
}
mbp = getiobuf(vp, true);
UVMHIST_LOG(ubchist, "vp %#jx mbp %#jx num now %jd bytes 0x%jx",
(uintptr_t)vp, (uintptr_t)mbp, vp->v_numoutput, bytes);
mbp->b_bufsize = len;
mbp->b_data = (void *)kva;
mbp->b_resid = mbp->b_bcount = bytes;
mbp->b_cflags |= BC_BUSY | BC_AGE;
if (async) {
mbp->b_flags = brw | B_ASYNC;
mbp->b_iodone = iodone;
} else {
mbp->b_flags = brw;
mbp->b_iodone = NULL;
}
if (curlwp == uvm.pagedaemon_lwp)
BIO_SETPRIO(mbp, BPRIO_TIMELIMITED);
else if (async || lazy)
BIO_SETPRIO(mbp, BPRIO_TIMENONCRITICAL);
else
BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL);
bp = NULL;
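/*
 * Walk the range, issuing one (possibly nested) buffer per chunk that
 * VOP_BMAP reports as contiguous on the underlying device.  Holes are
 * skipped and, on reads, zero-filled.
 */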
for (offset = startoffset;
bytes > 0;
offset += iobytes, bytes -= iobytes) {
int run;
daddr_t lbn, blkno;
struct vnode *devvp;
/*
* bmap the file to find out the blkno to read from and
* how much we can read in one i/o. if bmap returns an error,
* skip the rest of the top-level i/o.
*/
lbn = offset >> fs_bshift;
error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
if (error) {
UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%jx -> %jd",
lbn, error, 0, 0);
skipbytes += bytes;
bytes = 0;
goto loopdone;
}
/*
* see how many pages can be read with this i/o.
* reduce the i/o size if necessary to avoid
* overwriting pages with valid data.
*/
iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
bytes);
/*
* if this block isn't allocated, zero it instead of
* reading it. unless we are going to allocate blocks,
* mark the pages we zeroed PG_RDONLY.
*/
if (blkno == (daddr_t)-1) {
if (!iowrite) {
memset((char *)kva + (offset - startoffset), 0,
iobytes);
}
skipbytes += iobytes;
continue;
}
/*
* allocate a sub-buf for this piece of the i/o
* (or just use mbp if there's only 1 piece),
* and start it going.
*/
if (offset == startoffset && iobytes == bytes) {
bp = mbp;
} else {
UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
(uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
bp = getiobuf(vp, true);
nestiobuf_setup(mbp, bp, offset - startoffset, iobytes);
}
bp->b_lblkno = 0;
/* adjust physical blkno for partial blocks */
bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
dev_bshift);
UVMHIST_LOG(ubchist,
"bp %#jx offset 0x%jx bcount 0x%jx blkno 0x%jx",
(uintptr_t)bp, offset, bp->b_bcount, bp->b_blkno);
VOP_STRATEGY(devvp, bp);
}
loopdone:
if (skipbytes) {
UVMHIST_LOG(ubchist, "skipbytes %jd", skipbytes, 0,0,0);
}
nestiobuf_done(mbp, skipbytes, error);
if (async) {
UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0);
return (0);
}
UVMHIST_LOG(ubchist, "waiting for mbp %#jx", (uintptr_t)mbp, 0, 0, 0);
error = biowait(mbp);
s = splbio();
(*iodone)(mbp);
splx(s);
UVMHIST_LOG(ubchist, "returning, error %jd", error, 0, 0, 0);
return (error);
}
int
genfs_compat_getpages(void *v)
{
struct vop_getpages_args /* {
struct vnode *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ *ap = v;
off_t origoffset;
struct vnode *vp = ap->a_vp;
struct uvm_object *uobj = &vp->v_uobj;
struct vm_page *pg, **pgs;
vaddr_t kva;
int i, error, orignpages, npages;
struct iovec iov;
struct uio uio;
kauth_cred_t cred = curlwp->l_cred;
const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
error = 0;
origoffset = ap->a_offset;
orignpages = *ap->a_count;
pgs = ap->a_m;
if (ap->a_flags & PGO_LOCKED) {
uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, NULL,
UFP_NOWAIT|UFP_NOALLOC| (memwrite ? UFP_NORDONLY : 0));
error = ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
return error;
}
if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= vp->v_size) {
rw_exit(uobj->vmobjlock);
return EINVAL;
}
if ((ap->a_flags & PGO_SYNCIO) == 0) {
rw_exit(uobj->vmobjlock);
return 0;
}
npages = orignpages;
uvn_findpages(uobj, origoffset, &npages, pgs, NULL, UFP_ALL);
rw_exit(uobj->vmobjlock);
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK);
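/*
 * Fill in each page that does not yet hold valid data (PG_FAKE) by
 * reading it through the pager mapping with VOP_READ.
 */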
for (i = 0; i < npages; i++) {
pg = pgs[i];
if ((pg->flags & PG_FAKE) == 0) {
continue;
}
iov.iov_base = (char *)kva + (i << PAGE_SHIFT);
iov.iov_len = PAGE_SIZE;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = origoffset + (i << PAGE_SHIFT);
uio.uio_rw = UIO_READ;
uio.uio_resid = PAGE_SIZE;
UIO_SETUP_SYSSPACE(&uio);
/* XXX vn_lock */
error = VOP_READ(vp, &uio, 0, cred);
if (error) {
break;
}
if (uio.uio_resid) {
memset(iov.iov_base, 0, uio.uio_resid);
}
}
uvm_pagermapout(kva, npages);
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < npages; i++) {
pg = pgs[i];
if (error && (pg->flags & PG_FAKE) != 0) {
pg->flags |= PG_RELEASED;
} else {
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pageunlock(pg);
}
}
if (error) {
uvm_page_unbusy(pgs, npages);
}
rw_exit(uobj->vmobjlock);
return error;
}
int
genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
int flags)
{
off_t offset;
struct iovec iov;
struct uio uio;
kauth_cred_t cred = curlwp->l_cred;
struct buf *bp;
vaddr_t kva;
int error;
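/*
 * Write the pages via VOP_WRITE through a temporary pager mapping,
 * then fake up a buf and hand it to uvm_aio_aiodone() so the pages
 * are cleaned up through the normal pager completion path.
 */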
offset = pgs[0]->offset;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
iov.iov_base = (void *)kva;
iov.iov_len = npages << PAGE_SHIFT;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = offset;
uio.uio_rw = UIO_WRITE;
uio.uio_resid = npages << PAGE_SHIFT;
UIO_SETUP_SYSSPACE(&uio);
/* XXX vn_lock */
error = VOP_WRITE(vp, &uio, 0, cred);
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
bp = getiobuf(vp, true);
bp->b_cflags |= BC_BUSY | BC_AGE;
bp->b_lblkno = offset >> vp->v_mount->mnt_fs_bshift;
bp->b_data = (char *)kva;
bp->b_bcount = npages << PAGE_SHIFT;
bp->b_bufsize = npages << PAGE_SHIFT;
bp->b_resid = 0;
bp->b_error = error;
uvm_aio_aiodone(bp);
return (error);
}
/*
* Process a uio using direct I/O. If we reach a part of the request
* which cannot be processed in this fashion for some reason, just return.
* The caller must handle some additional part of the request using
* buffered I/O before trying direct I/O again.
*/
void
genfs_directio(struct vnode *vp, struct uio *uio, int ioflag)
{
struct vmspace *vs;
struct iovec *iov;
vaddr_t va;
size_t len;
const int mask = DEV_BSIZE - 1;
int error;
bool need_wapbl = (vp->v_mount && vp->v_mount->mnt_wapbl &&
(ioflag & IO_JOURNALLOCKED) == 0);
#ifdef DIAGNOSTIC
if ((ioflag & IO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
WAPBL_JLOCK_ASSERT(vp->v_mount);
#endif
/*
* We only support direct I/O to user space for now.
*/
if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) {
return;
}
/*
* If the vnode is mapped, we would need to get the getpages lock
* to stabilize the bmap, but then we would get into trouble while
* locking the pages if the pages belong to this same vnode (or a
* multi-vnode cascade to the same effect). Just fall back to
* buffered I/O if the vnode is mapped to avoid this mess.
*/
if (vp->v_vflag & VV_MAPPED) {
return;
}
if (need_wapbl) {
error = WAPBL_BEGIN(vp->v_mount);
if (error)
return;
}
/*
* Do as much of the uio as possible with direct I/O.
*/
vs = uio->uio_vmspace;
while (uio->uio_resid) {
iov = uio->uio_iov;
if (iov->iov_len == 0) {
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
va = (vaddr_t)iov->iov_base;
len = MIN(iov->iov_len, genfs_maxdio);
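/* Round the chunk down to a multiple of DEV_BSIZE. */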
len &= ~mask;
/*
* If the next chunk is smaller than DEV_BSIZE or extends past
* the current EOF, then fall back to buffered I/O.
*/
if (len == 0 || uio->uio_offset + len > vp->v_size) {
break;
}
/*
* Check alignment. The file offset must be at least
* sector-aligned. The exact constraint on memory alignment
* is very hardware-dependent, but requiring sector-aligned
* addresses there too is safe.
*/
if (uio->uio_offset & mask || va & mask) {
break;
}
error = genfs_do_directio(vs, va, len, vp, uio->uio_offset,
uio->uio_rw);
if (error) {
break;
}
iov->iov_base = (char *)iov->iov_base + len;
iov->iov_len -= len;
uio->uio_offset += len;
uio->uio_resid -= len;
}
if (need_wapbl)
WAPBL_END(vp->v_mount);
}
/*
* Iodone routine for direct I/O. We don't do much here since the request is
* always synchronous, so the caller will do most of the work after biowait().
*/
static void
genfs_dio_iodone(struct buf *bp)
{
KASSERT((bp->b_flags & B_ASYNC) == 0);
if ((bp->b_flags & B_READ) == 0 && (bp->b_cflags & BC_AGE) != 0) {
mutex_enter(bp->b_objlock);
vwakeup(bp);
mutex_exit(bp->b_objlock);
}
putiobuf(bp);
}
/*
* Process one chunk of a direct I/O request.
*/
static int
genfs_do_directio(struct vmspace *vs, vaddr_t uva, size_t len, struct vnode *vp,
off_t off, enum uio_rw rw)
{
struct vm_map *map;
struct pmap *upm, *kpm __unused;
size_t klen = round_page(uva + len) - trunc_page(uva);
off_t spoff, epoff;
vaddr_t kva, puva;
paddr_t pa;
vm_prot_t prot;
int error, rv __diagused, poff, koff;
const int pgoflags = PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED |
(rw == UIO_WRITE ? PGO_FREE : 0);
/*
* For writes, verify that this range of the file already has fully
* allocated backing store. If there are any holes, just punt and
* make the caller take the buffered write path.
*/
if (rw == UIO_WRITE) {
daddr_t lbn, elbn, blkno;
int bsize, bshift, run;
bshift = vp->v_mount->mnt_fs_bshift;
bsize = 1 << bshift;
lbn = off >> bshift;
elbn = (off + len + bsize - 1) >> bshift;
while (lbn < elbn) {
error = VOP_BMAP(vp, lbn, NULL, &blkno, &run);
if (error) {
return error;
}
if (blkno == (daddr_t)-1) {
return ENOSPC;
}
lbn += 1 + run;
}
}
/*
* Flush any cached pages for parts of the file that we're about to
* access. If we're writing, invalidate pages as well.
*/
spoff = trunc_page(off);
epoff = round_page(off + len);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, spoff, epoff, pgoflags);
if (error) {
return error;
}
/*
* Wire the user pages and remap them into kernel memory.
*/
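/*
 * A file read stores into the user buffer, so its pages must be
 * mapped writable; a file write only reads from them.
 */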
prot = rw == UIO_READ ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ;
error = uvm_vslock(vs, (void *)uva, len, prot);
if (error) {
return error;
}
map = &vs->vm_map;
upm = vm_map_pmap(map);
kpm = vm_map_pmap(kernel_map);
puva = trunc_page(uva);
kva = uvm_km_alloc(kernel_map, klen, atop(puva) & uvmexp.colormask,
UVM_KMF_VAONLY | UVM_KMF_WAITVA | UVM_KMF_COLORMATCH);
for (poff = 0; poff < klen; poff += PAGE_SIZE) {
rv = pmap_extract(upm, puva + poff, &pa);
KASSERT(rv);
pmap_kenter_pa(kva + poff, pa, prot, PMAP_WIRED);
}
pmap_update(kpm);
/*
* Do the I/O.
*/
koff = uva - trunc_page(uva);
error = genfs_do_io(vp, off, kva + koff, len, PGO_SYNCIO, rw,
genfs_dio_iodone);
/*
* Tear down the kernel mapping.
*/
pmap_kremove(kva, klen);
pmap_update(kpm);
uvm_km_free(kernel_map, kva, klen, UVM_KMF_VAONLY);
/*
* Unwire the user pages.
*/
uvm_vsunlock(vs, (void *)uva, len);
return error;
}
/* $NetBSD: tcp_subr.c,v 1.296 2022/11/04 09:01:53 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1997, 1998, 2000, 2001, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.296 2022/11/04 09:01:53 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_inet_csum.h"
#include "opt_mbuftrace.h"
#endif
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/once.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/md5.h>
#include <sys/cprng.h>
#include <net/route.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6protosw.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_vtw.h>
#include <netinet/tcp_private.h>
#include <netinet/tcp_congctl.h>
#include <netinet/tcp_syncache.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/key.h>
#endif
struct inpcbtable tcbtable; /* head of queue of active tcpcb's */
u_int32_t tcp_now; /* slow ticks, for RFC 1323 timestamps */
percpu_t *tcpstat_percpu;
/* patchable/settable parameters for tcp */
int tcp_mssdflt = TCP_MSS;
int tcp_minmss = TCP_MINMSS;
int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
int tcp_do_rfc1323 = 1; /* window scaling / timestamps (obsolete) */
int tcp_do_rfc1948 = 0; /* ISS by cryptographic hash */
int tcp_do_sack = 1; /* selective acknowledgement */
int tcp_do_win_scale = 1; /* RFC1323 window scaling */
int tcp_do_timestamps = 1; /* RFC1323 timestamps */
int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */
int tcp_do_ecn = 0; /* Explicit Congestion Notification */
#ifndef TCP_INIT_WIN
#define TCP_INIT_WIN 4 /* initial slow start window */
#endif
#ifndef TCP_INIT_WIN_LOCAL
#define TCP_INIT_WIN_LOCAL 4 /* initial slow start window for local nets */
#endif
/*
* Up to 5 we scale linearly, to reach 3 * 1460; then (iw) * 1460.
* This is to simulate current behavior for iw == 4
*/
int tcp_init_win_max[] = {
1 * 1460,
1 * 1460,
2 * 1460,
2 * 1460,
3 * 1460,
5 * 1460,
6 * 1460,
7 * 1460,
8 * 1460,
9 * 1460,
10 * 1460
};
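/*
 * For example, with the default TCP_INIT_WIN of 4, tcp_init_win_max[4]
 * caps the initial window at 3 * 1460 bytes, i.e. roughly three
 * full-sized segments.
 */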
int tcp_init_win = TCP_INIT_WIN;
int tcp_init_win_local = TCP_INIT_WIN_LOCAL;
int tcp_mss_ifmtu = 0;
int tcp_rst_ppslim = 100; /* 100pps */
int tcp_ackdrop_ppslim = 100; /* 100pps */
int tcp_do_loopback_cksum = 0;
int tcp_do_abc = 1; /* RFC3465 Appropriate byte counting. */
int tcp_abc_aggressive = 1; /* 1: L=2*SMSS 0: L=1*SMSS */
int tcp_sack_tp_maxholes = 32;
int tcp_sack_globalmaxholes = 1024;
int tcp_sack_globalholes = 0;
int tcp_ecn_maxretries = 1;
int tcp_msl_enable = 1; /* enable TIME_WAIT truncation */
int tcp_msl_loop = PR_SLOWHZ; /* MSL for loopback */
int tcp_msl_local = 5 * PR_SLOWHZ; /* MSL for 'local' */
int tcp_msl_remote = TCPTV_MSL; /* MSL otherwise */
int tcp_msl_remote_threshold = TCPTV_SRTTDFLT; /* RTT threshold */
int tcp_rttlocal = 0; /* Use RTT to decide who's 'local' */
int tcp4_vtw_enable = 0; /* 1 to enable */
int tcp6_vtw_enable = 0; /* 1 to enable */
int tcp_vtw_was_enabled = 0;
int tcp_vtw_entries = 1 << 4; /* 16 vestigial TIME_WAIT entries */
/* tcb hash */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE 128
#endif
int tcbhashsize = TCBHASHSIZE;
int tcp_freeq(struct tcpcb *);
static int tcp_iss_secret_init(void);
static void tcp_mtudisc_callback(struct in_addr);
#ifdef INET6
static void tcp6_mtudisc(struct inpcb *, int);
#endif
static struct pool tcpcb_pool;
static int tcp_drainwanted;
#ifdef TCP_CSUM_COUNTERS
#include <sys/device.h>
struct evcnt tcp_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "hwcsum bad");
struct evcnt tcp_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "hwcsum ok");
struct evcnt tcp_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "hwcsum data");
struct evcnt tcp_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "swcsum");
EVCNT_ATTACH_STATIC(tcp_hwcsum_bad);
EVCNT_ATTACH_STATIC(tcp_hwcsum_ok);
EVCNT_ATTACH_STATIC(tcp_hwcsum_data);
EVCNT_ATTACH_STATIC(tcp_swcsum);
#if defined(INET6)
struct evcnt tcp6_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp6", "hwcsum bad");
struct evcnt tcp6_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp6", "hwcsum ok");
struct evcnt tcp6_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp6", "hwcsum data");
struct evcnt tcp6_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp6", "swcsum");
EVCNT_ATTACH_STATIC(tcp6_hwcsum_bad);
EVCNT_ATTACH_STATIC(tcp6_hwcsum_ok);
EVCNT_ATTACH_STATIC(tcp6_hwcsum_data);
EVCNT_ATTACH_STATIC(tcp6_swcsum);
#endif /* defined(INET6) */
#endif /* TCP_CSUM_COUNTERS */
#ifdef TCP_OUTPUT_COUNTERS
#include <sys/device.h>
struct evcnt tcp_output_bigheader = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output big header");
struct evcnt tcp_output_predict_hit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output predict hit");
struct evcnt tcp_output_predict_miss = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output predict miss");
struct evcnt tcp_output_copysmall = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output copy small");
struct evcnt tcp_output_copybig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output copy big");
struct evcnt tcp_output_refbig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output reference big");
EVCNT_ATTACH_STATIC(tcp_output_bigheader);
EVCNT_ATTACH_STATIC(tcp_output_predict_hit);
EVCNT_ATTACH_STATIC(tcp_output_predict_miss);
EVCNT_ATTACH_STATIC(tcp_output_copysmall);
EVCNT_ATTACH_STATIC(tcp_output_copybig);
EVCNT_ATTACH_STATIC(tcp_output_refbig);
#endif /* TCP_OUTPUT_COUNTERS */
#ifdef TCP_REASS_COUNTERS
#include <sys/device.h>
struct evcnt tcp_reass_ = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp_reass", "calls");
struct evcnt tcp_reass_empty = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "insert into empty queue");
struct evcnt tcp_reass_iteration[8] = {
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", ">7 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "1 iteration"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "2 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "3 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "4 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "5 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "6 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "7 iterations"),
};
struct evcnt tcp_reass_prependfirst = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "prepend to first");
struct evcnt tcp_reass_prepend = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "prepend");
struct evcnt tcp_reass_insert = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "insert");
struct evcnt tcp_reass_inserttail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "insert at tail");
struct evcnt tcp_reass_append = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "append");
struct evcnt tcp_reass_appendtail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "append to tail fragment");
struct evcnt tcp_reass_overlaptail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "overlap at end");
struct evcnt tcp_reass_overlapfront = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "overlap at start");
struct evcnt tcp_reass_segdup = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "duplicate segment");
struct evcnt tcp_reass_fragdup = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "duplicate fragment");
EVCNT_ATTACH_STATIC(tcp_reass_);
EVCNT_ATTACH_STATIC(tcp_reass_empty);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 0);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 1);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 2);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 3);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 4);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 5);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 6);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 7);
EVCNT_ATTACH_STATIC(tcp_reass_prependfirst);
EVCNT_ATTACH_STATIC(tcp_reass_prepend);
EVCNT_ATTACH_STATIC(tcp_reass_insert);
EVCNT_ATTACH_STATIC(tcp_reass_inserttail);
EVCNT_ATTACH_STATIC(tcp_reass_append);
EVCNT_ATTACH_STATIC(tcp_reass_appendtail);
EVCNT_ATTACH_STATIC(tcp_reass_overlaptail);
EVCNT_ATTACH_STATIC(tcp_reass_overlapfront);
EVCNT_ATTACH_STATIC(tcp_reass_segdup);
EVCNT_ATTACH_STATIC(tcp_reass_fragdup);
#endif /* TCP_REASS_COUNTERS */
#ifdef MBUFTRACE
struct mowner tcp_mowner = MOWNER_INIT("tcp", "");
struct mowner tcp_rx_mowner = MOWNER_INIT("tcp", "rx");
struct mowner tcp_tx_mowner = MOWNER_INIT("tcp", "tx");
struct mowner tcp_sock_mowner = MOWNER_INIT("tcp", "sock");
struct mowner tcp_sock_rx_mowner = MOWNER_INIT("tcp", "sock rx");
struct mowner tcp_sock_tx_mowner = MOWNER_INIT("tcp", "sock tx");
#endif
static int
do_tcpinit(void)
{
inpcb_init(&tcbtable, tcbhashsize, tcbhashsize);
pool_init(&tcpcb_pool, sizeof(struct tcpcb), 0, 0, 0, "tcpcbpl",
NULL, IPL_SOFTNET);
tcp_usrreq_init();
/* Initialize timer state. */
tcp_timer_init();
/* Initialize the compressed state engine. */
syn_cache_init();
/* Initialize the congestion control algorithms. */
tcp_congctl_init();
/* Initialize the TCPCB template. */
tcp_tcpcb_template();
/* Initialize reassembly queue */
tcpipqent_init();
/* SACK */
tcp_sack_init();
MOWNER_ATTACH(&tcp_tx_mowner);
MOWNER_ATTACH(&tcp_rx_mowner);
MOWNER_ATTACH(&tcp_reass_mowner);
MOWNER_ATTACH(&tcp_sock_mowner);
MOWNER_ATTACH(&tcp_sock_tx_mowner);
MOWNER_ATTACH(&tcp_sock_rx_mowner);
MOWNER_ATTACH(&tcp_mowner);
tcpstat_percpu = percpu_alloc(sizeof(uint64_t) * TCP_NSTATS);
vtw_earlyinit();
tcp_slowtimo_init();
return 0;
}
void
tcp_init_common(unsigned basehlen)
{
static ONCE_DECL(dotcpinit);
unsigned hlen = basehlen + sizeof(struct tcphdr);
unsigned oldhlen;
if (max_linkhdr + hlen > MHLEN)
panic("tcp_init");
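/*
 * Raise max_protohdr to cover our header if it is currently smaller;
 * the CAS loop avoids losing a concurrent update.
 */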
while ((oldhlen = max_protohdr) < hlen)
atomic_cas_uint(&max_protohdr, oldhlen, hlen);
RUN_ONCE(&dotcpinit, do_tcpinit);
}
/*
* Tcp initialization
*/
void
tcp_init(void)
{
icmp_mtudisc_callback_register(tcp_mtudisc_callback);
tcp_init_common(sizeof(struct ip));
}
/*
* Create template to be used to send tcp packets on a connection.
* Call after host entry created, allocates an mbuf and fills
* in a skeletal tcp/ip header, minimizing the amount of work
* necessary when the connection is used.
*/
struct mbuf *
tcp_template(struct tcpcb *tp)
{
struct inpcb *inp = tp->t_inpcb;
struct tcphdr *n;
struct mbuf *m;
int hlen;
switch (tp->t_family) {
case AF_INET:
hlen = sizeof(struct ip);
if (inp->inp_af == AF_INET)
break;
#ifdef INET6
if (inp->inp_af == AF_INET6) {
/* mapped addr case */
if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) &&
IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp)))
break;
}
#endif
return NULL; /*EINVAL*/
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
if (inp != NULL) {
/* more sanity checks? */
break;
}
return NULL; /*EINVAL*/
#endif
default:
return NULL; /*EAFNOSUPPORT*/
}
KASSERT(hlen + sizeof(struct tcphdr) <= MCLBYTES);
m = tp->t_template;
if (m && m->m_len == hlen + sizeof(struct tcphdr)) {
;
} else {
if (m)
m_freem(m);
m = tp->t_template = NULL;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m && hlen + sizeof(struct tcphdr) > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
}
}
if (m == NULL)
return NULL;
MCLAIM(m, &tcp_mowner);
m->m_pkthdr.len = m->m_len = hlen + sizeof(struct tcphdr);
}
memset(mtod(m, void *), 0, m->m_len);
n = (struct tcphdr *)(mtod(m, char *) + hlen);
switch (tp->t_family) {
case AF_INET:
{
struct ipovly *ipov;
mtod(m, struct ip *)->ip_v = 4;
mtod(m, struct ip *)->ip_hl = hlen >> 2;
ipov = mtod(m, struct ipovly *);
ipov->ih_pr = IPPROTO_TCP;
ipov->ih_len = htons(sizeof(struct tcphdr));
if (inp->inp_af == AF_INET) {
ipov->ih_src = in4p_laddr(inp);
ipov->ih_dst = in4p_faddr(inp);
}
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
/* mapped addr case */
bcopy(&in6p_laddr(inp).s6_addr32[3], &ipov->ih_src,
sizeof(ipov->ih_src));
bcopy(&in6p_faddr(inp).s6_addr32[3], &ipov->ih_dst,
sizeof(ipov->ih_dst));
}
#endif
/*
* Compute the pseudo-header portion of the checksum
* now. We incrementally add in the TCP option and
* payload lengths later, and then compute the TCP
* checksum right before the packet is sent off onto
* the wire.
*/
n->th_sum = in_cksum_phdr(ipov->ih_src.s_addr,
ipov->ih_dst.s_addr,
htons(sizeof(struct tcphdr) + IPPROTO_TCP));
break;
}
#ifdef INET6
case AF_INET6:
{
struct ip6_hdr *ip6;
mtod(m, struct ip *)->ip_v = 6;
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_nxt = IPPROTO_TCP;
ip6->ip6_plen = htons(sizeof(struct tcphdr));
ip6->ip6_src = in6p_laddr(inp);
ip6->ip6_dst = in6p_faddr(inp);
ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK;
if (ip6_auto_flowlabel) {
ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
ip6->ip6_flow |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
}
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/*
* Compute the pseudo-header portion of the checksum
* now. We incrementally add in the TCP option and
* payload lengths later, and then compute the TCP
* checksum right before the packet is sent off onto
* the wire.
*/
n->th_sum = in6_cksum_phdr(&in6p_laddr(inp),
&in6p_faddr(inp), htonl(sizeof(struct tcphdr)),
htonl(IPPROTO_TCP));
break;
}
#endif
}
n->th_sport = inp->inp_lport;
n->th_dport = inp->inp_fport;
n->th_seq = 0;
n->th_ack = 0;
n->th_x2 = 0;
n->th_off = 5;
n->th_flags = 0;
n->th_win = 0;
n->th_urp = 0;
return m;
}
/*
* Send a single message to the TCP at address specified by
* the given TCP/IP header. If m == 0, then we make a copy
* of the tcpiphdr at ti and send directly to the addressed host.
* This is used to force keep alive messages out using the TCP
* template for a connection tp->t_template. If flags are given
* then we send a message back to the TCP which originated the
* segment ti, and discard the mbuf containing it and any other
* attached mbufs.
*
* In any case the ack and sequence number of the transmitted
* segment are as specified by the parameters.
*/
int
tcp_respond(struct tcpcb *tp, struct mbuf *mtemplate, struct mbuf *m,
struct tcphdr *th0, tcp_seq ack, tcp_seq seq, int flags)
{
struct route *ro;
int error, tlen, win = 0;
int hlen;
struct ip *ip;
#ifdef INET6
struct ip6_hdr *ip6;
#endif
int family; /* family on packet, not inpcb! */
struct tcphdr *th;
if (tp != NULL && (flags & TH_RST) == 0) {
KASSERT(tp->t_inpcb != NULL);
win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
}
th = NULL; /* Quell uninitialized warning */
ip = NULL;
#ifdef INET6
ip6 = NULL;
#endif
if (m == NULL) {
if (!mtemplate)
return EINVAL;
/* get family information from template */
switch (mtod(mtemplate, struct ip *)->ip_v) {
case 4:
family = AF_INET;
hlen = sizeof(struct ip);
break;
#ifdef INET6
case 6:
family = AF_INET6;
hlen = sizeof(struct ip6_hdr);
break;
#endif
default:
return EAFNOSUPPORT;
}
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m) {
MCLAIM(m, &tcp_tx_mowner);
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
}
}
if (m == NULL)
return ENOBUFS;
tlen = 0;
m->m_data += max_linkhdr;
bcopy(mtod(mtemplate, void *), mtod(m, void *),
mtemplate->m_len);
switch (family) {
case AF_INET:
ip = mtod(m, struct ip *);
th = (struct tcphdr *)(ip + 1);
break;
#ifdef INET6
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
break;
#endif
}
flags = TH_ACK;
} else {
if ((m->m_flags & M_PKTHDR) == 0) {
m_freem(m);
return EINVAL;
}
KASSERT(th0 != NULL);
/* get family information from m */
switch (mtod(m, struct ip *)->ip_v) {
case 4:
family = AF_INET;
hlen = sizeof(struct ip);
ip = mtod(m, struct ip *);
break;
#ifdef INET6
case 6:
family = AF_INET6;
hlen = sizeof(struct ip6_hdr);
ip6 = mtod(m, struct ip6_hdr *);
break;
#endif
default:
m_freem(m);
return EAFNOSUPPORT;
}
/* clear h/w csum flags inherited from rx packet */
m->m_pkthdr.csum_flags = 0;
if ((flags & TH_SYN) == 0 || sizeof(*th0) > (th0->th_off << 2))
tlen = sizeof(*th0);
else
tlen = th0->th_off << 2;
if (m->m_len > hlen + tlen && (m->m_flags & M_EXT) == 0 &&
mtod(m, char *) + hlen == (char *)th0) {
m->m_len = hlen + tlen;
m_freem(m->m_next);
m->m_next = NULL;
} else {
struct mbuf *n;
KASSERT(max_linkhdr + hlen + tlen <= MCLBYTES);
MGETHDR(n, M_DONTWAIT, MT_HEADER);
if (n && max_linkhdr + hlen + tlen > MHLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_freem(n);
n = NULL;
}
}
if (!n) {
m_freem(m);
return ENOBUFS;
}
MCLAIM(n, &tcp_tx_mowner);
n->m_data += max_linkhdr;
n->m_len = hlen + tlen;
m_copyback(n, 0, hlen, mtod(m, void *));
m_copyback(n, hlen, tlen, (void *)th0);
m_freem(m);
m = n;
n = NULL;
}
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
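/*
 * Swap the source and destination addresses and ports so the reply is
 * directed back at the originator of the segment we are answering.
 */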
switch (family) {
case AF_INET:
ip = mtod(m, struct ip *);
th = (struct tcphdr *)(ip + 1);
ip->ip_p = IPPROTO_TCP;
xchg(ip->ip_dst, ip->ip_src, struct in_addr);
ip->ip_p = IPPROTO_TCP;
break;
#ifdef INET6
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
ip6->ip6_nxt = IPPROTO_TCP;
xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
ip6->ip6_nxt = IPPROTO_TCP;
break;
#endif
}
xchg(th->th_dport, th->th_sport, u_int16_t);
#undef xchg
tlen = 0; /*be friendly with the following code*/
}
th->th_seq = htonl(seq);
th->th_ack = htonl(ack);
th->th_x2 = 0;
if ((flags & TH_SYN) == 0) {
if (tp)
win >>= tp->rcv_scale;
if (win > TCP_MAXWIN)
win = TCP_MAXWIN;
th->th_win = htons((u_int16_t)win);
th->th_off = sizeof (struct tcphdr) >> 2;
tlen += sizeof(*th);
} else {
tlen += th->th_off << 2;
}
m->m_len = hlen + tlen;
m->m_pkthdr.len = hlen + tlen;
m_reset_rcvif(m);
th->th_flags = flags;
th->th_urp = 0;
switch (family) {
case AF_INET:
{
struct ipovly *ipov = (struct ipovly *)ip;
memset(ipov->ih_x1, 0, sizeof ipov->ih_x1);
ipov->ih_len = htons((u_int16_t)tlen);
th->th_sum = 0;
th->th_sum = in_cksum(m, hlen + tlen);
ip->ip_len = htons(hlen + tlen);
ip->ip_ttl = ip_defttl;
break;
}
#ifdef INET6
case AF_INET6:
{
th->th_sum = 0;
th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
tlen);
ip6->ip6_plen = htons(tlen);
if (tp && tp->t_inpcb->inp_af == AF_INET6)
ip6->ip6_hlim = in6pcb_selecthlim_rt(tp->t_inpcb);
else
ip6->ip6_hlim = ip6_defhlim;
ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK;
if (ip6_auto_flowlabel) {
ip6->ip6_flow |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
}
break;
}
#endif
}
if (tp != NULL && tp->t_inpcb->inp_af == AF_INET) {
ro = &tp->t_inpcb->inp_route;
KASSERT(family == AF_INET);
KASSERT(in_hosteq(ip->ip_dst, in4p_faddr(tp->t_inpcb)));
}
#ifdef INET6
else if (tp != NULL && tp->t_inpcb->inp_af == AF_INET6) {
ro = (struct route *)&tp->t_inpcb->inp_route;
#ifdef DIAGNOSTIC
if (family == AF_INET) {
if (!IN6_IS_ADDR_V4MAPPED(&in6p_faddr(tp->t_inpcb)))
panic("tcp_respond: not mapped addr");
if (memcmp(&ip->ip_dst,
&in6p_faddr(tp->t_inpcb).s6_addr32[3],
sizeof(ip->ip_dst)) != 0) {
panic("tcp_respond: ip_dst != in6p_faddr");
}
} else if (family == AF_INET6) {
if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
&in6p_faddr(tp->t_inpcb)))
panic("tcp_respond: ip6_dst != in6p_faddr");
} else
panic("tcp_respond: address family mismatch");
#endif
}
#endif
else
ro = NULL;
switch (family) {
case AF_INET:
error = ip_output(m, NULL, ro,
(tp && tp->t_mtudisc ? IP_MTUDISC : 0), NULL,
tp ? tp->t_inpcb : NULL);
break;
#ifdef INET6
case AF_INET6:
error = ip6_output(m, NULL, ro, 0, NULL,
tp ? tp->t_inpcb : NULL, NULL);
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
return error;
}
/*
* Template TCPCB. Rather than zeroing a new TCPCB and initializing
* a bunch of members individually, we maintain this template for the
* static and mostly-static components of the TCPCB, and copy it into
* the new TCPCB instead.
*/
static struct tcpcb tcpcb_template = {
.t_srtt = TCPTV_SRTTBASE,
.t_rttmin = TCPTV_MIN,
.snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT,
.snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT,
.snd_numholes = 0,
.snd_cubic_wmax = 0,
.snd_cubic_wmax_last = 0,
.snd_cubic_ctime = 0,
.t_partialacks = -1,
.t_bytes_acked = 0,
.t_sndrexmitpack = 0,
.t_rcvoopack = 0,
.t_sndzerowin = 0,
};
/*
* Updates the TCPCB template whenever a parameter that would affect
* the template is changed.
*/
void
tcp_tcpcb_template(void)
{
struct tcpcb *tp = &tcpcb_template;
int flags;
tp->t_peermss = tcp_mssdflt;
tp->t_ourmss = tcp_mssdflt;
tp->t_segsz = tcp_mssdflt;
flags = 0;
if (tcp_do_rfc1323 && tcp_do_win_scale)
flags |= TF_REQ_SCALE;
if (tcp_do_rfc1323 && tcp_do_timestamps)
flags |= TF_REQ_TSTMP;
tp->t_flags = flags;
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
* rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
* reasonable initial retransmit time.
*/
tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1);
TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
TCPTV_MIN, TCPTV_REXMTMAX);
/* Keep Alive */
tp->t_keepinit = MIN(tcp_keepinit, TCP_TIMER_MAXTICKS);
tp->t_keepidle = MIN(tcp_keepidle, TCP_TIMER_MAXTICKS);
tp->t_keepintvl = MIN(tcp_keepintvl, TCP_TIMER_MAXTICKS);
tp->t_keepcnt = MAX(1, MIN(tcp_keepcnt, TCP_TIMER_MAXTICKS));
tp->t_maxidle = tp->t_keepcnt * MIN(tp->t_keepintvl,
TCP_TIMER_MAXTICKS/tp->t_keepcnt);
/* MSL */
tp->t_msl = TCPTV_MSL;
}
/*
* Create a new TCP control block, making an
* empty reassembly queue and hooking it to the argument
* protocol control block.
*/
struct tcpcb *
tcp_newtcpcb(int family, struct inpcb *inp)
{
struct tcpcb *tp;
int i;
/* XXX Consider using a pool_cache for speed. */
tp = pool_get(&tcpcb_pool, PR_NOWAIT); /* splsoftnet via tcp_usrreq */
if (tp == NULL)
return NULL;
memcpy(tp, &tcpcb_template, sizeof(*tp));
TAILQ_INIT(&tp->segq);
TAILQ_INIT(&tp->timeq);
tp->t_family = family; /* may be overridden later on */
TAILQ_INIT(&tp->snd_holes);
LIST_INIT(&tp->t_sc); /* XXX can template this */
/* Don't sweat this loop; hopefully the compiler will unroll it. */
for (i = 0; i < TCPT_NTIMERS; i++) {
callout_init(&tp->t_timer[i], CALLOUT_MPSAFE);
TCP_TIMER_INIT(tp, i);
}
callout_init(&tp->t_delack_ch, CALLOUT_MPSAFE);
switch (family) {
case AF_INET:
in4p_ip(inp).ip_ttl = ip_defttl;
inp->inp_ppcb = (void *)tp;
tp->t_inpcb = inp;
tp->t_mtudisc = ip_mtudisc;
break;
#ifdef INET6
case AF_INET6:
in6p_ip6(inp).ip6_hlim = in6pcb_selecthlim_rt(inp);
inp->inp_ppcb = (void *)tp;
tp->t_inpcb = inp;
/* for IPv6, always try to run path MTU discovery */
tp->t_mtudisc = 1;
break;
#endif /* INET6 */
default:
for (i = 0; i < TCPT_NTIMERS; i++)
callout_destroy(&tp->t_timer[i]);
callout_destroy(&tp->t_delack_ch);
pool_put(&tcpcb_pool, tp); /* splsoftnet via tcp_usrreq */
return NULL;
}
/*
* Initialize our timebase. When we send timestamps, we take
* the delta from tcp_now -- this means each connection always
* gets a timebase of 1, which makes it, among other things,
* more difficult to determine how long a system has been up,
* and thus how many TCP sequence increments have occurred.
*
* We start with 1, because 0 doesn't work with linux, which
* considers timestamp 0 in a SYN packet as a bug and disables
* timestamps.
*/
tp->ts_timebase = tcp_now - 1;
tcp_congctl_select(tp, tcp_congctl_global_name);
return tp;
}
/*
* Drop a TCP connection, reporting
* the specified error. If connection is synchronized,
* then send a RST to peer.
*/
struct tcpcb *
tcp_drop(struct tcpcb *tp, int errno)
{
struct socket *so;
KASSERT(tp->t_inpcb != NULL);
so = tp->t_inpcb->inp_socket;
if (so == NULL)
return NULL;
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_state = TCPS_CLOSED;
(void) tcp_output(tp);
TCP_STATINC(TCP_STAT_DROPS);
} else
TCP_STATINC(TCP_STAT_CONNDROPS);
if (errno == ETIMEDOUT && tp->t_softerror)
errno = tp->t_softerror;
so->so_error = errno;
return (tcp_close(tp));
}
/*
* Close a TCP control block:
* discard all space held by the tcp
* discard internet protocol block
* wake up any sleepers
*/
struct tcpcb *
tcp_close(struct tcpcb *tp)
{
struct inpcb *inp;
struct socket *so;
#ifdef RTV_RTT
struct rtentry *rt = NULL;
#endif
struct route *ro;
int j;
inp = tp->t_inpcb;
so = inp->inp_socket;
ro = &inp->inp_route;
#ifdef RTV_RTT
/*
* If we sent enough data to get some meaningful characteristics,
* save them in the routing entry. 'Enough' is arbitrarily
* defined as the sendpipesize (default 4K) * 16. This would
* give us 16 rtt samples assuming we only get one sample per
* window (the usual case on a long haul net). 16 samples is
* enough for the srtt filter to converge to within 5% of the correct
* value; fewer samples and we could save a very bogus rtt.
*
* Don't update the default route's characteristics and don't
* update anything that the user "locked".
*/
if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) &&
ro && (rt = rtcache_validate(ro)) != NULL &&
!in_nullhost(satocsin(rt_getkey(rt))->sin_addr)) {
u_long i = 0;
if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
i = tp->t_srtt *
((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
if (rt->rt_rmx.rmx_rtt && i)
/*
* filter this update to half the old & half
* the new values, converting scale.
* See route.h and tcp_var.h for a
* description of the scaling constants.
*/
rt->rt_rmx.rmx_rtt =
(rt->rt_rmx.rmx_rtt + i) / 2;
else
rt->rt_rmx.rmx_rtt = i;
}
if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
i = tp->t_rttvar *
((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2));
if (rt->rt_rmx.rmx_rttvar && i)
rt->rt_rmx.rmx_rttvar =
(rt->rt_rmx.rmx_rttvar + i) / 2;
else
rt->rt_rmx.rmx_rttvar = i;
}
/*
* update the pipelimit (ssthresh) if it has been updated
* already or if a pipesize was specified & the threshold
* got below half the pipesize. I.e., wait for bad news
* before we start updating, then update on both good
* and bad news.
*/
if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
(i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh) ||
i < (rt->rt_rmx.rmx_sendpipe / 2)) {
/*
* convert the limit from user data bytes to
* packets then to packet data bytes.
*/
i = (i + tp->t_segsz / 2) / tp->t_segsz;
if (i < 2)
i = 2;
i *= (u_long)(tp->t_segsz + sizeof (struct tcpiphdr));
if (rt->rt_rmx.rmx_ssthresh)
rt->rt_rmx.rmx_ssthresh =
(rt->rt_rmx.rmx_ssthresh + i) / 2;
else
rt->rt_rmx.rmx_ssthresh = i;
}
}
rtcache_unref(rt, ro);
#endif /* RTV_RTT */
/* free the reassembly queue, if any */
TCP_REASS_LOCK(tp);
(void) tcp_freeq(tp);
TCP_REASS_UNLOCK(tp);
/* free the SACK holes list. */
tcp_free_sackholes(tp);
tcp_congctl_release(tp);
syn_cache_cleanup(tp);
if (tp->t_template) {
m_free(tp->t_template);
tp->t_template = NULL;
}
/*
* Detaching the pcb will unlock the socket/tcpcb, and stopping
* the timers can also drop the lock. We need to prevent access
* to the tcpcb as it's half torn down. Flag the pcb as dead
* (prevents access by timers) and only then detach it.
*/
tp->t_flags |= TF_DEAD;
inp->inp_ppcb = NULL;
soisdisconnected(so);
inpcb_destroy(inp);
/*
* pcb is no longer visible elsewhere, so we can safely release
* the lock in callout_halt() if needed.
*/
TCP_STATINC(TCP_STAT_CLOSED);
for (j = 0; j < TCPT_NTIMERS; j++) {
callout_halt(&tp->t_timer[j], softnet_lock);
callout_destroy(&tp->t_timer[j]);
}
callout_halt(&tp->t_delack_ch, softnet_lock);
callout_destroy(&tp->t_delack_ch);
pool_put(&tcpcb_pool, tp);
return NULL;
}
int
tcp_freeq(struct tcpcb *tp)
{
struct ipqent *qe;
int rv = 0;
TCP_REASS_LOCK_CHECK(tp);
while ((qe = TAILQ_FIRST(&tp->segq)) != NULL) {
TAILQ_REMOVE(&tp->segq, qe, ipqe_q);
TAILQ_REMOVE(&tp->timeq, qe, ipqe_timeq);
m_freem(qe->ipqe_m);
tcpipqent_free(qe);
rv = 1;
}
tp->t_segqlen = 0;
KASSERT(TAILQ_EMPTY(&tp->timeq));
return (rv);
}
void
tcp_fasttimo(void)
{
if (tcp_drainwanted) {
tcp_drain();
tcp_drainwanted = 0;
}
}
void
tcp_drainstub(void)
{
tcp_drainwanted = 1;
}
/*
* Protocol drain routine. Called when memory is in short supply.
* Called from pr_fasttimo thus a callout context.
*/
void
tcp_drain(void)
{
struct inpcb *inp;
struct tcpcb *tp;
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
/*
* Free the sequence queue of all TCP connections.
*/
TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue) {
tp = intotcpcb(inp);
if (tp != NULL) {
/*
* If the tcpcb is already busy,
* just bail out now.
*/
if (tcp_reass_lock_try(tp) == 0)
continue;
if (tcp_freeq(tp))
TCP_STATINC(TCP_STAT_CONNSDRAINED);
TCP_REASS_UNLOCK(tp);
}
}
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
}
/*
* Notify a tcp user of an asynchronous error;
* store error as soft error, but wake up user
* (for now, won't do anything until can select for soft error).
*/
void
tcp_notify(struct inpcb *inp, int error)
{
struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
struct socket *so = inp->inp_socket;
/*
* Ignore some errors if we are hooked up.
* If connection hasn't completed, has retransmitted several times,
* and receives a second error, give up now. This is better
* than waiting a long time to establish a connection that
* can never complete.
*/
if (tp->t_state == TCPS_ESTABLISHED &&
(error == EHOSTUNREACH || error == ENETUNREACH ||
error == EHOSTDOWN)) {
return;
} else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
tp->t_rxtshift > 3 && tp->t_softerror)
so->so_error = error;
else
tp->t_softerror = error;
cv_broadcast(&so->so_cv);
sorwakeup(so);
sowwakeup(so);
}
#ifdef INET6
void *
tcp6_ctlinput(int cmd, const struct sockaddr *sa, void *d)
{
struct tcphdr th;
void (*notify)(struct inpcb *, int) = tcp_notify;
int nmatch;
struct ip6_hdr *ip6;
const struct sockaddr_in6 *sa6_src = NULL;
const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa;
struct mbuf *m;
int off;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
else if (cmd == PRC_QUENCH) {
/*
* Don't honor ICMP Source Quench messages meant for
* TCP connections.
*/
return NULL;
} else if (PRC_IS_REDIRECT(cmd))
notify = in6pcb_rtchange, d = NULL;
else if (cmd == PRC_MSGSIZE)
; /* special code is present, see below */
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (inet6ctlerrmap[cmd] == 0)
return NULL;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
sa6_src = &sa6_any;
off = 0;
}
if (ip6) {
/* check if we can safely examine src and dst ports */
if (m->m_pkthdr.len < off + sizeof(th)) {
if (cmd == PRC_MSGSIZE)
icmp6_mtudisc_update((struct ip6ctlparam *)d, 0);
return NULL;
}
memset(&th, 0, sizeof(th));
m_copydata(m, off, sizeof(th), (void *)&th);
if (cmd == PRC_MSGSIZE) {
int valid = 0;
/*
* Check to see if we have a valid TCP connection
* corresponding to the address in the ICMPv6 message
* payload.
*/
if (in6pcb_lookup(&tcbtable, &sa6->sin6_addr,
th.th_dport,
(const struct in6_addr *)&sa6_src->sin6_addr,
th.th_sport, 0, 0))
valid++;
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
/*
* no need to call in6pcb_notify, it should have been
* called via callback if necessary
*/
return NULL;
}
nmatch = in6pcb_notify(&tcbtable, sa, th.th_dport,
(const struct sockaddr *)sa6_src, th.th_sport, cmd, NULL, notify);
if (nmatch == 0 && syn_cache_count &&
(inet6ctlerrmap[cmd] == EHOSTUNREACH ||
inet6ctlerrmap[cmd] == ENETUNREACH ||
inet6ctlerrmap[cmd] == EHOSTDOWN))
syn_cache_unreach((const struct sockaddr *)sa6_src,
sa, &th);
} else {
(void) in6pcb_notify(&tcbtable, sa, 0,
(const struct sockaddr *)sa6_src, 0, cmd, NULL, notify);
}
return NULL;
}
#endif
/* assumes that ip header and tcp header are contiguous on mbuf */
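/*
 * Handle ICMP control input for TCP: map the PRC_* command to an errno,
 * special-case source quench, redirects, host-dead and needs-frag
 * (PMTUD) indications, and notify the matching PCBs and SYN cache
 * entries.
 */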
void *
tcp_ctlinput(int cmd, const struct sockaddr *sa, void *v)
{
struct ip *ip = v;
struct tcphdr *th;
struct icmp *icp;
extern const int inetctlerrmap[];
void (*notify)(struct inpcb *, int) = tcp_notify;
int errno;
int nmatch;
struct tcpcb *tp;
u_int mtu;
tcp_seq seq;
struct inpcb *inp;
#ifdef INET6
struct in6_addr src6, dst6;
#endif
if (sa->sa_family != AF_INET ||
sa->sa_len != sizeof(struct sockaddr_in))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
errno = inetctlerrmap[cmd];
if (cmd == PRC_QUENCH)
/*
* Don't honor ICMP Source Quench messages meant for
* TCP connections.
*/
return NULL;
else if (PRC_IS_REDIRECT(cmd))
notify = inpcb_rtchange, ip = 0;
else if (cmd == PRC_MSGSIZE && ip && ip->ip_v == 4) {
/*
* Check to see if we have a valid TCP connection
* corresponding to the address in the ICMP message
* payload.
*
* Boundary check is made in icmp_input(), with ICMP_ADVLENMIN.
*/
th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
#ifdef INET6
in6_in_2_v4mapin6(&ip->ip_src, &src6);
in6_in_2_v4mapin6(&ip->ip_dst, &dst6);
#endif
if ((inp = inpcb_lookup(&tcbtable, ip->ip_dst,
th->th_dport, ip->ip_src, th->th_sport, 0)) != NULL)
;
#ifdef INET6
else if ((inp = in6pcb_lookup(&tcbtable, &dst6,
th->th_dport, &src6, th->th_sport, 0, 0)) != NULL)
;
#endif
else
return NULL;
/*
* Now that we've validated that we are actually communicating
* with the host indicated in the ICMP message, locate the
* ICMP header, recalculate the new MTU, and create the
* corresponding routing entry.
*/
icp = (struct icmp *)((char *)ip -
offsetof(struct icmp, icmp_ip));
tp = intotcpcb(inp);
if (tp == NULL)
return NULL;
seq = ntohl(th->th_seq);
if (SEQ_LT(seq, tp->snd_una) || SEQ_GT(seq, tp->snd_max))
return NULL;
/*
* If the ICMP message advertises a Next-Hop MTU
* equal or larger than the maximum packet size we have
* ever sent, drop the message.
*/
mtu = (u_int)ntohs(icp->icmp_nextmtu);
if (mtu >= tp->t_pmtud_mtu_sent)
return NULL;
if (mtu >= tcp_hdrsz(tp) + tp->t_pmtud_mss_acked) {
/*
* Calculate new MTU, and create corresponding
* route (traditional PMTUD).
*/
tp->t_flags &= ~TF_PMTUD_PEND;
icmp_mtudisc(icp, ip->ip_dst);
} else {
/*
* Record the information got in the ICMP
* message; act on it later.
* If we had already recorded an ICMP message,
* replace the old one only if the new message
* refers to an older TCP segment
*/
if (tp->t_flags & TF_PMTUD_PEND) {
if (SEQ_LT(tp->t_pmtud_th_seq, seq))
return NULL;
} else
tp->t_flags |= TF_PMTUD_PEND;
tp->t_pmtud_th_seq = seq;
tp->t_pmtud_nextmtu = icp->icmp_nextmtu;
tp->t_pmtud_ip_len = icp->icmp_ip.ip_len;
tp->t_pmtud_ip_hl = icp->icmp_ip.ip_hl;
}
return NULL;
} else if (cmd == PRC_HOSTDEAD)
ip = 0;
else if (errno == 0)
return NULL;
if (ip && ip->ip_v == 4 && sa->sa_family == AF_INET) {
th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
nmatch = inpcb_notify(&tcbtable, satocsin(sa)->sin_addr,
th->th_dport, ip->ip_src, th->th_sport, errno, notify);
if (nmatch == 0 && syn_cache_count &&
(inetctlerrmap[cmd] == EHOSTUNREACH ||
inetctlerrmap[cmd] == ENETUNREACH ||
inetctlerrmap[cmd] == EHOSTDOWN)) {
struct sockaddr_in sin;
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_port = th->th_sport;
sin.sin_addr = ip->ip_src;
syn_cache_unreach((struct sockaddr *)&sin, sa, th);
}
/* XXX mapped address case */
} else
inpcb_notifyall(&tcbtable, satocsin(sa)->sin_addr, errno,
notify);
return NULL;
}
/*
* When a source quench is received, we are being notified of congestion.
* Close the congestion window down to the Loss Window (one segment).
* We will gradually open it again as we proceed.
*/
void
tcp_quench(struct inpcb *inp)
{
struct tcpcb *tp = intotcpcb(inp);
if (tp) {
tp->snd_cwnd = tp->t_segsz;
tp->t_bytes_acked = 0;
}
}
/*
* Path MTU Discovery handlers.
*/
void
tcp_mtudisc_callback(struct in_addr faddr)
{
#ifdef INET6
struct in6_addr in6;
#endif
inpcb_notifyall(&tcbtable, faddr, EMSGSIZE, tcp_mtudisc);
#ifdef INET6
in6_in_2_v4mapin6(&faddr, &in6);
tcp6_mtudisc_callback(&in6);
#endif
}
/*
* On receipt of path MTU corrections, flush old route and replace it
* with the new one. Retransmit all unacknowledged packets, to ensure
* that all packets will be received.
*/
void
tcp_mtudisc(struct inpcb *inp, int errno)
{
struct tcpcb *tp = intotcpcb(inp);
struct rtentry *rt;
if (tp == NULL)
return;
rt = inpcb_rtentry(inp);
if (rt != NULL) {
/*
* If this was not a host route, remove and realloc.
*/
if ((rt->rt_flags & RTF_HOST) == 0) {
inpcb_rtentry_unref(rt, inp);
inpcb_rtchange(inp, errno);
if ((rt = inpcb_rtentry(inp)) == NULL)
return;
}
/*
* Slow start out of the error condition. We
* use the MTU because we know it's smaller
* than the previously transmitted segment.
*
* Note: This is more conservative than the
* suggestion in draft-floyd-incr-init-win-03.
*/
if (rt->rt_rmx.rmx_mtu != 0)
tp->snd_cwnd =
TCP_INITIAL_WINDOW(tcp_init_win,
rt->rt_rmx.rmx_mtu);
inpcb_rtentry_unref(rt, inp);
}
/*
* Resend unacknowledged packets.
*/
tp->snd_nxt = tp->sack_newdata = tp->snd_una;
tcp_output(tp);
}
#ifdef INET6
/*
* Path MTU Discovery handlers.
*/
void
tcp6_mtudisc_callback(struct in6_addr *faddr)
{
struct sockaddr_in6 sin6;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_addr = *faddr;
(void) in6pcb_notify(&tcbtable, (struct sockaddr *)&sin6, 0,
(const struct sockaddr *)&sa6_any, 0, PRC_MSGSIZE, NULL, tcp6_mtudisc);
}
void
tcp6_mtudisc(struct inpcb *inp, int errno)
{
struct tcpcb *tp = intotcpcb(inp);
struct rtentry *rt;
if (tp == NULL)
return;
rt = in6pcb_rtentry(inp);
if (rt != NULL) {
/*
* If this was not a host route, remove and realloc.
*/
if ((rt->rt_flags & RTF_HOST) == 0) {
in6pcb_rtentry_unref(rt, inp);
in6pcb_rtchange(inp, errno);
rt = in6pcb_rtentry(inp);
if (rt == NULL)
return;
}
/*
* Slow start out of the error condition. We
* use the MTU because we know it's smaller
* than the previously transmitted segment.
*
* Note: This is more conservative than the
* suggestion in draft-floyd-incr-init-win-03.
*/
if (rt->rt_rmx.rmx_mtu != 0) {
tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win,
rt->rt_rmx.rmx_mtu);
}
in6pcb_rtentry_unref(rt, inp);
}
/*
* Resend unacknowledged packets.
*/
tp->snd_nxt = tp->sack_newdata = tp->snd_una;
tcp_output(tp);
}
#endif /* INET6 */
/*
* Compute the MSS to advertise to the peer. Called only during
* the 3-way handshake. If we are the server (peer initiated
* connection), we are called with a pointer to the interface
* on which the SYN packet arrived. If we are the client (we
* initiated connection), we are called with a pointer to the
* interface out which this connection should go.
*
* NOTE: Do not subtract IP option/extension header size nor IPsec
* header size from MSS advertisement. MSS option must hold the maximum
* segment size we can accept, so it must always be:
* max(if mtu) - ip header - tcp header
*/
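/*
 * For example, with a standard 1500-byte Ethernet MTU and no options,
 * the advertised MSS works out to 1500 - 20 (IPv4 header) - 20 (TCP
 * header) = 1460 bytes, or 1500 - 40 - 20 = 1440 bytes over IPv6.
 */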
u_long
tcp_mss_to_advertise(const struct ifnet *ifp, int af)
{
extern u_long in_maxmtu;
u_long mss = 0;
u_long hdrsiz;
/*
* In order to avoid defeating path MTU discovery on the peer,
* we advertise the max MTU of all attached networks as our MSS,
* per RFC 1191, section 3.1.
*
* We provide the option to advertise just the MTU of
* the interface on which we hope this connection will
* be receiving. If we are responding to a SYN, we
* will have a pretty good idea about this, but when
* initiating a connection there is a bit more doubt.
*
* We also need to ensure that loopback has a large enough
* MSS, as the loopback MTU is never included in in_maxmtu.
*/
if (ifp != NULL) switch (af) {
#ifdef INET6
case AF_INET6: /* FALLTHROUGH */
#endif
case AF_INET:
mss = ifp->if_mtu;
break;
}
if (tcp_mss_ifmtu == 0) switch (af) {
#ifdef INET6
case AF_INET6: /* FALLTHROUGH */
#endif
case AF_INET:
mss = uimax(in_maxmtu, mss);
break;
}
switch (af) {
case AF_INET:
hdrsiz = sizeof(struct ip);
break;
#ifdef INET6
case AF_INET6:
hdrsiz = sizeof(struct ip6_hdr);
break;
#endif
default:
hdrsiz = 0;
break;
}
hdrsiz += sizeof(struct tcphdr);
if (mss > hdrsiz)
mss -= hdrsiz;
mss = uimax(tcp_mssdflt, mss);
return (mss);
}
/*
* Set connection variables based on the peer's advertised MSS.
* We are passed the TCPCB for the actual connection. If we
* are the server, we are called by the compressed state engine
* when the 3-way handshake is complete. If we are the client,
* we are called when we receive the SYN,ACK from the server.
*
* NOTE: Our advertised MSS value must be initialized in the TCPCB
* before this routine is called!
*/
void
tcp_mss_from_peer(struct tcpcb *tp, int offer)
{
struct socket *so;
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
struct rtentry *rt;
#endif
u_long bufsize;
int mss;
KASSERT(tp->t_inpcb != NULL);
so = NULL;
rt = NULL;
so = tp->t_inpcb->inp_socket;
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
rt = inpcb_rtentry(tp->t_inpcb);
#endif
/*
* As per RFC1122, use the default MSS value, unless they
* sent us an offer. Do not accept offers less than 256 bytes.
*/
mss = tcp_mssdflt;
if (offer)
mss = offer;
mss = uimax(mss, 256); /* sanity */
tp->t_peermss = mss;
mss -= tcp_optlen(tp);
if (tp->t_inpcb->inp_af == AF_INET)
mss -= ip_optlen(tp->t_inpcb);
#ifdef INET6
if (tp->t_inpcb->inp_af == AF_INET6)
mss -= ip6_optlen(tp->t_inpcb);
#endif
/*
* XXX XXX What if mss goes negative or zero? This can happen if a
* socket has large IPv6 options. We crash below.
*/
/*
* If there's a pipesize, change the socket buffer to that size.
* Make the socket buffer an integral number of MSS units. If
* the MSS is larger than the socket buffer, artificially decrease
* the MSS.
*/
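/*
 * For example, a 32768-byte send buffer with a 1460-byte MSS is rounded
 * up to 23 * 1460 = 33580 bytes (subject to the sb_max cap below).
 */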
#ifdef RTV_SPIPE
if (rt != NULL && rt->rt_rmx.rmx_sendpipe != 0)
bufsize = rt->rt_rmx.rmx_sendpipe;
else
#endif
{
KASSERT(so != NULL);
bufsize = so->so_snd.sb_hiwat;
}
if (bufsize < mss)
mss = bufsize;
else {
bufsize = roundup(bufsize, mss);
if (bufsize > sb_max)
bufsize = sb_max;
(void) sbreserve(&so->so_snd, bufsize, so);
}
tp->t_segsz = mss;
#ifdef RTV_SSTHRESH
if (rt != NULL && rt->rt_rmx.rmx_ssthresh) {
/*
* There's some sort of gateway or interface buffer
* limit on the path. Use this to set the slow
* start threshold, but set the threshold to no less
* than 2 * MSS.
*/
tp->snd_ssthresh = uimax(2 * mss, rt->rt_rmx.rmx_ssthresh);
}
#endif
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
inpcb_rtentry_unref(rt, tp->t_inpcb);
#endif
}
/*
* Processing necessary when a TCP connection is established.
*/
void
tcp_established(struct tcpcb *tp)
{
struct socket *so;
#ifdef RTV_RPIPE
struct rtentry *rt;
#endif
u_long bufsize;
KASSERT(tp->t_inpcb != NULL);
so = NULL;
rt = NULL;
/* This is a while() to reduce the dreadful stairstepping below */
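/*
 * MSL selection policy: connections to the loopback address use
 * tcp_msl_loop (default TCPTV_MSL/4), connections to addresses on a
 * local network use tcp_msl_local (default TCPTV_MSL/2), and all
 * other connections use tcp_msl_remote (default TCPTV_MSL), provided
 * tcp_msl_enable is set.
 */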
while (tp->t_inpcb->inp_af == AF_INET) {
so = tp->t_inpcb->inp_socket;
#if defined(RTV_RPIPE)
rt = inpcb_rtentry(tp->t_inpcb);
#endif
if (__predict_true(tcp_msl_enable)) {
if (in4p_laddr(tp->t_inpcb).s_addr == INADDR_LOOPBACK) {
tp->t_msl = tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
break;
}
if (__predict_false(tcp_rttlocal)) {
/* This may be adjusted by tcp_input */
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
if (in_localaddr(in4p_faddr(tp->t_inpcb))) {
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
}
tp->t_msl = tcp_msl_remote ? tcp_msl_remote : TCPTV_MSL;
break;
}
/* Clamp to a reasonable range. */
tp->t_msl = MIN(tp->t_msl, TCP_MAXMSL);
#ifdef INET6
while (tp->t_inpcb->inp_af == AF_INET6) {
so = tp->t_inpcb->inp_socket;
#if defined(RTV_RPIPE)
rt = in6pcb_rtentry(tp->t_inpcb);
#endif
if (__predict_true(tcp_msl_enable)) {
extern const struct in6_addr in6addr_loopback;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(tp->t_inpcb),
&in6addr_loopback)) {
tp->t_msl = tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
break;
}
if (__predict_false(tcp_rttlocal)) {
/* This may be adjusted by tcp_input */
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
if (in6_localaddr(&in6p_faddr(tp->t_inpcb))) {
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
}
tp->t_msl = tcp_msl_remote ? tcp_msl_remote : TCPTV_MSL;
break;
}
/* Clamp to a reasonable range. */
tp->t_msl = MIN(tp->t_msl, TCP_MAXMSL);
#endif
tp->t_state = TCPS_ESTABLISHED;
TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
#ifdef RTV_RPIPE
if (rt != NULL && rt->rt_rmx.rmx_recvpipe != 0)
bufsize = rt->rt_rmx.rmx_recvpipe;
else
#endif
{
KASSERT(so != NULL);
bufsize = so->so_rcv.sb_hiwat;
}
if (bufsize > tp->t_ourmss) {
bufsize = roundup(bufsize, tp->t_ourmss);
if (bufsize > sb_max)
bufsize = sb_max;
(void) sbreserve(&so->so_rcv, bufsize, so);
}
#ifdef RTV_RPIPE
inpcb_rtentry_unref(rt, tp->t_inpcb);
#endif
}
/*
* Check if there's an initial rtt or rttvar. Convert from the
* route-table units to scaled multiples of the slow timeout timer.
* Called only during the 3-way handshake.
*/
void
tcp_rmx_rtt(struct tcpcb *tp)
{
#ifdef RTV_RTT
struct rtentry *rt = NULL;
int rtt;
KASSERT(tp->t_inpcb != NULL);
rt = inpcb_rtentry(tp->t_inpcb);
if (rt == NULL)
return;
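/*
 * rmx_rtt and rmx_rttvar are expressed in units of 1/RTM_RTTUNIT of a
 * second; the divisions below rescale them into the kernel's scaled
 * fixed-point srtt/rttvar representation based on PR_SLOWHZ ticks.
 */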
if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
/*
* XXX The lock bit for MTU indicates that the value
* is also a minimum value; this is subject to time.
*/
if (rt->rt_rmx.rmx_locks & RTV_RTT)
TCPT_RANGESET(tp->t_rttmin,
rtt / (RTM_RTTUNIT / PR_SLOWHZ),
TCPTV_MIN, TCPTV_REXMTMAX);
tp->t_srtt = rtt /
((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
if (rt->rt_rmx.rmx_rttvar) {
tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
((RTM_RTTUNIT / PR_SLOWHZ) >>
(TCP_RTTVAR_SHIFT + 2));
} else {
/* Default variation is +- 1 rtt */
tp->t_rttvar =
tp->t_srtt >> (TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT);
}
TCPT_RANGESET(tp->t_rxtcur,
((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2),
tp->t_rttmin, TCPTV_REXMTMAX);
}
inpcb_rtentry_unref(rt, tp->t_inpcb);
#endif
}
tcp_seq tcp_iss_seq = 0; /* tcp initial seq # */
/*
* Get a new sequence value given a tcp control block
*/
tcp_seq
tcp_new_iss(struct tcpcb *tp)
{
if (tp->t_inpcb->inp_af == AF_INET) {
return tcp_new_iss1(&in4p_laddr(tp->t_inpcb),
&in4p_faddr(tp->t_inpcb), tp->t_inpcb->inp_lport,
tp->t_inpcb->inp_fport, sizeof(in4p_laddr(tp->t_inpcb)));
}
#ifdef INET6
if (tp->t_inpcb->inp_af == AF_INET6) {
return tcp_new_iss1(&in6p_laddr(tp->t_inpcb),
&in6p_faddr(tp->t_inpcb), tp->t_inpcb->inp_lport,
tp->t_inpcb->inp_fport, sizeof(in6p_laddr(tp->t_inpcb)));
}
#endif
panic("tcp_new_iss: unreachable");
}
static u_int8_t tcp_iss_secret[16]; /* 128 bits; should be plenty */
/*
* Initialize RFC 1948 ISS Secret
*/
static int
tcp_iss_secret_init(void)
{
cprng_strong(kern_cprng,
tcp_iss_secret, sizeof(tcp_iss_secret), 0);
return 0;
}
/*
* This routine actually generates a new TCP initial sequence number.
*/
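/*
 * With RFC 1948 enabled, the ISS is derived from
 * MD5(laddr, lport, faddr, fport, secret); otherwise a random value
 * masked with TCP_ISS_RANDOM_MASK is used. In both cases the global
 * offset tcp_iss_seq is added to the result.
 */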
tcp_seq
tcp_new_iss1(void *laddr, void *faddr, u_int16_t lport, u_int16_t fport,
size_t addrsz)
{
tcp_seq tcp_iss;
if (tcp_do_rfc1948) {
MD5_CTX ctx;
u_int8_t hash[16]; /* XXX MD5 knowledge */
static ONCE_DECL(tcp_iss_secret_control);
/*
* If we haven't been here before, initialize our cryptographic
* hash secret.
*/
RUN_ONCE(&tcp_iss_secret_control, tcp_iss_secret_init);
/*
* Compute the base value of the ISS. It is a hash
* of (saddr, sport, daddr, dport, secret).
*/
MD5Init(&ctx);
MD5Update(&ctx, (u_char *) laddr, addrsz);
MD5Update(&ctx, (u_char *) &lport, sizeof(lport));
MD5Update(&ctx, (u_char *) faddr, addrsz);
MD5Update(&ctx, (u_char *) &fport, sizeof(fport));
MD5Update(&ctx, tcp_iss_secret, sizeof(tcp_iss_secret));
MD5Final(hash, &ctx);
memcpy(&tcp_iss, hash, sizeof(tcp_iss));
#ifdef TCPISS_DEBUG
printf("ISS hash 0x%08x, ", tcp_iss);
#endif
} else {
/*
* Randomize.
*/
tcp_iss = cprng_fast32() & TCP_ISS_RANDOM_MASK;
#ifdef TCPISS_DEBUG
printf("ISS random 0x%08x, ", tcp_iss);
#endif
}
/*
* Add the offset in to the computed value.
*/
tcp_iss += tcp_iss_seq;
#ifdef TCPISS_DEBUG
printf("ISS %08x\n", tcp_iss);
#endif
return tcp_iss;
}
#if defined(IPSEC)
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
ipsec4_hdrsiz_tcp(struct tcpcb *tp)
{
struct inpcb *inp;
size_t hdrsiz;
/* XXX mapped addr case (tp->t_inpcb) */
if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
return 0;
switch (tp->t_family) {
case AF_INET:
/* XXX: should use correct direction. */
hdrsiz = ipsec_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp);
break;
default:
hdrsiz = 0;
break;
}
return hdrsiz;
}
#ifdef INET6
size_t
ipsec6_hdrsiz_tcp(struct tcpcb *tp)
{
struct inpcb *inp;
size_t hdrsiz;
if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
return 0;
switch (tp->t_family) {
case AF_INET6:
/* XXX: should use correct direction. */
hdrsiz = ipsec_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp);
break;
case AF_INET:
/* mapped address case - tricky */
default:
hdrsiz = 0;
break;
}
return hdrsiz;
}
#endif
#endif /*IPSEC*/
/*
* Determine the length of the TCP options for this connection.
*
* XXX: What do we do for SACK, when we add that? Just reserve
* all of the space? Otherwise we can't exactly be incrementing
* cwnd by an amount that varies depending on the amount we last
* had to SACK!
*/
u_int
tcp_optlen(struct tcpcb *tp)
{
u_int optlen;
optlen = 0;
if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
(TF_REQ_TSTMP | TF_RCVD_TSTMP))
optlen += TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE)
optlen += TCPOLEN_SIGLEN;
#endif
return optlen;
}
u_int
tcp_hdrsz(struct tcpcb *tp)
{
u_int hlen;
switch (tp->t_family) {
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
break;
#endif
case AF_INET:
hlen = sizeof(struct ip);
break;
default:
hlen = 0;
break;
}
hlen += sizeof(struct tcphdr);
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
(tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
hlen += TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE)
hlen += TCPOLEN_SIGLEN;
#endif
return hlen;
}
void
tcp_statinc(u_int stat)
{
KASSERT(stat < TCP_NSTATS);
TCP_STATINC(stat);
}
void
tcp_statadd(u_int stat, uint64_t val)
{
KASSERT(stat < TCP_NSTATS);
TCP_STATADD(stat, val);
}
/* $NetBSD: in6.c,v 1.292 2024/03/01 23:50:27 riastradh Exp $ */
/* $KAME: in6.c,v 1.198 2001/07/18 09:12:38 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in.c 8.2 (Berkeley) 11/15/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6.c,v 1.292 2024/03/01 23:50:27 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_compat_netbsd.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/cprng.h>
#include <sys/kmem.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_llatbl.h>
#include <net/if_ether.h>
#include <net/if_dl.h>
#include <net/pfil.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/mld6_var.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/scope6_var.h>
#include <compat/netinet6/in6_var.h>
#include <compat/netinet6/nd6.h>
MALLOC_DEFINE(M_IP6OPT, "ip6_options", "IPv6 options");
/* enable backward compatibility code for obsoleted ioctls */
#define COMPAT_IN6IFIOCTL
#ifdef IN6_DEBUG
#define IN6_DPRINTF(__fmt, ...) printf(__fmt, __VA_ARGS__)
#else
#define IN6_DPRINTF(__fmt, ...) do { } while (/*CONSTCOND*/0)
#endif /* IN6_DEBUG */
/*
* Definitions of some constant IP6 addresses.
*/
const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
const struct in6_addr in6addr_nodelocal_allnodes =
IN6ADDR_NODELOCAL_ALLNODES_INIT;
const struct in6_addr in6addr_linklocal_allnodes =
IN6ADDR_LINKLOCAL_ALLNODES_INIT;
const struct in6_addr in6addr_linklocal_allrouters =
IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
const struct in6_addr in6mask0 = IN6MASK0;
const struct in6_addr in6mask32 = IN6MASK32;
const struct in6_addr in6mask64 = IN6MASK64;
const struct in6_addr in6mask96 = IN6MASK96;
const struct in6_addr in6mask128 = IN6MASK128;
const struct sockaddr_in6 sa6_any = {sizeof(sa6_any), AF_INET6,
0, 0, IN6ADDR_ANY_INIT, 0};
struct pslist_head in6_ifaddr_list;
kmutex_t in6_ifaddr_lock;
static int in6_lifaddr_ioctl(struct socket *, u_long, void *,
struct ifnet *);
static int in6_ifaddprefix(struct in6_ifaddr *);
static int in6_ifremprefix(struct in6_ifaddr *);
static int in6_ifinit(struct ifnet *, struct in6_ifaddr *,
const struct sockaddr_in6 *, int);
static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *);
static int in6_update_ifa1(struct ifnet *, struct in6_aliasreq *,
struct in6_ifaddr **, struct psref *, int);
void
in6_init(void)
{
PSLIST_INIT(&in6_ifaddr_list);
mutex_init(&in6_ifaddr_lock, MUTEX_DEFAULT, IPL_NONE);
in6_sysctl_multicast_setup(NULL);
}
/*
* Add ownaddr as loopback rtentry. We previously add the route only if
* necessary (ex. on a p2p link). However, since we now manage addresses
* separately from prefixes, we should always add the route. We can't
* rely on the cloning mechanism from the corresponding interface route
* any more.
*/
void
in6_ifaddlocal(struct ifaddr *ifa)
{
if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &in6addr_any) ||
(ifa->ifa_ifp->if_flags & IFF_POINTOPOINT &&
IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), IFA_DSTIN6(ifa))))
{
rt_addrmsg(RTM_NEWADDR, ifa);
return;
}
rt_ifa_addlocal(ifa);
}
/*
* Remove loopback rtentry of ownaddr generated by in6_ifaddlocal(),
* if it exists.
*/
void
in6_ifremlocal(struct ifaddr *ifa)
{
struct in6_ifaddr *ia;
struct ifaddr *alt_ifa = NULL;
int ia_count = 0;
struct psref psref;
int s;
/*
* Some BSD variants do not remove the routes cloned from an
* interface direct route when that direct route is removed
* (see comments in net/net_osdep.h). Even for variants that do remove
* cloned routes, they could fail to remove the cloned routes when
* we handle multiple addresses that share a common prefix.
* So, we should remove the route corresponding to the deleted address.
*/
/*
* Delete the entry only if exactly one ifaddr matches the
* address, ifa->ifa_addr.
*
* If more than one ifaddr matches, replace the ifaddr in
* the routing table, rt_ifa, with a different ifaddr than
* the one we are purging, ifa. It is important to do
* this, or else the routing table can accumulate dangling
* pointers rt->rt_ifa->ifa_ifp to destroyed interfaces,
* which will lead to crashes, later. (More than one ifaddr
* can match if we assign the same address to multiple---probably
* p2p---interfaces.)
*
* XXX An old comment at this place said, "we should avoid
* XXX such a configuration [i.e., interfaces with the same
* XXX address assigned --ed.] in IPv6...". I do not
* XXX agree, especially now that I have fixed the dangling
* XXX ifp-pointers bug.
*/
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (!IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &ia->ia_addr.sin6_addr))
continue;
if (ia->ia_ifp != ifa->ifa_ifp)
alt_ifa = &ia->ia_ifa;
if (++ia_count > 1 && alt_ifa != NULL)
break;
}
if (ia_count > 1 && alt_ifa != NULL)
ifa_acquire(alt_ifa, &psref);
pserialize_read_exit(s);
if (ia_count == 0)
return;
rt_ifa_remlocal(ifa, ia_count == 1 ? NULL : alt_ifa);
if (ia_count > 1 && alt_ifa != NULL)
ifa_release(alt_ifa, &psref);
}
/* Add prefix route for the network. */
static int
in6_ifaddprefix(struct in6_ifaddr *ia)
{
int error, flags = 0;
if (in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) == 128) {
if (ia->ia_dstaddr.sin6_family != AF_INET6)
/* We don't need to install a host route. */
return 0;
flags |= RTF_HOST;
}
/* Is this a connected route for neighbour discovery? */
if (nd6_need_cache(ia->ia_ifp))
flags |= RTF_CONNECTED;
if ((error = rtinit(&ia->ia_ifa, RTM_ADD, RTF_UP | flags)) == 0)
ia->ia_flags |= IFA_ROUTE;
else if (error == EEXIST)
/* Existence of the route is not an error. */
error = 0;
return error;
}
static int
in6_rt_ifa_matcher(struct rtentry *rt, void *v)
{
struct ifaddr *ifa = v;
if (rt->rt_ifa == ifa)
return 1;
else
return 0;
}
/* Delete network prefix route if present.
* Re-add it to another address if the prefix matches. */
static int
in6_ifremprefix(struct in6_ifaddr *target)
{
int error, s;
struct in6_ifaddr *ia;
if ((target->ia_flags & IFA_ROUTE) == 0)
return 0;
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (target->ia_dstaddr.sin6_len) {
if (ia->ia_dstaddr.sin6_len == 0 ||
!IN6_ARE_ADDR_EQUAL(&ia->ia_dstaddr.sin6_addr,
&target->ia_dstaddr.sin6_addr))
continue;
} else {
if (!IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
&target->ia_addr.sin6_addr,
&target->ia_prefixmask.sin6_addr))
continue;
}
/*
* If we find a matching prefix route, move IFA_ROUTE to it.
*/
if ((ia->ia_flags & IFA_ROUTE) == 0) {
struct psref psref;
int bound = curlwp_bind();
ia6_acquire(ia, &psref);
pserialize_read_exit(s);
rtinit(&target->ia_ifa, RTM_DELETE, 0);
target->ia_flags &= ~IFA_ROUTE;
error = in6_ifaddprefix(ia);
if (!ISSET(target->ia_ifa.ifa_flags, IFA_DESTROYING))
goto skip;
/*
* Replace rt_ifa of routes that have the removing address
* with the new address.
*/
rt_replace_ifa_matched_entries(AF_INET6,
in6_rt_ifa_matcher, &target->ia_ifa, &ia->ia_ifa);
skip:
ia6_release(ia, &psref);
curlwp_bindx(bound);
return error;
}
}
pserialize_read_exit(s);
/*
* No one else holds the prefix route; remove it.
*/
rtinit(&target->ia_ifa, RTM_DELETE, 0);
target->ia_flags &= ~IFA_ROUTE;
if (ISSET(target->ia_ifa.ifa_flags, IFA_DESTROYING)) {
/* Remove routes that have the removing address as rt_ifa. */
rt_delete_matched_entries(AF_INET6, in6_rt_ifa_matcher,
&target->ia_ifa, true);
}
return 0;
}
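/*
 * Convert a contiguous IPv6 netmask into a prefix length. When a limit
 * pointer is given, a stricter check is done and -1 is returned for a
 * non-contiguous mask.
 */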
int
in6_mask2len(struct in6_addr *mask, u_char *lim0)
{
int x = 0, y;
u_char *lim = lim0, *p;
/* ignore the scope_id part */
if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask))
lim = (u_char *)mask + sizeof(*mask);
for (p = (u_char *)mask; p < lim; x++, p++) {
if (*p != 0xff)
break;
}
y = 0;
if (p < lim) {
for (y = 0; y < NBBY; y++) {
if ((*p & (0x80 >> y)) == 0)
break;
}
}
/*
* when the limit pointer is given, do a stricter check on the
* remaining bits.
*/
if (p < lim) {
if (y != 0 && (*p & (0x00ff >> y)) != 0)
return -1;
for (p = p + 1; p < lim; p++)
if (*p != 0)
return -1;
}
return x * NBBY + y;
}
#define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa))
#define ia62ifa(ia6) (&((ia6)->ia_ifa))
static int
in6_control1(struct socket *so, u_long cmd, void *data, struct ifnet *ifp)
{
struct in6_ifreq *ifr = (struct in6_ifreq *)data;
struct in6_ifaddr *ia = NULL;
struct in6_aliasreq *ifra = (struct in6_aliasreq *)data;
struct sockaddr_in6 *sa6;
int error, bound;
struct psref psref;
switch (cmd) {
case SIOCAADDRCTL_POLICY:
case SIOCDADDRCTL_POLICY:
/* Privileged. */
return in6_src_ioctl(cmd, data);
/*
* XXX: Fix me, once we fix SIOCSIFADDR, SIOCIFDSTADDR, etc.
*/
case SIOCSIFADDR:
case SIOCSIFDSTADDR:
case SIOCSIFBRDADDR:
case SIOCSIFNETMASK:
return EOPNOTSUPP;
case SIOCGETSGCNT_IN6:
case SIOCGETMIFCNT_IN6:
return mrt6_ioctl(cmd, data);
case SIOCGIFADDRPREF:
case SIOCSIFADDRPREF:
if (ifp == NULL)
return EINVAL;
return ifaddrpref_ioctl(so, cmd, data, ifp);
}
if (ifp == NULL)
return EOPNOTSUPP;
switch (cmd) {
#ifdef OSIOCSIFINFO_IN6_90
case OSIOCSIFINFO_FLAGS_90:
case OSIOCSIFINFO_IN6_90:
case OSIOCSDEFIFACE_IN6:
case OSIOCSNDFLUSH_IN6:
case OSIOCSPFXFLUSH_IN6:
case OSIOCSRTRFLUSH_IN6:
#endif
case SIOCSIFINFO_FLAGS:
case SIOCSIFINFO_IN6:
/* Privileged. */
/* FALLTHROUGH */
#ifdef OSIOCGIFINFO_IN6
case OSIOCGIFINFO_IN6:
#endif
#ifdef OSIOCGIFINFO_IN6_90
case OSIOCGDRLST_IN6:
case OSIOCGPRLST_IN6:
case OSIOCGIFINFO_IN6_90:
case OSIOCGDEFIFACE_IN6:
#endif
case SIOCGIFINFO_IN6:
case SIOCGNBRINFO_IN6:
return nd6_ioctl(cmd, data, ifp);
}
switch (cmd) {
case SIOCALIFADDR:
case SIOCDLIFADDR:
/* Privileged. */
/* FALLTHROUGH */
case SIOCGLIFADDR:
return in6_lifaddr_ioctl(so, cmd, data, ifp);
}
/*
* Find address for this interface, if it exists.
*
* In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation
* only, and used the first interface address as the target of other
* operations (without checking ifra_addr). This was because netinet
* code/API assumed at most 1 interface address per interface.
* Since IPv6 allows a node to assign multiple addresses
* on a single interface, we almost always look for and check the
* presence of ifra_addr, and reject invalid ones here.
* It also decreases duplicated code among SIOC*_IN6 operations.
*/
switch (cmd) {
case SIOCAIFADDR_IN6:
#ifdef OSIOCAIFADDR_IN6
case OSIOCAIFADDR_IN6:
#endif
#ifdef OSIOCSIFPHYADDR_IN6
case OSIOCSIFPHYADDR_IN6:
#endif
case SIOCSIFPHYADDR_IN6:
sa6 = &ifra->ifra_addr;
break;
case SIOCSIFADDR_IN6:
case SIOCGIFADDR_IN6:
case SIOCSIFDSTADDR_IN6:
case SIOCSIFNETMASK_IN6:
case SIOCGIFDSTADDR_IN6:
case SIOCGIFNETMASK_IN6:
case SIOCDIFADDR_IN6:
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFPDSTADDR_IN6:
case SIOCGIFAFLAG_IN6:
case SIOCGIFALIFETIME_IN6:
#ifdef OSIOCGIFALIFETIME_IN6
case OSIOCGIFALIFETIME_IN6:
#endif
case SIOCGIFSTAT_IN6:
case SIOCGIFSTAT_ICMP6:
sa6 = &ifr->ifr_addr;
break;
default:
sa6 = NULL;
break;
}
error = 0;
bound = curlwp_bind();
if (sa6 && sa6->sin6_family == AF_INET6) {
if (sa6->sin6_scope_id != 0)
error = sa6_embedscope(sa6, 0);
else
error = in6_setscope(&sa6->sin6_addr, ifp, NULL);
if (error != 0)
goto out;
ia = in6ifa_ifpwithaddr_psref(ifp, &sa6->sin6_addr, &psref);
} else
ia = NULL;
switch (cmd) {
case SIOCSIFADDR_IN6:
case SIOCSIFDSTADDR_IN6:
case SIOCSIFNETMASK_IN6:
/*
* Since IPv6 allows a node to assign multiple addresses
* on a single interface, SIOCSIFxxx ioctls are deprecated.
*/
error = EINVAL;
goto release;
case SIOCDIFADDR_IN6:
/*
* for IPv4, we look for existing in_ifaddr here to allow
* "ifconfig if0 delete" to remove the first IPv4 address on
* the interface. For IPv6, since the spec has allowed multiple
* interface addresses from day one, we consider "remove the
* first one" semantics undesirable.
*/
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
#ifdef OSIOCAIFADDR_IN6
/* FALLTHROUGH */
case OSIOCAIFADDR_IN6:
#endif
/* FALLTHROUGH */
case SIOCAIFADDR_IN6:
/*
* We always require users to specify a valid IPv6 address for
* the corresponding operation.
*/
if (ifra->ifra_addr.sin6_family != AF_INET6 ||
ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) {
error = EAFNOSUPPORT;
goto release;
}
/* Privileged. */
break;
case SIOCGIFADDR_IN6:
/* This interface is basically deprecated. use SIOCGIFCONF. */
/* FALLTHROUGH */
case SIOCGIFAFLAG_IN6:
case SIOCGIFNETMASK_IN6:
case SIOCGIFDSTADDR_IN6:
case SIOCGIFALIFETIME_IN6:
#ifdef OSIOCGIFALIFETIME_IN6
case OSIOCGIFALIFETIME_IN6:
#endif
/* must think again about its semantics */
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
break;
}
switch (cmd) {
case SIOCGIFADDR_IN6:
ifr->ifr_addr = ia->ia_addr;
error = sa6_recoverscope(&ifr->ifr_addr);
break;
case SIOCGIFDSTADDR_IN6:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
error = EINVAL;
break;
}
/*
* XXX: should we check if ifa_dstaddr is NULL and return
* an error?
*/
ifr->ifr_dstaddr = ia->ia_dstaddr;
error = sa6_recoverscope(&ifr->ifr_dstaddr);
break;
case SIOCGIFNETMASK_IN6:
ifr->ifr_addr = ia->ia_prefixmask;
break;
case SIOCGIFAFLAG_IN6:
ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags;
break;
case SIOCGIFSTAT_IN6:
if (ifp == NULL) {
error = EINVAL;
break;
}
memset(&ifr->ifr_ifru.ifru_stat, 0,
sizeof(ifr->ifr_ifru.ifru_stat));
ifr->ifr_ifru.ifru_stat =
*((struct in6_ifextra *)ifp->if_afdata[AF_INET6])->in6_ifstat;
break;
case SIOCGIFSTAT_ICMP6:
if (ifp == NULL) {
error = EINVAL;
break;
}
memset(&ifr->ifr_ifru.ifru_icmp6stat, 0,
sizeof(ifr->ifr_ifru.ifru_icmp6stat));
ifr->ifr_ifru.ifru_icmp6stat =
*((struct in6_ifextra *)ifp->if_afdata[AF_INET6])->icmp6_ifstat;
break;
#ifdef OSIOCGIFALIFETIME_IN6
case OSIOCGIFALIFETIME_IN6:
#endif
case SIOCGIFALIFETIME_IN6:
ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime;
if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
time_t maxexpire;
struct in6_addrlifetime *retlt =
&ifr->ifr_ifru.ifru_lifetime;
/*
* XXX: adjust expiration time assuming time_t is
* signed.
*/
maxexpire = ((time_t)~0) &
(time_t)~(1ULL << ((sizeof(maxexpire) * NBBY) - 1));
if (ia->ia6_lifetime.ia6t_vltime <
maxexpire - ia->ia6_updatetime) {
retlt->ia6t_expire = ia->ia6_updatetime +
ia->ia6_lifetime.ia6t_vltime;
retlt->ia6t_expire = retlt->ia6t_expire ?
time_mono_to_wall(retlt->ia6t_expire) :
0;
} else
retlt->ia6t_expire = maxexpire;
}
if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
time_t maxexpire;
struct in6_addrlifetime *retlt =
&ifr->ifr_ifru.ifru_lifetime;
/*
* XXX: adjust expiration time assuming time_t is
* signed.
*/
maxexpire = ((time_t)~0) &
(time_t)~(1ULL << ((sizeof(maxexpire) * NBBY) - 1));
if (ia->ia6_lifetime.ia6t_pltime <
maxexpire - ia->ia6_updatetime) {
retlt->ia6t_preferred = ia->ia6_updatetime +
ia->ia6_lifetime.ia6t_pltime;
retlt->ia6t_preferred = retlt->ia6t_preferred ?
time_mono_to_wall(retlt->ia6t_preferred) :
0;
} else
retlt->ia6t_preferred = maxexpire;
}
#ifdef OSIOCFIFALIFETIME_IN6
if (cmd == OSIOCFIFALIFETIME_IN6)
in6_addrlifetime_to_in6_addrlifetime50(
&ifr->ifru.ifru_lifetime);
#endif
break;
#ifdef OSIOCAIFADDR_IN6
case OSIOCAIFADDR_IN6:
in6_aliasreq50_to_in6_aliasreq(ifra);
#endif
/*FALLTHROUGH*/
case SIOCAIFADDR_IN6:
{
struct in6_addrlifetime *lt;
/* reject read-only flags */
if ((ifra->ifra_flags & IN6_IFF_DUPLICATED) != 0 ||
(ifra->ifra_flags & IN6_IFF_DETACHED) != 0 ||
(ifra->ifra_flags & IN6_IFF_TENTATIVE) != 0 ||
(ifra->ifra_flags & IN6_IFF_NODAD) != 0) {
error = EINVAL;
break;
}
/*
* ia6t_expire and ia6t_preferred won't be used for now, but
* convert them just in case.
*/
lt = &ifra->ifra_lifetime;
if (lt->ia6t_expire != 0)
lt->ia6t_expire = time_wall_to_mono(lt->ia6t_expire);
if (lt->ia6t_preferred != 0)
lt->ia6t_preferred =
time_wall_to_mono(lt->ia6t_preferred);
/*
* make (ia == NULL) or update (ia != NULL) the interface
* address structure, and link it to the list.
*/
int s = splsoftnet();
error = in6_update_ifa1(ifp, ifra, &ia, &psref, 0);
splx(s);
/*
* in6_update_ifa1 doesn't create the address if its
* valid lifetime (vltime) is zero, since we would just
* delete the address immediately in that case anyway.
* So it may succeed but return a NULL ia. In that case,
* there is nothing left to do.
*/
if (error || ia == NULL)
break;
pfil_run_addrhooks(if_pfil, cmd, &ia->ia_ifa);
break;
}
case SIOCDIFADDR_IN6:
ia6_release(ia, &psref);
ifaref(&ia->ia_ifa);
in6_purgeaddr(&ia->ia_ifa);
pfil_run_addrhooks(if_pfil, cmd, &ia->ia_ifa);
ifafree(&ia->ia_ifa);
ia = NULL;
break;
default:
error = ENOTTY;
}
release:
ia6_release(ia, &psref);
out:
curlwp_bindx(bound);
return error;
}
int
in6_control(struct socket *so, u_long cmd, void *data, struct ifnet *ifp)
{
int error, s;
switch (cmd) {
#ifdef OSIOCSIFINFO_IN6_90
case OSIOCSIFINFO_FLAGS_90:
case OSIOCSIFINFO_IN6_90:
case OSIOCSDEFIFACE_IN6:
case OSIOCSNDFLUSH_IN6:
case OSIOCSPFXFLUSH_IN6:
case OSIOCSRTRFLUSH_IN6:
#endif
case SIOCSIFINFO_FLAGS:
case SIOCSIFINFO_IN6:
case SIOCALIFADDR:
case SIOCDLIFADDR:
case SIOCDIFADDR_IN6:
#ifdef OSIOCAIFADDR_IN6
case OSIOCAIFADDR_IN6:
#endif
case SIOCAIFADDR_IN6:
case SIOCAADDRCTL_POLICY:
case SIOCDADDRCTL_POLICY:
if (kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_SOCKET,
KAUTH_REQ_NETWORK_SOCKET_SETPRIV,
so, NULL, NULL))
return EPERM;
break;
}
s = splsoftnet();
#ifndef NET_MPSAFE
KASSERT(KERNEL_LOCKED_P());
#endif
error = in6_control1(so, cmd, data, ifp);
splx(s);
return error;
}
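/*
 * Construct the solicited-node multicast address (ff02::1:ffXX:XXXX)
 * corresponding to ip6 and set its scope zone for ifp.
 */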
static int
in6_get_llsol_addr(struct in6_addr *llsol, struct ifnet *ifp,
struct in6_addr *ip6)
{
int error;
memset(llsol, 0, sizeof(struct in6_addr));
llsol->s6_addr16[0] = htons(0xff02);
llsol->s6_addr32[1] = 0;
llsol->s6_addr32[2] = htonl(1);
llsol->s6_addr32[3] = ip6->s6_addr32[3];
llsol->s6_addr8[12] = 0xff;
error = in6_setscope(llsol, ifp, NULL);
if (error != 0) {
/* XXX: should not happen */
log(LOG_ERR, "%s: in6_setscope failed\n", __func__);
}
return error;
}
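/*
 * Join the multicast groups needed for a newly configured address:
 * the solicited-node group, the link-local and interface-local
 * all-nodes groups and the node information group, installing
 * connected routes for the all-nodes prefixes where necessary.
 */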
static int
in6_join_mcastgroups(struct in6_aliasreq *ifra, struct in6_ifaddr *ia,
struct ifnet *ifp, int flags)
{
int error;
struct sockaddr_in6 mltaddr, mltmask;
struct in6_multi_mship *imm;
struct in6_addr llsol;
struct rtentry *rt;
int dad_delay;
char ip6buf[INET6_ADDRSTRLEN];
/* join solicited multicast addr for new host id */
error = in6_get_llsol_addr(&llsol, ifp, &ifra->ifra_addr.sin6_addr);
if (error != 0)
goto out;
dad_delay = 0;
if ((flags & IN6_IFAUPDATE_DADDELAY)) {
/*
* We need a random delay for DAD on the address
* being configured. It also means delaying
* transmission of the corresponding MLD report to
* avoid report collision.
* [draft-ietf-ipv6-rfc2462bis-02.txt]
*/
dad_delay = cprng_fast32() % (MAX_RTR_SOLICITATION_DELAY * hz);
}
#define MLTMASK_LEN 4 /* mltmask's masklen (=32bit=4octet) */
/* join solicited multicast addr for new host id */
imm = in6_joingroup(ifp, &llsol, &error, dad_delay);
if (!imm) {
nd6log(LOG_ERR,
"addmulti failed for %s on %s (errno=%d)\n",
IN6_PRINT(ip6buf, &llsol), if_name(ifp), error);
goto out;
}
mutex_enter(&in6_ifaddr_lock);
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
sockaddr_in6_init(&mltmask, &in6mask32, 0, 0, 0);
/*
* join link-local all-nodes address
*/
sockaddr_in6_init(&mltaddr, &in6addr_linklocal_allnodes,
0, 0, 0);
if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) != 0)
goto out; /* XXX: should not fail */
/*
* XXX: do we really need these automatic routes?
* We should probably reconsider this stuff. Most applications
* actually do not need the routes, since they usually specify
* the outgoing interface.
*/
rt = rtalloc1(sin6tosa(&mltaddr), 0);
if (rt) {
if (memcmp(&mltaddr.sin6_addr,
&satocsin6(rt_getkey(rt))->sin6_addr,
MLTMASK_LEN)) {
rt_unref(rt);
rt = NULL;
} else if (rt->rt_ifp != ifp) {
IN6_DPRINTF("%s: rt_ifp %p -> %p (%s) "
"network %04x:%04x::/32 = %04x:%04x::/32\n",
__func__, rt->rt_ifp, ifp, ifp->if_xname,
ntohs(mltaddr.sin6_addr.s6_addr16[0]),
ntohs(mltaddr.sin6_addr.s6_addr16[1]),
satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[0],
satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[1]);
#ifdef NET_MPSAFE
error = rt_update_prepare(rt);
if (error == 0) {
rt_replace_ifa(rt, &ia->ia_ifa);
rt->rt_ifp = ifp;
rt_update_finish(rt);
} else {
/*
* If error != 0, the rtentry is being
* destroyed, so doing nothing doesn't
* matter.
*/
}
#else
rt_replace_ifa(rt, &ia->ia_ifa);
rt->rt_ifp = ifp;
#endif
}
}
if (!rt) {
struct rt_addrinfo info;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = sin6tosa(&mltaddr);
info.rti_info[RTAX_GATEWAY] = sin6tosa(&ia->ia_addr);
info.rti_info[RTAX_NETMASK] = sin6tosa(&mltmask);
info.rti_info[RTAX_IFA] = sin6tosa(&ia->ia_addr);
/* XXX: we need RTF_CONNECTED to fake nd6_rtrequest */
info.rti_flags = RTF_UP | RTF_CONNECTED;
error = rtrequest1(RTM_ADD, &info, NULL);
if (error)
goto out;
} else {
rt_unref(rt);
}
imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0);
if (!imm) {
nd6log(LOG_WARNING,
"addmulti failed for %s on %s (errno=%d)\n",
IN6_PRINT(ip6buf, &mltaddr.sin6_addr),
if_name(ifp), error);
goto out;
}
mutex_enter(&in6_ifaddr_lock);
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
/*
* join node information group address
*/
dad_delay = 0;
if ((flags & IN6_IFAUPDATE_DADDELAY)) {
/*
* The spec doesn't say anything about delay for this
* group, but the same logic should apply.
*/
dad_delay = cprng_fast32() % (MAX_RTR_SOLICITATION_DELAY * hz);
}
if (in6_nigroup(ifp, hostname, hostnamelen, &mltaddr) != 0)
;
else if ((imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error,
dad_delay)) == NULL) { /* XXX jinmei */
nd6log(LOG_WARNING,
"addmulti failed for %s on %s (errno=%d)\n",
IN6_PRINT(ip6buf, &mltaddr.sin6_addr),
if_name(ifp), error);
/* XXX not very fatal, go on... */
} else {
mutex_enter(&in6_ifaddr_lock);
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
}
/*
* join interface-local all-nodes address.
* (ff01::1%ifN, and ff01::%ifN/32)
*/
mltaddr.sin6_addr = in6addr_nodelocal_allnodes;
if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) != 0)
goto out; /* XXX: should not fail */
/* XXX: again, do we really need the route? */
rt = rtalloc1(sin6tosa(&mltaddr), 0);
if (rt) {
/* 32bit came from "mltmask" */
if (memcmp(&mltaddr.sin6_addr,
&satocsin6(rt_getkey(rt))->sin6_addr,
32 / NBBY)) {
rt_unref(rt);
rt = NULL;
} else if (rt->rt_ifp != ifp) {
IN6_DPRINTF("%s: rt_ifp %p -> %p (%s) "
"network %04x:%04x::/32 = %04x:%04x::/32\n",
__func__, rt->rt_ifp, ifp, ifp->if_xname,
ntohs(mltaddr.sin6_addr.s6_addr16[0]),
ntohs(mltaddr.sin6_addr.s6_addr16[1]),
satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[0],
satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[1]);
#ifdef NET_MPSAFE
error = rt_update_prepare(rt);
if (error == 0) {
rt_replace_ifa(rt, &ia->ia_ifa);
rt->rt_ifp = ifp;
rt_update_finish(rt);
} else {
/*
* If error != 0, the rtentry is being
* destroyed, so doing nothing doesn't
* matter.
*/
}
#else
rt_replace_ifa(rt, &ia->ia_ifa);
rt->rt_ifp = ifp;
#endif
}
}
if (!rt) {
struct rt_addrinfo info;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = sin6tosa(&mltaddr);
info.rti_info[RTAX_GATEWAY] = sin6tosa(&ia->ia_addr);
info.rti_info[RTAX_NETMASK] = sin6tosa(&mltmask);
info.rti_info[RTAX_IFA] = sin6tosa(&ia->ia_addr);
info.rti_flags = RTF_UP | RTF_CONNECTED;
error = rtrequest1(RTM_ADD, &info, NULL);
if (error)
goto out;
#undef MLTMASK_LEN
} else {
rt_unref(rt);
}
imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0);
if (!imm) {
nd6log(LOG_WARNING,
"addmulti failed for %s on %s (errno=%d)\n",
IN6_PRINT(ip6buf, &mltaddr.sin6_addr),
if_name(ifp), error);
goto out;
} else {
mutex_enter(&in6_ifaddr_lock);
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
}
return 0;
out:
KASSERT(error != 0);
return error;
}
/*
* Update parameters of an IPv6 interface address.
* If necessary, a new entry is created and linked into address chains.
* This function is separated from in6_control().
* XXX: should this be performed under splsoftnet()?
*/
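/*
 * Roughly: validate the request, allocate a new in6_ifaddr if the
 * address is new, set the prefix mask, destination, lifetimes and
 * flags, initialize the interface/route state via in6_ifinit(), link
 * the address into the global and per-interface lists, join the
 * required multicast groups, and finally start DAD if applicable.
 */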
static int
in6_update_ifa1(struct ifnet *ifp, struct in6_aliasreq *ifra,
struct in6_ifaddr **iap, struct psref *psref, int flags)
{
int error = 0, hostIsNew = 0, plen = -1;
struct sockaddr_in6 dst6;
struct in6_addrlifetime *lt;
int dad_delay, was_tentative;
struct in6_ifaddr *ia = iap ? *iap : NULL;
char ip6buf[INET6_ADDRSTRLEN];
bool addrmaskNotChanged = false;
bool send_rtm_newaddr = (ip6_param_rt_msg == 1);
int saved_flags = 0;
KASSERT((iap == NULL && psref == NULL) ||
(iap != NULL && psref != NULL));
/* Validate parameters */
if (ifp == NULL || ifra == NULL) /* this maybe redundant */
return EINVAL;
/*
* The destination address for a p2p link must have a family
* of AF_UNSPEC or AF_INET6.
*/
if ((ifp->if_flags & IFF_POINTOPOINT) != 0 &&
ifra->ifra_dstaddr.sin6_family != AF_INET6 &&
ifra->ifra_dstaddr.sin6_family != AF_UNSPEC)
return EAFNOSUPPORT;
/*
* validate ifra_prefixmask. don't check sin6_family, netmask
* does not carry fields other than sin6_len.
*/
if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6))
return EINVAL;
/*
* Because the IPv6 address architecture is classless, we require
* users to specify a (non 0) prefix length (mask) for a new address.
* We also require that the prefix mask (when specified) be valid, and
* thus reject a non-contiguous mask.
*/
if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0)
return EINVAL;
if (ifra->ifra_prefixmask.sin6_len != 0) {
plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
(u_char *)&ifra->ifra_prefixmask +
ifra->ifra_prefixmask.sin6_len);
if (plen <= 0)
return EINVAL;
} else {
/*
* In this case, ia must not be NULL. We just use its prefix
* length.
*/
plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL);
}
/*
* If the destination address on a p2p interface is specified,
* and the address is a scoped one, validate/set the scope
* zone identifier.
*/
dst6 = ifra->ifra_dstaddr;
if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 &&
(dst6.sin6_family == AF_INET6)) {
struct in6_addr in6_tmp;
u_int32_t zoneid;
in6_tmp = dst6.sin6_addr;
if (in6_setscope(&in6_tmp, ifp, &zoneid))
return EINVAL; /* XXX: should be impossible */
if (dst6.sin6_scope_id != 0) {
if (dst6.sin6_scope_id != zoneid)
return EINVAL;
} else /* the user omitted the ID. */
dst6.sin6_scope_id = zoneid;
/* convert into the internal form */
if (sa6_embedscope(&dst6, 0))
return EINVAL; /* XXX: should be impossible */
}
/*
* The destination address can be specified only for a p2p or a
* loopback interface. If specified, the corresponding prefix length
* must be 128.
*/
if (ifra->ifra_dstaddr.sin6_family == AF_INET6) {
#ifdef FORCE_P2PPLEN
int i;
#endif
if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) {
/* XXX: noisy message */
nd6log(LOG_INFO, "a destination can "
"be specified for a p2p or a loopback IF only\n");
return EINVAL;
}
if (plen != 128) {
nd6log(LOG_INFO, "prefixlen should "
"be 128 when dstaddr is specified\n");
#ifdef FORCE_P2PPLEN
/*
* To be compatible with old configurations,
* such as ifconfig gif0 inet6 2001::1 2001::2
* prefixlen 126, we override the specified
* prefixmask as if the prefix length was 128.
*/
ifra->ifra_prefixmask.sin6_len =
sizeof(struct sockaddr_in6);
for (i = 0; i < 4; i++)
ifra->ifra_prefixmask.sin6_addr.s6_addr32[i] =
0xffffffff;
plen = 128;
#else
return EINVAL;
#endif
}
}
/* lifetime consistency check */
lt = &ifra->ifra_lifetime;
if (lt->ia6t_pltime > lt->ia6t_vltime)
return EINVAL;
if (lt->ia6t_vltime == 0) {
/*
* the following log might be noisy, but this is a typical
* configuration mistake or a tool's bug.
*/
nd6log(LOG_INFO, "valid lifetime is 0 for %s\n",
IN6_PRINT(ip6buf, &ifra->ifra_addr.sin6_addr));
if (ia == NULL)
return 0; /* there's nothing to do */
}
#define sin6eq(a, b) \
((a)->sin6_len == sizeof(struct sockaddr_in6) && \
(b)->sin6_len == sizeof(struct sockaddr_in6) && \
IN6_ARE_ADDR_EQUAL(&(a)->sin6_addr, &(b)->sin6_addr))
if (!send_rtm_newaddr) {
if (ia != NULL &&
sin6eq(&ifra->ifra_addr, &ia->ia_addr) &&
sin6eq(&ifra->ifra_prefixmask, &ia->ia_prefixmask)) {
addrmaskNotChanged = true;
saved_flags = ia->ia6_flags; /* check it later */
}
}
#undef sin6eq
/*
* If this is a new address, allocate a new ifaddr and link it
* into chains.
*/
if (ia == NULL) {
hostIsNew = 1;
/*
* When in6_update_ifa() is called while processing a received
* RA, it runs in interrupt context, so we must call malloc
* with M_NOWAIT.
*/
ia = malloc(sizeof(*ia), M_IFADDR, M_NOWAIT|M_ZERO);
if (ia == NULL)
return ENOBUFS;
LIST_INIT(&ia->ia6_memberships);
/* Initialize the address and masks, and put time stamp */
ia->ia_ifa.ifa_addr = sin6tosa(&ia->ia_addr);
ia->ia_addr.sin6_family = AF_INET6;
ia->ia_addr.sin6_len = sizeof(ia->ia_addr);
ia->ia6_createtime = time_uptime;
if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) {
/*
* XXX: some functions expect that ifa_dstaddr is not
* NULL for p2p interfaces.
*/
ia->ia_ifa.ifa_dstaddr = sin6tosa(&ia->ia_dstaddr);
} else {
ia->ia_ifa.ifa_dstaddr = NULL;
}
ia->ia_ifa.ifa_netmask = sin6tosa(&ia->ia_prefixmask);
ia->ia_ifp = ifp;
IN6_ADDRLIST_ENTRY_INIT(ia);
ifa_psref_init(&ia->ia_ifa);
}
/* update timestamp */
ia->ia6_updatetime = time_uptime;
/* set prefix mask */
if (ifra->ifra_prefixmask.sin6_len) {
if (ia->ia_prefixmask.sin6_len) {
if (!IN6_ARE_ADDR_EQUAL(&ia->ia_prefixmask.sin6_addr,
&ifra->ifra_prefixmask.sin6_addr))
in6_ifremprefix(ia);
}
ia->ia_prefixmask = ifra->ifra_prefixmask;
}
/* Set destination address. */
if (dst6.sin6_family == AF_INET6) {
if (!IN6_ARE_ADDR_EQUAL(&dst6.sin6_addr,
&ia->ia_dstaddr.sin6_addr))
in6_ifremprefix(ia);
ia->ia_dstaddr = dst6;
}
/*
* Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred
* to see if the address is deprecated or invalidated, but initialize
* these members for applications.
*/
ia->ia6_lifetime = ifra->ifra_lifetime;
if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
ia->ia6_lifetime.ia6t_expire =
time_uptime + ia->ia6_lifetime.ia6t_vltime;
} else
ia->ia6_lifetime.ia6t_expire = 0;
if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
ia->ia6_lifetime.ia6t_preferred =
time_uptime + ia->ia6_lifetime.ia6t_pltime;
} else
ia->ia6_lifetime.ia6t_preferred = 0;
/*
* configure address flags.
* We need to preserve tentative state so DAD works if
* something adds the same address before DAD finishes.
*/
was_tentative = ia->ia6_flags & (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED);
ia->ia6_flags = ifra->ifra_flags;
/*
* Make the address tentative before joining multicast addresses,
* so that corresponding MLD responses would not have a tentative
* source address.
*/
ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */
if (ifp->if_link_state == LINK_STATE_DOWN) {
ia->ia6_flags |= IN6_IFF_DETACHED;
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
} else if ((hostIsNew || was_tentative) && if_do_dad(ifp) &&
ip6_dad_enabled()) {
ia->ia6_flags |= IN6_IFF_TENTATIVE;
}
/*
* backward compatibility - if IN6_IFF_DEPRECATED is set from the
* userland, make it deprecated.
*/
if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) {
ia->ia6_lifetime.ia6t_pltime = 0;
ia->ia6_lifetime.ia6t_preferred = time_uptime;
}
if (!send_rtm_newaddr) {
/*
* We will not send RTM_NEWADDR if the only difference between
* ia and ifra is preferred/valid lifetimes, because it is not
* very useful for userland programs to be notified of such
* changes.
*/
if (addrmaskNotChanged && ia->ia6_flags == saved_flags)
return 0;
}
if (hostIsNew) {
/*
* We need a reference to ia before calling in6_ifinit.
* Otherwise ia can be freed in in6_ifinit accidentally.
*/
ifaref(&ia->ia_ifa);
}
/* Must execute in6_ifinit and ifa_insert atomically */
mutex_enter(&in6_ifaddr_lock);
/* reset the interface and routing table appropriately. */
error = in6_ifinit(ifp, ia, &ifra->ifra_addr, hostIsNew);
if (error != 0) {
if (hostIsNew)
free(ia, M_IFADDR);
mutex_exit(&in6_ifaddr_lock);
return error;
}
/*
* We are done if we have simply modified an existing address.
*/
if (!hostIsNew) {
mutex_exit(&in6_ifaddr_lock);
return error;
}
/*
* Insert ia to the global list and ifa to the interface's list.
* A reference to it is already gained above.
*/
IN6_ADDRLIST_WRITER_INSERT_TAIL(ia);
ifa_insert(ifp, &ia->ia_ifa);
mutex_exit(&in6_ifaddr_lock);
/*
* Beyond this point, we should call in6_purgeaddr upon an error,
* not just go to unlink.
*/
/* join necessary multicast groups */
if ((ifp->if_flags & IFF_MULTICAST) != 0) {
error = in6_join_mcastgroups(ifra, ia, ifp, flags);
if (error != 0)
goto cleanup;
}
if (nd6_need_cache(ifp)) {
/* XXX maybe unnecessary */
ia->ia_ifa.ifa_rtrequest = nd6_rtrequest;
ia->ia_ifa.ifa_flags |= RTF_CONNECTED;
}
/*
* Perform DAD, if needed.
* XXX It may be of use, if we can administratively
* disable DAD.
*/
if (hostIsNew && if_do_dad(ifp) &&
((ifra->ifra_flags & IN6_IFF_NODAD) == 0) &&
(ia->ia6_flags & IN6_IFF_TENTATIVE))
{
int mindelay, maxdelay;
dad_delay = 0;
if ((flags & IN6_IFAUPDATE_DADDELAY)) {
struct in6_addr llsol;
struct in6_multi *in6m_sol = NULL;
/*
* We need to impose a delay before sending an NS
* for DAD. Check if we also needed a delay for the
* corresponding MLD message. If we did, the delay
* should be larger than the MLD delay (this could be
* relaxed a bit, but this simple logic is at least
* safe).
*/
mindelay = 0;
error = in6_get_llsol_addr(&llsol, ifp,
&ifra->ifra_addr.sin6_addr);
in6_multi_lock(RW_READER);
if (error == 0)
in6m_sol = in6_lookup_multi(&llsol, ifp);
if (in6m_sol != NULL &&
in6m_sol->in6m_state == MLD_REPORTPENDING) {
mindelay = in6m_sol->in6m_timer;
}
in6_multi_unlock();
maxdelay = MAX_RTR_SOLICITATION_DELAY * hz;
if (maxdelay - mindelay == 0)
dad_delay = 0;
else {
dad_delay =
(cprng_fast32() % (maxdelay - mindelay)) +
mindelay;
}
}
/* +1 ensures callout is always used */
nd6_dad_start(&ia->ia_ifa, dad_delay + 1);
}
if (iap != NULL) {
*iap = ia;
if (hostIsNew)
ia6_acquire(ia, psref);
}
return 0;
cleanup:
in6_purgeaddr(&ia->ia_ifa);
return error;
}
int
in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags)
{
int rc, s;
s = splsoftnet();
rc = in6_update_ifa1(ifp, ifra, NULL, NULL, flags);
splx(s);
return rc;
}
void
in6_purgeaddr(struct ifaddr *ifa)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa;
struct in6_multi_mship *imm;
/* KASSERT(!ifa_held(ifa)); XXX need ifa_not_held (psref_not_held) */
KASSERT(IFNET_LOCKED(ifp));
ifa->ifa_flags |= IFA_DESTROYING;
/* stop DAD processing */
nd6_dad_stop(ifa);
/* Delete any network route. */
in6_ifremprefix(ia);
/* Remove ownaddr's loopback rtentry, if it exists. */
in6_ifremlocal(&(ia->ia_ifa));
/*
* Leave the multicast groups we have joined on this interface.
*/
again:
mutex_enter(&in6_ifaddr_lock);
while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) {
struct in6_multi *in6m __diagused = imm->i6mm_maddr;
KASSERTMSG(in6m == NULL || in6m->in6m_ifp == ifp,
"in6m_ifp=%s ifp=%s", in6m ? in6m->in6m_ifp->if_xname : NULL,
ifp->if_xname);
LIST_REMOVE(imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
in6_leavegroup(imm);
goto again;
}
mutex_exit(&in6_ifaddr_lock);
in6_unlink_ifa(ia, ifp);
}
static void
in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp)
{
int s = splsoftnet();
mutex_enter(&in6_ifaddr_lock);
IN6_ADDRLIST_WRITER_REMOVE(ia);
ifa_remove(ifp, &ia->ia_ifa);
/* Assume ifa_remove called pserialize_perform and psref_destroy */
mutex_exit(&in6_ifaddr_lock);
IN6_ADDRLIST_ENTRY_DESTROY(ia);
/*
* release another refcnt for the link from in6_ifaddr.
* Note that we should decrement the refcnt at least once for all *BSD.
*/
ifafree(&ia->ia_ifa);
splx(s);
}
void
in6_purgeif(struct ifnet *ifp)
{
IFNET_LOCK(ifp);
in6_ifdetach(ifp);
IFNET_UNLOCK(ifp);
}
void
in6_purge_mcast_references(struct in6_multi *in6m)
{
struct in6_ifaddr *ia;
KASSERT(in6_multi_locked(RW_WRITER));
mutex_enter(&in6_ifaddr_lock);
IN6_ADDRLIST_WRITER_FOREACH(ia) {
struct in6_multi_mship *imm;
LIST_FOREACH(imm, &ia->ia6_memberships, i6mm_chain) {
if (imm->i6mm_maddr == in6m)
imm->i6mm_maddr = NULL;
}
}
mutex_exit(&in6_ifaddr_lock);
}
/*
* SIOC[GAD]LIFADDR.
* SIOCGLIFADDR: get first address. (?)
* SIOCGLIFADDR with IFLR_PREFIX:
* get first address that matches the specified prefix.
* SIOCALIFADDR: add the specified address.
* SIOCALIFADDR with IFLR_PREFIX:
* add the specified prefix, filling hostid part from
* the first link-local address. prefixlen must be <= 64.
* SIOCDLIFADDR: delete the specified address.
* SIOCDLIFADDR with IFLR_PREFIX:
* delete the first address that matches the specified prefix.
* return values:
* EINVAL on invalid parameters
* EADDRNOTAVAIL on prefix match failed/specified address not found
* other values may be returned from in6_ioctl()
*
* NOTE: SIOCALIFADDR (with IFLR_PREFIX set) allows a prefixlen less
* than 64, to accommodate address naming schemes other than RFC 2374
* in the future.
* RFC 2373 defines the interface id to be 64 bits, but it allows
* non-RFC 2374 address encoding schemes (see the figure on page 8).
*/
static int
in6_lifaddr_ioctl(struct socket *so, u_long cmd, void *data,
struct ifnet *ifp)
{
struct in6_ifaddr *ia = NULL; /* XXX gcc 4.8 maybe-uninitialized */
struct if_laddrreq *iflr = (struct if_laddrreq *)data;
struct ifaddr *ifa;
struct sockaddr *sa;
/* sanity checks */
if (!data || !ifp) {
panic("invalid argument to in6_lifaddr_ioctl");
/* NOTREACHED */
}
switch (cmd) {
case SIOCGLIFADDR:
/* address must be specified on GET with IFLR_PREFIX */
if ((iflr->flags & IFLR_PREFIX) == 0)
break;
/* FALLTHROUGH */
case SIOCALIFADDR:
case SIOCDLIFADDR:
/* address must be specified on ADD and DELETE */
sa = (struct sockaddr *)&iflr->addr;
if (sa->sa_family != AF_INET6)
return EINVAL;
if (sa->sa_len != sizeof(struct sockaddr_in6))
return EINVAL;
/* XXX need improvement */
sa = (struct sockaddr *)&iflr->dstaddr;
if (sa->sa_family && sa->sa_family != AF_INET6)
return EINVAL;
if (sa->sa_len && sa->sa_len != sizeof(struct sockaddr_in6))
return EINVAL;
break;
default: /* shouldn't happen */
#if 0
panic("invalid cmd to in6_lifaddr_ioctl");
/* NOTREACHED */
#else
return EOPNOTSUPP;
#endif
}
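/* The prefix length may not exceed the 128 bits of an IPv6 address. */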
if (sizeof(struct in6_addr) * NBBY < iflr->prefixlen)
return EINVAL;
switch (cmd) {
case SIOCALIFADDR:
{
struct in6_aliasreq ifra;
struct in6_addr *xhostid = NULL;
int prefixlen;
int bound = curlwp_bind();
struct psref psref;
if ((iflr->flags & IFLR_PREFIX) != 0) {
struct sockaddr_in6 *sin6;
/*
* xhostid is to fill in the hostid part of the
* address. xhostid points to the first link-local
* address attached to the interface.
*/
ia = in6ifa_ifpforlinklocal_psref(ifp, 0, &psref);
if (ia == NULL) {
curlwp_bindx(bound);
return EADDRNOTAVAIL;
}
xhostid = IFA_IN6(&ia->ia_ifa);
/* prefixlen must be <= 64. */
if (64 < iflr->prefixlen) {
ia6_release(ia, &psref);
curlwp_bindx(bound);
return EINVAL;
}
prefixlen = iflr->prefixlen;
/* hostid part must be zero. */
sin6 = (struct sockaddr_in6 *)&iflr->addr;
if (sin6->sin6_addr.s6_addr32[2] != 0
|| sin6->sin6_addr.s6_addr32[3] != 0) {
ia6_release(ia, &psref);
curlwp_bindx(bound);
return EINVAL;
}
} else
prefixlen = iflr->prefixlen;
/* copy args to in6_aliasreq, perform ioctl(SIOCAIFADDR_IN6). */
memset(&ifra, 0, sizeof(ifra));
memcpy(ifra.ifra_name, iflr->iflr_name, sizeof(ifra.ifra_name));
memcpy(&ifra.ifra_addr, &iflr->addr,
((struct sockaddr *)&iflr->addr)->sa_len);
if (xhostid) {
/* fill in hostid part */
ifra.ifra_addr.sin6_addr.s6_addr32[2] =
xhostid->s6_addr32[2];
ifra.ifra_addr.sin6_addr.s6_addr32[3] =
xhostid->s6_addr32[3];
}
if (((struct sockaddr *)&iflr->dstaddr)->sa_family) { /* XXX */
memcpy(&ifra.ifra_dstaddr, &iflr->dstaddr,
((struct sockaddr *)&iflr->dstaddr)->sa_len);
if (xhostid) {
ifra.ifra_dstaddr.sin6_addr.s6_addr32[2] =
xhostid->s6_addr32[2];
ifra.ifra_dstaddr.sin6_addr.s6_addr32[3] =
xhostid->s6_addr32[3];
}
}
if (xhostid) {
ia6_release(ia, &psref);
ia = NULL;
}
curlwp_bindx(bound);
ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6);
in6_prefixlen2mask(&ifra.ifra_prefixmask.sin6_addr, prefixlen);
ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;
ifra.ifra_flags = iflr->flags & ~IFLR_PREFIX;
return in6_control(so, SIOCAIFADDR_IN6, &ifra, ifp);
}
case SIOCGLIFADDR:
case SIOCDLIFADDR:
{
struct in6_addr mask, candidate, match;
struct sockaddr_in6 *sin6;
int cmp;
int error, s;
memset(&mask, 0, sizeof(mask));
if (iflr->flags & IFLR_PREFIX) {
/* lookup a prefix rather than address. */
in6_prefixlen2mask(&mask, iflr->prefixlen);
sin6 = (struct sockaddr_in6 *)&iflr->addr;
memcpy(&match, &sin6->sin6_addr, sizeof(match));
match.s6_addr32[0] &= mask.s6_addr32[0];
match.s6_addr32[1] &= mask.s6_addr32[1];
match.s6_addr32[2] &= mask.s6_addr32[2];
match.s6_addr32[3] &= mask.s6_addr32[3];
/* if you set extra bits, that's wrong */
if (memcmp(&match, &sin6->sin6_addr, sizeof(match)))
return EINVAL;
cmp = 1;
} else {
if (cmd == SIOCGLIFADDR) {
/* on getting an address, take the 1st match */
cmp = 0; /* XXX */
} else {
/* on deleting an address, do exact match */
in6_prefixlen2mask(&mask, 128);
sin6 = (struct sockaddr_in6 *)&iflr->addr;
memcpy(&match, &sin6->sin6_addr, sizeof(match));
cmp = 1;
}
}
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (!cmp)
break;
/*
* XXX: this is ad hoc, but is necessary to allow
* a user to specify fe80::/64 (not /10) for a
* link-local address.
*/
memcpy(&candidate, IFA_IN6(ifa), sizeof(candidate));
in6_clearscope(&candidate);
candidate.s6_addr32[0] &= mask.s6_addr32[0];
candidate.s6_addr32[1] &= mask.s6_addr32[1];
candidate.s6_addr32[2] &= mask.s6_addr32[2];
candidate.s6_addr32[3] &= mask.s6_addr32[3];
if (IN6_ARE_ADDR_EQUAL(&candidate, &match))
break;
}
if (!ifa) {
error = EADDRNOTAVAIL;
goto error;
}
ia = ifa2ia6(ifa);
if (cmd == SIOCGLIFADDR) {
/* fill in the if_laddrreq structure */
memcpy(&iflr->addr, &ia->ia_addr, ia->ia_addr.sin6_len);
error = sa6_recoverscope(
(struct sockaddr_in6 *)&iflr->addr);
if (error != 0)
goto error;
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
memcpy(&iflr->dstaddr, &ia->ia_dstaddr,
ia->ia_dstaddr.sin6_len);
error = sa6_recoverscope(
(struct sockaddr_in6 *)&iflr->dstaddr);
if (error != 0)
goto error;
} else
memset(&iflr->dstaddr, 0, sizeof(iflr->dstaddr));
iflr->prefixlen =
in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL);
iflr->flags = ia->ia6_flags; /* XXX */
error = 0;
} else {
struct in6_aliasreq ifra;
/* fill in6_aliasreq and do ioctl(SIOCDIFADDR_IN6) */
memset(&ifra, 0, sizeof(ifra));
memcpy(ifra.ifra_name, iflr->iflr_name,
sizeof(ifra.ifra_name));
memcpy(&ifra.ifra_addr, &ia->ia_addr,
ia->ia_addr.sin6_len);
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
memcpy(&ifra.ifra_dstaddr, &ia->ia_dstaddr,
ia->ia_dstaddr.sin6_len);
} else {
memset(&ifra.ifra_dstaddr, 0,
sizeof(ifra.ifra_dstaddr));
}
memcpy(&ifra.ifra_dstaddr, &ia->ia_prefixmask,
ia->ia_prefixmask.sin6_len);
ifra.ifra_flags = ia->ia6_flags;
pserialize_read_exit(s);
return in6_control(so, SIOCDIFADDR_IN6, &ifra, ifp);
}
error:
pserialize_read_exit(s);
return error;
}
}
return EOPNOTSUPP; /* just for safety */
}
/*
* Initialize an interface's internet6 address
* and routing table entry.
*/
static int
in6_ifinit(struct ifnet *ifp, struct in6_ifaddr *ia,
const struct sockaddr_in6 *sin6, int newhost)
{
int error = 0, ifacount = 0;
int s;
struct ifaddr *ifa;
KASSERT(mutex_owned(&in6_ifaddr_lock));
/*
* Give the interface a chance to initialize
* if this is its first address,
* and to validate the address if necessary.
*/
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifacount++;
}
pserialize_read_exit(s);
ia->ia_addr = *sin6;
if (ifacount == 0 &&
(error = if_addr_init(ifp, &ia->ia_ifa, true)) != 0) {
return error;
}
ia->ia_ifa.ifa_metric = ifp->if_metric;
/* we could do in(6)_socktrim here, but omit it for now. */
/* Add ownaddr as loopback rtentry, if necessary (ex. on p2p link). */
if (newhost) {
/* set the rtrequest function to create llinfo */
if (ifp->if_flags & IFF_POINTOPOINT)
ia->ia_ifa.ifa_rtrequest = p2p_rtrequest;
else if ((ifp->if_flags & IFF_LOOPBACK) == 0)
ia->ia_ifa.ifa_rtrequest = nd6_rtrequest;
in6_ifaddlocal(&ia->ia_ifa);
} else {
/* Inform the routing socket of new flags/timings */
rt_addrmsg(RTM_NEWADDR, &ia->ia_ifa);
}
/* Add the network prefix route. */
if ((error = in6_ifaddprefix(ia)) != 0) {
if (newhost)
in6_ifremlocal(&ia->ia_ifa);
return error;
}
return error;
}
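/*
* Of two interface addresses, return the one with the higher
* ifa_preference (a NULL best_ifa always loses).
*/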
static struct ifaddr *
bestifa(struct ifaddr *best_ifa, struct ifaddr *ifa)
{
if (best_ifa == NULL || best_ifa->ifa_preference < ifa->ifa_preference)
return ifa;
return best_ifa;
}
/*
* Find an IPv6 interface link-local address specific to an interface.
*/
struct in6_ifaddr *
in6ifa_ifpforlinklocal(const struct ifnet *ifp, const int ignoreflags)
{
struct ifaddr *best_ifa = NULL, *ifa;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (!IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa)))
continue;
if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0)
continue;
best_ifa = bestifa(best_ifa, ifa);
}
return (struct in6_ifaddr *)best_ifa;
}
struct in6_ifaddr *
in6ifa_ifpforlinklocal_psref(const struct ifnet *ifp, const int ignoreflags,
struct psref *psref)
{
struct in6_ifaddr *ia;
int s = pserialize_read_enter();
ia = in6ifa_ifpforlinklocal(ifp, ignoreflags);
if (ia != NULL)
ia6_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
/*
* find the internet address corresponding to a given address.
* ifaddr is returned referenced.
*/
struct in6_ifaddr *
in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid)
{
struct in6_ifaddr *ia;
int s;
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) {
if (zoneid != 0 &&
zoneid != ia->ia_addr.sin6_scope_id)
continue;
ifaref(&ia->ia_ifa);
break;
}
}
pserialize_read_exit(s);
return ia;
}
/*
* find the internet address corresponding to a given interface and address.
*/
struct in6_ifaddr *
in6ifa_ifpwithaddr(const struct ifnet *ifp, const struct in6_addr *addr)
{
struct ifaddr *best_ifa = NULL, *ifa;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (!IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa)))
continue;
best_ifa = bestifa(best_ifa, ifa);
}
return (struct in6_ifaddr *)best_ifa;
}
struct in6_ifaddr *
in6ifa_ifpwithaddr_psref(const struct ifnet *ifp, const struct in6_addr *addr,
struct psref *psref)
{
struct in6_ifaddr *ia;
int s = pserialize_read_enter();
ia = in6ifa_ifpwithaddr(ifp, addr);
if (ia != NULL)
ia6_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
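/* As bestifa(), but for in6_ifaddrs: prefer the higher ifa_preference. */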
static struct in6_ifaddr *
bestia(struct in6_ifaddr *best_ia, struct in6_ifaddr *ia)
{
if (best_ia == NULL || best_ia->ia_ifa.ifa_preference < ia->ia_ifa.ifa_preference)
return ia;
return best_ia;
}
/*
* Determine if an address is on a local network.
*/
int
in6_localaddr(const struct in6_addr *in6)
{
struct in6_ifaddr *ia;
int s;
if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6))
return 1;
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr,
&ia->ia_prefixmask.sin6_addr)) {
pserialize_read_exit(s);
return 1;
}
}
pserialize_read_exit(s);
return 0;
}
int
in6_is_addr_deprecated(struct sockaddr_in6 *sa6)
{
struct in6_ifaddr *ia;
int s;
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
&sa6->sin6_addr) &&
#ifdef SCOPEDROUTING
ia->ia_addr.sin6_scope_id == sa6->sin6_scope_id &&
#endif
(ia->ia6_flags & IN6_IFF_DEPRECATED) != 0) {
pserialize_read_exit(s);
return 1; /* true */
}
/* XXX: do we still have to go thru the rest of the list? */
}
pserialize_read_exit(s);
return 0; /* false */
}
/*
* return length of part which dst and src are equal
* hard coding...
*/
int
in6_matchlen(struct in6_addr *src, struct in6_addr *dst)
{
int match = 0;
u_char *s = (u_char *)src, *d = (u_char *)dst;
u_char *lim = s + 16, r;
while (s < lim)
if ((r = (*d++ ^ *s++)) != 0) {
while (r < 128) {
match++;
r <<= 1;
}
break;
} else
match += NBBY;
return match;
}
void
in6_prefixlen2mask(struct in6_addr *maskp, int len)
{
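/* maskarray[n] has the (n + 1) most significant bits of a byte set. */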
static const u_char maskarray[NBBY] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
int bytelen, bitlen, i;
/* sanity check */
if (len < 0 || len > 128) {
log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n",
len);
return;
}
memset(maskp, 0, sizeof(*maskp));
bytelen = len / NBBY;
bitlen = len % NBBY;
for (i = 0; i < bytelen; i++)
maskp->s6_addr[i] = 0xff;
if (bitlen)
maskp->s6_addr[bytelen] = maskarray[bitlen - 1];
}
/*
* return the best address out of the same scope. if no address was
* found, return the first valid address from the designated interface.
*/
struct in6_ifaddr *
in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst)
{
int dst_scope = in6_addrscope(dst), blen = -1, tlen;
struct ifaddr *ifa;
struct in6_ifaddr *best_ia = NULL, *ia;
struct in6_ifaddr *dep[2]; /* last-resort: deprecated */
dep[0] = dep[1] = NULL;
/*
* We first look for addresses in the same scope.
* If there is one, return it.
* If two or more, return one which matches the dst longest.
* If none, return one of global addresses assigned other ifs.
*/
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if (ia->ia6_flags & IN6_IFF_ANYCAST)
continue; /* XXX: is there any case to allow anycast? */
if (ia->ia6_flags & IN6_IFF_NOTREADY)
continue; /* don't use this interface */
if (ia->ia6_flags & IN6_IFF_DETACHED)
continue;
if (ia->ia6_flags & IN6_IFF_DEPRECATED) {
if (ip6_use_deprecated)
dep[0] = ia;
continue;
}
if (dst_scope != in6_addrscope(IFA_IN6(ifa)))
continue;
/*
* call in6_matchlen() as few times as possible
*/
if (best_ia == NULL) {
best_ia = ia;
continue;
}
if (blen == -1)
blen = in6_matchlen(&best_ia->ia_addr.sin6_addr, dst);
tlen = in6_matchlen(IFA_IN6(ifa), dst);
if (tlen > blen) {
blen = tlen;
best_ia = ia;
} else if (tlen == blen)
best_ia = bestia(best_ia, ia);
}
if (best_ia != NULL)
return best_ia;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if (ia->ia6_flags & IN6_IFF_ANYCAST)
continue; /* XXX: is there any case to allow anycast? */
if (ia->ia6_flags & IN6_IFF_NOTREADY)
continue; /* don't use this interface */
if (ia->ia6_flags & IN6_IFF_DETACHED)
continue;
if (ia->ia6_flags & IN6_IFF_DEPRECATED) {
if (ip6_use_deprecated)
dep[1] = (struct in6_ifaddr *)ifa;
continue;
}
best_ia = bestia(best_ia, ia);
}
if (best_ia != NULL)
return best_ia;
/* use the last-resort values, that is, the deprecated addresses */
if (dep[0])
return dep[0];
if (dep[1])
return dep[1];
return NULL;
}
/*
* perform DAD when interface becomes IFF_UP.
*/
void
in6_if_link_up(struct ifnet *ifp)
{
struct ifaddr *ifa;
struct in6_ifaddr *ia;
int s, bound;
char ip6buf[INET6_ADDRSTRLEN];
/* Ensure it's sane to run DAD */
if (ifp->if_link_state == LINK_STATE_DOWN)
return;
if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING))
return;
bound = curlwp_bind();
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct psref psref;
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa_acquire(ifa, &psref);
pserialize_read_exit(s);
ia = (struct in6_ifaddr *)ifa;
/* If detached then mark as tentative */
if (ia->ia6_flags & IN6_IFF_DETACHED) {
ia->ia6_flags &= ~IN6_IFF_DETACHED;
if (ip6_dad_enabled() && if_do_dad(ifp)) {
ia->ia6_flags |= IN6_IFF_TENTATIVE;
nd6log(LOG_ERR, "%s marked tentative\n",
IN6_PRINT(ip6buf,
&ia->ia_addr.sin6_addr));
} else if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0)
rt_addrmsg(RTM_NEWADDR, ifa);
}
if (ia->ia6_flags & IN6_IFF_TENTATIVE) {
int rand_delay;
/* Clear the duplicated flag as we're starting DAD. */
ia->ia6_flags &= ~IN6_IFF_DUPLICATED;
/*
* The TENTATIVE flag was likely set by hand
* beforehand, implicitly indicating the need for DAD.
* We may be able to skip the random delay in this
* case, but we impose delays just in case.
*/
rand_delay = cprng_fast32() %
(MAX_RTR_SOLICITATION_DELAY * hz);
/* +1 ensures callout is always used */
nd6_dad_start(ifa, rand_delay + 1);
}
s = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
}
void
in6_if_up(struct ifnet *ifp)
{
/*
* special cases, like 6to4, are handled in in6_ifattach
*/
in6_ifattach(ifp, NULL);
/* interface may not support link state, so bring it up also */
in6_if_link_up(ifp);
}
/*
* Mark all addresses as detached.
*/
void
in6_if_link_down(struct ifnet *ifp)
{
struct ifaddr *ifa;
struct in6_ifaddr *ia;
int s, bound;
char ip6buf[INET6_ADDRSTRLEN];
bound = curlwp_bind();
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct psref psref;
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa_acquire(ifa, &psref);
pserialize_read_exit(s);
ia = (struct in6_ifaddr *)ifa;
/* Stop DAD processing */
nd6_dad_stop(ifa);
/*
* Mark the address as detached.
* This satisfies RFC4862 Section 5.3, but we should apply
* this logic to all addresses to be a good citizen and
* avoid potential duplicated addresses.
* When the interface comes up again, detached addresses
* are marked tentative and DAD commences.
*/
if (!(ia->ia6_flags & IN6_IFF_DETACHED)) {
nd6log(LOG_DEBUG, "%s marked detached\n",
IN6_PRINT(ip6buf, &ia->ia_addr.sin6_addr));
ia->ia6_flags |= IN6_IFF_DETACHED;
ia->ia6_flags &=
~(IN6_IFF_TENTATIVE | IN6_IFF_DUPLICATED);
rt_addrmsg(RTM_NEWADDR, ifa);
}
s = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
/* Clear ND6_IFF_IFDISABLED to allow DAD again on link-up. */
if (ifp->if_afdata[AF_INET6] != NULL)
ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED;
}
void
in6_if_down(struct ifnet *ifp)
{
in6_if_link_down(ifp);
lltable_purge_entries(LLTABLE6(ifp));
}
void
in6_if_link_state_change(struct ifnet *ifp, int link_state)
{
/*
* Treat LINK_STATE_UNKNOWN as UP.
* LINK_STATE_UNKNOWN transitions to LINK_STATE_DOWN when
* if_link_state_change() transitions to LINK_STATE_UP.
*/
if (link_state == LINK_STATE_DOWN)
in6_if_link_down(ifp);
else
in6_if_link_up(ifp);
}
int
in6_tunnel_validate(const struct ip6_hdr *ip6, const struct in6_addr *src,
const struct in6_addr *dst)
{
/* check for address match */
if (!IN6_ARE_ADDR_EQUAL(src, &ip6->ip6_dst) ||
!IN6_ARE_ADDR_EQUAL(dst, &ip6->ip6_src))
return 0;
/* martian filters on outer source - done in ip6_input */
/* NOTE: the packet may be dropped by uRPF. */
/* return valid bytes length */
return sizeof(*src) + sizeof(*dst);
}
#define IN6_LLTBL_DEFAULT_HSIZE 32
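/*
* Fold the bytes of a 32-bit key together with xor and mask the result
* into the table; the hash size is assumed to be a power of two.
*/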
#define IN6_LLTBL_HASH(k, h) \
(((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1))
/*
* Do actual deallocation of @lle.
* Called by LLE_FREE_LOCKED when number of references
* drops to zero.
*/
static void
in6_lltable_destroy_lle(struct llentry *lle)
{
KASSERTMSG(lle->la_numheld == 0, "la_numheld=%d", lle->la_numheld);
LLE_WUNLOCK(lle);
LLE_LOCK_DESTROY(lle);
llentry_pool_put(lle);
}
static struct llentry *
in6_lltable_new(const struct in6_addr *addr6, u_int flags)
{
struct llentry *lle;
lle = llentry_pool_get(PR_NOWAIT);
if (lle == NULL) /* NB: caller generates msg */
return NULL;
lle->r_l3addr.addr6 = *addr6;
lle->lle_refcnt = 1;
lle->lle_free = in6_lltable_destroy_lle;
LLE_LOCK_INIT(lle);
callout_init(&lle->lle_timer, CALLOUT_MPSAFE);
return lle;
}
static int
in6_lltable_match_prefix(const struct sockaddr *prefix,
const struct sockaddr *mask, u_int flags, struct llentry *lle)
{
const struct sockaddr_in6 *pfx = (const struct sockaddr_in6 *)prefix;
const struct sockaddr_in6 *msk = (const struct sockaddr_in6 *)mask;
if (IN6_ARE_MASKED_ADDR_EQUAL(&lle->r_l3addr.addr6,
&pfx->sin6_addr, &msk->sin6_addr) &&
((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)))
return 1;
return 0;
}
static void
in6_lltable_free_entry(struct lltable *llt, struct llentry *lle)
{
LLE_WLOCK_ASSERT(lle);
(void) llentry_free(lle);
}
static int
in6_lltable_rtcheck(struct ifnet *ifp, u_int flags,
const struct sockaddr *l3addr, const struct rtentry *rt)
{
char ip6buf[INET6_ADDRSTRLEN];
if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) || rt->rt_ifp != ifp) {
int s;
struct ifaddr *ifa;
/*
* Create an ND6 cache for an IPv6 neighbor
* that is not covered by our own prefix.
*/
/* XXX ifaof_ifpforaddr should take a const param */
s = pserialize_read_enter();
ifa = ifaof_ifpforaddr(l3addr, ifp);
if (ifa != NULL) {
pserialize_read_exit(s);
return 0;
}
pserialize_read_exit(s);
log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n",
IN6_PRINT(ip6buf,
&((const struct sockaddr_in6 *)l3addr)->sin6_addr));
return EINVAL;
}
return 0;
}
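/* Only the low-order 32 bits of the address feed the hash. */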
static inline uint32_t
in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize)
{
return IN6_LLTBL_HASH(dst->s6_addr32[3], hsize);
}
static uint32_t
in6_lltable_hash(const struct llentry *lle, uint32_t hsize)
{
return in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize);
}
static void
in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa)
{
struct sockaddr_in6 *sin6;
sin6 = (struct sockaddr_in6 *)sa;
bzero(sin6, sizeof(*sin6));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(*sin6);
sin6->sin6_addr = lle->r_l3addr.addr6;
}
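/* Find a live (not LLE_DELETED) entry for dst in the hash table. */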
static inline struct llentry *
in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst)
{
struct llentry *lle;
struct llentries *lleh;
u_int hashidx;
hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize);
lleh = &llt->lle_head[hashidx];
LIST_FOREACH(lle, lleh, lle_next) {
if (lle->la_flags & LLE_DELETED)
continue;
if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst))
break;
}
return lle;
}
static int
in6_lltable_delete(struct lltable *llt, u_int flags,
const struct sockaddr *l3addr)
{
const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
struct llentry *lle;
IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp);
KASSERTMSG(l3addr->sa_family == AF_INET6,
"sin_family %d", l3addr->sa_family);
lle = in6_lltable_find_dst(llt, &sin6->sin6_addr);
if (lle == NULL) {
#ifdef LLTABLE_DEBUG
char buf[64];
sockaddr_format(l3addr, buf, sizeof(buf));
log(LOG_INFO, "%s: cache for %s is not found\n",
__func__, buf);
#endif
return ENOENT;
}
LLE_WLOCK(lle);
#ifdef LLTABLE_DEBUG
{
char buf[64];
sockaddr_format(l3addr, buf, sizeof(buf));
log(LOG_INFO, "%s: cache for %s (%p) is deleted\n",
__func__, buf, lle);
}
#endif
llentry_free(lle);
return 0;
}
static struct llentry *
in6_lltable_create(struct lltable *llt, u_int flags,
const struct sockaddr *l3addr, const struct rtentry *rt)
{
const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
struct ifnet *ifp = llt->llt_ifp;
struct llentry *lle;
IF_AFDATA_WLOCK_ASSERT(ifp);
KASSERTMSG(l3addr->sa_family == AF_INET6,
"sin_family %d", l3addr->sa_family);
lle = in6_lltable_find_dst(llt, &sin6->sin6_addr);
if (lle != NULL) {
LLE_WLOCK(lle);
return lle;
}
/*
* A route that covers the given address must have been
* installed first because we are doing a resolution;
* verify this.
*/
if (!(flags & LLE_IFADDR) &&
in6_lltable_rtcheck(ifp, flags, l3addr, rt) != 0)
return NULL;
lle = in6_lltable_new(&sin6->sin6_addr, flags);
if (lle == NULL) {
log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
return NULL;
}
lle->la_flags = flags;
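/*
* An entry for one of our own addresses resolves to the
* interface's link-layer address and is immediately valid.
*/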
if ((flags & LLE_IFADDR) == LLE_IFADDR) {
memcpy(&lle->ll_addr, CLLADDR(ifp->if_sadl), ifp->if_addrlen);
lle->la_flags |= LLE_VALID;
}
lltable_link_entry(llt, lle);
LLE_WLOCK(lle);
return lle;
}
static struct llentry *
in6_lltable_lookup(struct lltable *llt, u_int flags,
const struct sockaddr *l3addr)
{
const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
struct llentry *lle;
IF_AFDATA_LOCK_ASSERT(llt->llt_ifp);
KASSERTMSG(l3addr->sa_family == AF_INET6,
"sin_family %d", l3addr->sa_family);
lle = in6_lltable_find_dst(llt, &sin6->sin6_addr);
if (lle == NULL)
return NULL;
if (flags & LLE_EXCLUSIVE)
LLE_WLOCK(lle);
else
LLE_RLOCK(lle);
return lle;
}
static int
in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle,
struct rt_walkarg *w)
{
struct sockaddr_in6 sin6;
LLTABLE_LOCK_ASSERT();
/* skip deleted entries */
if (lle->la_flags & LLE_DELETED)
return 0;
sockaddr_in6_init(&sin6, &lle->r_l3addr.addr6, 0, 0, 0);
return lltable_dump_entry(llt, lle, w, sin6tosa(&sin6));
}
static struct lltable *
in6_lltattach(struct ifnet *ifp)
{
struct lltable *llt;
llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE);
llt->llt_af = AF_INET6;
llt->llt_ifp = ifp;
llt->llt_lookup = in6_lltable_lookup;
llt->llt_create = in6_lltable_create;
llt->llt_delete = in6_lltable_delete;
llt->llt_dump_entry = in6_lltable_dump_entry;
llt->llt_hash = in6_lltable_hash;
llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry;
llt->llt_free_entry = in6_lltable_free_entry;
llt->llt_match_prefix = in6_lltable_match_prefix;
lltable_link(llt);
return llt;
}
void *
in6_domifattach(struct ifnet *ifp)
{
struct in6_ifextra *ext;
ext = malloc(sizeof(*ext), M_IFADDR, M_WAITOK|M_ZERO);
ext->in6_ifstat = malloc(sizeof(struct in6_ifstat),
M_IFADDR, M_WAITOK|M_ZERO);
ext->icmp6_ifstat = malloc(sizeof(struct icmp6_ifstat),
M_IFADDR, M_WAITOK|M_ZERO);
ext->nd_ifinfo = nd6_ifattach(ifp);
ext->scope6_id = scope6_ifattach(ifp);
ext->lltable = in6_lltattach(ifp);
return ext;
}
void
in6_domifdetach(struct ifnet *ifp, void *aux)
{
struct in6_ifextra *ext = (struct in6_ifextra *)aux;
lltable_free(ext->lltable);
ext->lltable = NULL;
SOFTNET_LOCK_UNLESS_NET_MPSAFE();
nd6_ifdetach(ifp, ext);
SOFTNET_UNLOCK_UNLESS_NET_MPSAFE();
free(ext->in6_ifstat, M_IFADDR);
free(ext->icmp6_ifstat, M_IFADDR);
scope6_ifdetach(ext->scope6_id);
free(ext, M_IFADDR);
}
/*
* Convert IPv4 address stored in struct in_addr to IPv4-Mapped IPv6 address
* stored in struct in6_addr as defined in RFC 4291 section 2.5.5.2.
*/
void
in6_in_2_v4mapin6(const struct in_addr *in, struct in6_addr *in6)
{
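/* Build the IPv4-mapped prefix ::ffff:0:0/96 around the IPv4 address. */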
in6->s6_addr32[0] = 0;
in6->s6_addr32[1] = 0;
in6->s6_addr32[2] = IPV6_ADDR_INT32_SMP;
in6->s6_addr32[3] = in->s_addr;
}
/*
* Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be
* v4 mapped addr or v4 compat addr
*/
void
in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
{
memset(sin, 0, sizeof(*sin));
sin->sin_len = sizeof(struct sockaddr_in);
sin->sin_family = AF_INET;
sin->sin_port = sin6->sin6_port;
sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3];
}
/* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */
void
in6_sin_2_v4mapsin6(const struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
{
memset(sin6, 0, sizeof(*sin6));
sin6->sin6_len = sizeof(struct sockaddr_in6);
sin6->sin6_family = AF_INET6;
sin6->sin6_port = sin->sin_port;
in6_in_2_v4mapin6(&sin->sin_addr, &sin6->sin6_addr);
}
/* Convert sockaddr_in6 into sockaddr_in. */
void
in6_sin6_2_sin_in_sock(struct sockaddr *nam)
{
struct sockaddr_in *sin_p;
struct sockaddr_in6 sin6;
/*
* Save original sockaddr_in6 addr and convert it
* to sockaddr_in.
*/
sin6 = *(struct sockaddr_in6 *)nam;
sin_p = (struct sockaddr_in *)nam;
in6_sin6_2_sin(sin_p, &sin6);
}
/* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */
void
in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam)
{
struct sockaddr_in *sin_p;
struct sockaddr_in6 *sin6_p;
sin6_p = malloc(sizeof(*sin6_p), M_SONAME, M_WAITOK);
sin_p = (struct sockaddr_in *)*nam;
in6_sin_2_v4mapsin6(sin_p, sin6_p);
free(*nam, M_SONAME);
*nam = sin6tosa(sin6_p);
}
/* $NetBSD: union_subr.c,v 1.82 2022/07/18 04:30:30 thorpej Exp $ */
/*
* Copyright (c) 1994
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_subr.c 8.20 (Berkeley) 5/20/95
*/
/*
* Copyright (c) 1994 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_subr.c 8.20 (Berkeley) 5/20/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: union_subr.c,v 1.82 2022/07/18 04:30:30 thorpej Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/queue.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/kauth.h>
#include <uvm/uvm_extern.h>
#include <fs/union/union.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
static LIST_HEAD(uhashhead, union_node) *uhashtbl;
static u_long uhash_mask; /* size of hash table - 1 */
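/*
* Hash a union node on the addresses of its upper and lower vnodes;
* uhash_mask comes from hashinit() and is the table size minus one.
*/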
#define UNION_HASH(u, l) \
((((u_long) (u) + (u_long) (l)) >> 8) & uhash_mask)
#define NOHASH ((u_long)-1)
static kmutex_t uhash_lock;
static void union_newupper(struct union_node *, struct vnode *);
static void union_newlower(struct union_node *, struct vnode *);
static void union_ref(struct union_node *);
static void union_rele(struct union_node *);
static int union_do_lookup(struct vnode *, struct componentname *, kauth_cred_t, const char *);
int union_vn_close(struct vnode *, int, kauth_cred_t, struct lwp *);
static void union_dircache_r(struct vnode *, struct vnode ***, int *);
struct vnode *union_dircache(struct vnode *, struct lwp *);
void
union_init(void)
{
mutex_init(&uhash_lock, MUTEX_DEFAULT, IPL_NONE);
uhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &uhash_mask);
}
void
union_reinit(void)
{
struct union_node *un;
struct uhashhead *oldhash, *hash;
u_long oldmask, mask, val;
int i;
hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
mutex_enter(&uhash_lock);
oldhash = uhashtbl;
oldmask = uhash_mask;
uhashtbl = hash;
uhash_mask = mask;
for (i = 0; i <= oldmask; i++) {
while ((un = LIST_FIRST(&oldhash[i])) != NULL) {
LIST_REMOVE(un, un_cache);
val = UNION_HASH(un->un_uppervp, un->un_lowervp);
LIST_INSERT_HEAD(&hash[val], un, un_cache);
}
}
mutex_exit(&uhash_lock);
hashdone(oldhash, HASH_LIST, oldmask);
}
/*
* Free global unionfs resources.
*/
void
union_done(void)
{
hashdone(uhashtbl, HASH_LIST, uhash_mask);
mutex_destroy(&uhash_lock);
/* Make sure to unset the readdir hook. */
vn_union_readdir_hook = NULL;
}
void
union_newlower(struct union_node *un, struct vnode *lowervp)
{
int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
int nhash = UNION_HASH(un->un_uppervp, lowervp);
if (un->un_lowervp == lowervp)
return;
KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
KASSERT(un->un_lowervp == NULL);
mutex_enter(&uhash_lock);
if (ohash != nhash && (un->un_cflags & UN_CACHED)) {
un->un_cflags &= ~UN_CACHED;
LIST_REMOVE(un, un_cache);
}
mutex_enter(&un->un_lock);
un->un_lowervp = lowervp;
un->un_lowersz = VNOVAL;
mutex_exit(&un->un_lock);
if (ohash != nhash) {
LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
un->un_cflags |= UN_CACHED;
}
mutex_exit(&uhash_lock);
}
void
union_newupper(struct union_node *un, struct vnode *uppervp)
{
int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
int nhash = UNION_HASH(uppervp, un->un_lowervp);
struct vop_lock_args lock_ap;
struct vop_unlock_args unlock_ap;
int error __diagused;
if (un->un_uppervp == uppervp)
return;
KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
KASSERT(un->un_uppervp == NULL);
/*
* We have to transfer the vnode lock from the union vnode to
* the upper vnode. Lock the upper vnode first. We cannot use
* VOP_LOCK() here as it would break the fstrans state.
*/
lock_ap.a_desc = VDESC(vop_lock);
lock_ap.a_vp = uppervp;
lock_ap.a_flags = LK_EXCLUSIVE;
error = VCALL(lock_ap.a_vp, VOFFSET(vop_lock), &lock_ap);
KASSERT(error == 0);
mutex_enter(&uhash_lock);
if (ohash != nhash && (un->un_cflags & UN_CACHED)) {
un->un_cflags &= ~UN_CACHED;
LIST_REMOVE(un, un_cache);
}
mutex_enter(&un->un_lock);
un->un_uppervp = uppervp;
un->un_uppersz = VNOVAL;
/*
* With the upper vnode in place unlock the union vnode to
* finalize the lock transfer.
*/
unlock_ap.a_desc = VDESC(vop_unlock);
unlock_ap.a_vp = UNIONTOV(un);
genfs_unlock(&unlock_ap);
/* Update union vnode interlock, vmobjlock, & klist. */
vshareilock(UNIONTOV(un), uppervp);
rw_obj_hold(uppervp->v_uobj.vmobjlock);
uvm_obj_setlock(&UNIONTOV(un)->v_uobj, uppervp->v_uobj.vmobjlock);
vshareklist(UNIONTOV(un), uppervp);
mutex_exit(&un->un_lock);
if (ohash != nhash) {
LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
un->un_cflags |= UN_CACHED;
}
mutex_exit(&uhash_lock);
}
/*
* Keep track of size changes in the underlying vnodes.
* If the size changes, then call back to the vm layer
* giving priority to the upper layer size.
*
* Mutex un_lock is held on entry and released on return.
*/
void
union_newsize(struct vnode *vp, off_t uppersz, off_t lowersz)
{
struct union_node *un = VTOUNION(vp);
off_t sz;
KASSERT(mutex_owned(&un->un_lock));
/* only interested in regular files */
if (vp->v_type != VREG) {
mutex_exit(&un->un_lock);
uvm_vnp_setsize(vp, 0);
return;
}
sz = VNOVAL;
if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
un->un_uppersz = uppersz;
if (sz == VNOVAL)
sz = un->un_uppersz;
}
if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
un->un_lowersz = lowersz;
if (sz == VNOVAL)
sz = un->un_lowersz;
}
mutex_exit(&un->un_lock);
if (sz != VNOVAL) {
#ifdef UNION_DIAGNOSTIC
printf("union: %s size now %qd\n",
uppersz != VNOVAL ? "upper" : "lower", sz);
#endif
uvm_vnp_setsize(vp, sz);
}
}
static void
union_ref(struct union_node *un)
{
KASSERT(mutex_owned(&uhash_lock));
un->un_refs++;
}
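/*
* Drop a reference to a union node; when the last reference goes away
* the component vnodes are released and the node is freed.
*/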
static void
union_rele(struct union_node *un)
{
mutex_enter(&uhash_lock);
un->un_refs--;
if (un->un_refs > 0) {
mutex_exit(&uhash_lock);
return;
}
if (un->un_cflags & UN_CACHED) {
un->un_cflags &= ~UN_CACHED;
LIST_REMOVE(un, un_cache);
}
mutex_exit(&uhash_lock);
if (un->un_pvp != NULLVP)
vrele(un->un_pvp);
if (un->un_uppervp != NULLVP)
vrele(un->un_uppervp);
if (un->un_lowervp != NULLVP)
vrele(un->un_lowervp);
if (un->un_dirvp != NULLVP)
vrele(un->un_dirvp);
if (un->un_path)
free(un->un_path, M_TEMP);
mutex_destroy(&un->un_lock);
free(un, M_TEMP);
}
/*
* allocate a union_node/vnode pair. the vnode is
* referenced and unlocked. the new vnode is returned
* via (vpp). (mp) is the mountpoint of the union filesystem,
* (dvp) is the parent directory where the upper layer object
* should exist (but doesn't) and (cnp) is the componentname
* information which is partially copied to allow the upper
* layer object to be created at a later time. (uppervp)
* and (lowervp) reference the upper and lower layer objects
* being mapped. either, but not both, can be nil.
* both, if supplied, are unlocked.
* the reference is either maintained in the new union_node
* object which is allocated, or they are vrele'd.
*
* all union_nodes are maintained on a hash
* list. new nodes are only allocated when they cannot
* be found on this list. entries on the list are
* removed when the vfs reclaim entry is called.
*
* the vnode gets attached or referenced with vcache_get().
*/
int
union_allocvp(
struct vnode **vpp,
struct mount *mp,
struct vnode *undvp, /* parent union vnode */
struct vnode *dvp, /* may be null */
struct componentname *cnp, /* may be null */
struct vnode *uppervp, /* may be null */
struct vnode *lowervp, /* may be null */
int docache)
{
int error;
struct union_node *un = NULL, *un1;
struct vnode *vp, *xlowervp = NULLVP;
u_long hash[3];
int try;
bool is_dotdot;
is_dotdot = (dvp != NULL && cnp != NULL && (cnp->cn_flags & ISDOTDOT));
if (uppervp == NULLVP && lowervp == NULLVP)
panic("union: unidentifiable allocation"); if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
xlowervp = lowervp;
lowervp = NULLVP;
}
/*
* If both uppervp and lowervp are not NULL we have to
* search union nodes with one vnode as NULL too.
*/
hash[0] = UNION_HASH(uppervp, lowervp);
if (uppervp == NULL || lowervp == NULL) {
hash[1] = hash[2] = NOHASH;
} else {
hash[1] = UNION_HASH(uppervp, NULLVP);
hash[2] = UNION_HASH(NULLVP, lowervp);
}
if (!docache) {
un = NULL;
goto found;
}
loop:
mutex_enter(&uhash_lock);
for (try = 0; try < 3; try++) {
if (hash[try] == NOHASH)
continue;
LIST_FOREACH(un, &uhashtbl[hash[try]], un_cache) {
if ((un->un_lowervp && un->un_lowervp != lowervp) ||
(un->un_uppervp && un->un_uppervp != uppervp) ||
un->un_mount != mp)
continue;
union_ref(un);
mutex_exit(&uhash_lock);
error = vcache_get(mp, &un, sizeof(un), &vp);
KASSERT(error != 0 || UNIONTOV(un) == vp);
union_rele(un);
if (error == ENOENT)
goto loop;
else if (error)
goto out;
goto found;
}
}
mutex_exit(&uhash_lock);
found:
if (un) {
if (uppervp != dvp) {
if (is_dotdot)
VOP_UNLOCK(dvp);
vn_lock(UNIONTOV(un), LK_EXCLUSIVE | LK_RETRY);
if (is_dotdot)
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
}
/*
* Save information about the upper layer.
*/
if (uppervp != un->un_uppervp) {
union_newupper(un, uppervp);
} else if (uppervp) {
vrele(uppervp);
}
/*
* Save information about the lower layer.
* This needs to keep track of pathname
* and directory information which union_vn_create
* might need.
*/
if (lowervp != un->un_lowervp) {
union_newlower(un, lowervp);
if (cnp && (lowervp != NULLVP)) {
un->un_path = malloc(cnp->cn_namelen+1,
M_TEMP, M_WAITOK);
memcpy(un->un_path, cnp->cn_nameptr,
cnp->cn_namelen);
un->un_path[cnp->cn_namelen] = '\0';
vref(dvp);
un->un_dirvp = dvp;
}
} else if (lowervp) {
vrele(lowervp);
}
*vpp = UNIONTOV(un);
if (uppervp != dvp)
VOP_UNLOCK(*vpp);
error = 0;
goto out;
}
un = malloc(sizeof(struct union_node), M_TEMP, M_WAITOK);
mutex_init(&un->un_lock, MUTEX_DEFAULT, IPL_NONE);
un->un_refs = 1;
un->un_mount = mp;
un->un_vnode = NULL;
un->un_uppervp = uppervp;
un->un_lowervp = lowervp;
un->un_pvp = undvp;
if (undvp != NULLVP)
vref(undvp);
un->un_dircache = 0;
un->un_openl = 0;
un->un_cflags = 0;
un->un_hooknode = false;
un->un_uppersz = VNOVAL;
un->un_lowersz = VNOVAL;
if (dvp && cnp && (lowervp != NULLVP)) {
un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
memcpy(un->un_path, cnp->cn_nameptr, cnp->cn_namelen);
un->un_path[cnp->cn_namelen] = '\0';
vref(dvp);
un->un_dirvp = dvp;
} else {
un->un_path = 0;
un->un_dirvp = 0;
}
if (docache) {
mutex_enter(&uhash_lock);
LIST_FOREACH(un1, &uhashtbl[hash[0]], un_cache) {
if (un1->un_lowervp == lowervp && un1->un_uppervp == uppervp &&
un1->un_mount == mp) {
/*
* Another thread beat us, push back freshly
* allocated node and retry.
*/
mutex_exit(&uhash_lock);
union_rele(un);
goto loop;
}
}
LIST_INSERT_HEAD(&uhashtbl[hash[0]], un, un_cache);
un->un_cflags |= UN_CACHED;
mutex_exit(&uhash_lock);
}
error = vcache_get(mp, &un, sizeof(un), vpp);
KASSERT(error != 0 || UNIONTOV(un) == *vpp);
union_rele(un);
if (error == ENOENT)
goto loop;
out:
if (xlowervp)
vrele(xlowervp);
return error;
}
int
union_freevp(struct vnode *vp)
{
struct union_node *un = VTOUNION(vp);
/* Detach vnode from union node. */
un->un_vnode = NULL;
un->un_uppersz = VNOVAL;
un->un_lowersz = VNOVAL;
/* Detach union node from vnode. */
mutex_enter(vp->v_interlock);
vp->v_data = NULL;
mutex_exit(vp->v_interlock);
union_rele(un);
return 0;
}
int
union_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
struct vattr va;
struct vnode *svp;
struct union_node *un;
struct union_mount *um;
voff_t uppersz, lowersz;
KASSERT(key_len == sizeof(un));
memcpy(&un, key, key_len);
um = MOUNTTOUNIONMOUNT(mp);
svp = (un->un_uppervp != NULLVP) ? un->un_uppervp : un->un_lowervp;
vp->v_tag = VT_UNION;
vp->v_op = union_vnodeop_p;
vp->v_data = un;
un->un_vnode = vp;
vp->v_type = svp->v_type;
if (svp->v_type == VCHR || svp->v_type == VBLK)
spec_node_init(vp, svp->v_rdev);
vshareilock(vp, svp);
rw_obj_hold(svp->v_uobj.vmobjlock);
uvm_obj_setlock(&vp->v_uobj, svp->v_uobj.vmobjlock);
vshareklist(vp, svp);
/* detect the root vnode (and aliases) */
if ((un->un_uppervp == um->um_uppervp) &&
((un->un_lowervp == NULLVP) || un->un_lowervp == um->um_lowervp)) {
if (un->un_lowervp == NULLVP) {
un->un_lowervp = um->um_lowervp;
if (un->un_lowervp != NULLVP)
vref(un->un_lowervp);
}
vp->v_vflag |= VV_ROOT;
}
uppersz = lowersz = VNOVAL;
if (un->un_uppervp != NULLVP) {
if (vn_lock(un->un_uppervp, LK_SHARED) == 0) {
if (VOP_GETATTR(un->un_uppervp, &va, FSCRED) == 0)
uppersz = va.va_size;
VOP_UNLOCK(un->un_uppervp);
}
}
if (un->un_lowervp != NULLVP) {
if (vn_lock(un->un_lowervp, LK_SHARED) == 0) {
if (VOP_GETATTR(un->un_lowervp, &va, FSCRED) == 0)
lowersz = va.va_size;
VOP_UNLOCK(un->un_lowervp);
}
}
mutex_enter(&un->un_lock);
union_newsize(vp, uppersz, lowersz);
mutex_enter(&uhash_lock);
union_ref(un);
mutex_exit(&uhash_lock);
*new_key = &vp->v_data;
return 0;
}
/*
* copyfile. copy the vnode (fvp) to the vnode (tvp)
* using a sequence of reads and writes. both (fvp)
* and (tvp) are locked on entry and exit.
*/
int
union_copyfile(struct vnode *fvp, struct vnode *tvp, kauth_cred_t cred,
struct lwp *l)
{
char *tbuf;
struct uio uio;
struct iovec iov;
int error = 0;
/*
* strategy:
* allocate a buffer of size MAXBSIZE.
* loop doing reads and writes, keeping track
* of the current uio offset.
* give up at the first sign of trouble.
*/
uio.uio_offset = 0;
UIO_SETUP_SYSSPACE(&uio);
tbuf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);
/* ugly loop follows... */
do {
off_t offset = uio.uio_offset;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
iov.iov_base = tbuf;
iov.iov_len = MAXBSIZE;
uio.uio_resid = iov.iov_len;
uio.uio_rw = UIO_READ;
error = VOP_READ(fvp, &uio, 0, cred);
if (error == 0) {
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
iov.iov_base = tbuf;
iov.iov_len = MAXBSIZE - uio.uio_resid;
uio.uio_offset = offset;
uio.uio_rw = UIO_WRITE;
uio.uio_resid = iov.iov_len;
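/* a zero-length read means we reached end of file */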
if (uio.uio_resid == 0)
break;
do {
error = VOP_WRITE(tvp, &uio, 0, cred);
} while ((uio.uio_resid > 0) && (error == 0));
}
} while (error == 0);
free(tbuf, M_TEMP);
return (error);
}
/*
* (un) is assumed to be locked on entry and remains
* locked on exit.
*/
int
union_copyup(struct union_node *un, int docopy, kauth_cred_t cred,
struct lwp *l)
{
int error;
struct vnode *lvp, *uvp;
struct vattr lvattr, uvattr;
error = union_vn_create(&uvp, un, l);
if (error)
return (error);
union_newupper(un, uvp);
lvp = un->un_lowervp;
if (docopy) {
/*
* XXX - should not ignore errors
* from VOP_CLOSE
*/
vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_GETATTR(lvp, &lvattr, cred);
if (error == 0)
error = VOP_OPEN(lvp, FREAD, cred);
if (error == 0) {
error = union_copyfile(lvp, uvp, cred, l);
(void) VOP_CLOSE(lvp, FREAD, cred);
}
if (error == 0) {
/* Copy permissions up too */
vattr_null(&uvattr);
uvattr.va_mode = lvattr.va_mode;
uvattr.va_flags = lvattr.va_flags;
error = VOP_SETATTR(uvp, &uvattr, cred);
}
VOP_UNLOCK(lvp);
#ifdef UNION_DIAGNOSTIC
if (error == 0)
uprintf("union: copied up %s\n", un->un_path);
#endif
}
union_vn_close(uvp, FWRITE, cred, l);
/*
* Subsequent IOs will go to the top layer, so
* call close on the lower vnode and open on the
* upper vnode to ensure that the filesystem keeps
* its reference counts right. This doesn't do
* the right thing with (cred) and (FREAD) though.
* Ignoring error returns is not right, either.
*/
if (error == 0) {
int i;
vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
for (i = 0; i < un->un_openl; i++) {
(void) VOP_CLOSE(lvp, FREAD, cred);
(void) VOP_OPEN(uvp, FREAD, cred);
}
un->un_openl = 0;
VOP_UNLOCK(lvp);
}
return (error);
}
/*
* Prepare the creation of a new node in the upper layer.
*
* (dvp) is the directory in which to create the new node.
* it is locked on entry and exit.
* (cnp) is the componentname to be created.
* (cred, path) are the credentials and path used to fill (cnp).
*/
static int
union_do_lookup(struct vnode *dvp, struct componentname *cnp, kauth_cred_t cred,
const char *path)
{
int error;
struct vnode *vp;
cnp->cn_nameiop = CREATE;
cnp->cn_flags = LOCKPARENT | ISLASTCN;
cnp->cn_cred = cred;
cnp->cn_nameptr = path;
cnp->cn_namelen = strlen(path);
error = VOP_LOOKUP(dvp, &vp, cnp);
if (error == 0) {
KASSERT(vp != NULL);
VOP_ABORTOP(dvp, cnp);
vrele(vp);
error = EEXIST;
} else if (error == EJUSTRETURN) {
error = 0;
}
return error;
}
/*
* Create a shadow directory in the upper layer.
* The new vnode is returned locked.
*
* (um) points to the union mount structure for access to
* the mounting process's credentials.
* (dvp) is the directory in which to create the shadow directory.
* it is unlocked on entry and exit.
* (cnp) is the componentname to be created.
* (vpp) is the returned newly created shadow directory, which
* is returned locked.
*
* N.B. We still attempt to create shadow directories even if the union
* is mounted read-only, which is a little nonintuitive.
*/
int
union_mkshadow(struct union_mount *um, struct vnode *dvp,
struct componentname *cnp, struct vnode **vpp)
{
int error;
struct vattr va;
struct componentname cn;
char *pnbuf;
if (cnp->cn_namelen + 1 > MAXPATHLEN)
return ENAMETOOLONG;
pnbuf = PNBUF_GET();
memcpy(pnbuf, cnp->cn_nameptr, cnp->cn_namelen);
pnbuf[cnp->cn_namelen] = '\0';
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
error = union_do_lookup(dvp, &cn,
(um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred), pnbuf);
if (error) {
VOP_UNLOCK(dvp);
PNBUF_PUT(pnbuf);
return error;
}
/*
* policy: when creating the shadow directory in the
* upper layer, create it owned by the user who did
* the mount, group from parent directory, and mode
* 777 modified by umask (ie mostly identical to the
* mkdir syscall). (jsp, kb)
*/
vattr_null(&va);
va.va_type = VDIR;
va.va_mode = um->um_cmode;
KASSERT(*vpp == NULL);
error = VOP_MKDIR(dvp, vpp, &cn, &va);
VOP_UNLOCK(dvp);
PNBUF_PUT(pnbuf);
return error;
}
/*
* Create a whiteout entry in the upper layer.
*
* (um) points to the union mount structure for access to
* the mounting process's credentials.
* (dvp) is the directory in which to create the whiteout.
* it is locked on entry and exit.
* (cnp) is the componentname to be created.
* (un) holds the path to be created.
*/
int
union_mkwhiteout(struct union_mount *um, struct vnode *dvp,
struct componentname *cnp, struct union_node *un)
{
int error;
struct componentname cn;
error = union_do_lookup(dvp, &cn,
(um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred),
un->un_path);
if (error)
return error;
error = VOP_WHITEOUT(dvp, &cn, CREATE);
return error;
}
/*
* union_vn_create: creates and opens a new shadow file
* on the upper union layer. this function is similar
* in spirit to calling vn_open but it avoids calling namei().
* the problem with calling namei is that a) it locks too many
* things, and b) it doesn't start at the "right" directory,
* whereas union_do_lookup is told where to start.
*/
int
union_vn_create(struct vnode **vpp, struct union_node *un, struct lwp *l)
{
struct vnode *vp;
kauth_cred_t cred = l->l_cred;
struct vattr vat;
struct vattr *vap = &vat;
int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
int error;
int cmode = UN_FILEMODE & ~l->l_proc->p_cwdi->cwdi_cmask;
struct componentname cn;
*vpp = NULLVP;
vn_lock(un->un_dirvp, LK_EXCLUSIVE | LK_RETRY);
error = union_do_lookup(un->un_dirvp, &cn, l->l_cred,
un->un_path);
if (error) {
VOP_UNLOCK(un->un_dirvp);
return error;
}
/*
* Good - there was no race to create the file
* so go ahead and create it. The permissions
* on the file will be 0666 modified by the
* current user's umask. Access to the file, while
* it is unioned, will require access to the top *and*
* bottom files. Access when not unioned will simply
* require access to the top-level file.
* TODO: confirm choice of access permissions.
*/
vattr_null(vap);
vap->va_type = VREG;
vap->va_mode = cmode;
vp = NULL;
error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap);
if (error) {
VOP_UNLOCK(un->un_dirvp);
return error;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VOP_UNLOCK(un->un_dirvp);
error = VOP_OPEN(vp, fmode, cred);
if (error) {
vput(vp);
return error;
}
vp->v_writecount++;
VOP_UNLOCK(vp);
*vpp = vp;
return 0;
}
int
union_vn_close(struct vnode *vp, int fmode, kauth_cred_t cred, struct lwp *l)
{
if (fmode & FWRITE)
--vp->v_writecount;
return (VOP_CLOSE(vp, fmode, cred));
}
void
union_removed_upper(struct union_node *un)
{
struct vnode *vp = UNIONTOV(un);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#if 1
/*
* We do not set the uppervp to NULLVP here, because lowervp
* may also be NULLVP, so this routine would end up creating
* a bogus union node with no upper or lower VP (that causes
* pain in many places that assume at least one VP exists).
* Since we've removed this node from the cache hash chains,
* it won't be found again. When all current holders
* release it, union_inactive() will vgone() it.
*/
union_diruncache(un);
#else
union_newupper(un, NULLVP);
#endif
VOP_UNLOCK(vp);
mutex_enter(&uhash_lock);
if (un->un_cflags & UN_CACHED) {
un->un_cflags &= ~UN_CACHED;
LIST_REMOVE(un, un_cache);
}
mutex_exit(&uhash_lock);
}
#if 0
struct vnode *
union_lowervp(struct vnode *vp)
{
struct union_node *un = VTOUNION(vp);
if ((un->un_lowervp != NULLVP) &&
(vp->v_type == un->un_lowervp->v_type)) {
if (vget(un->un_lowervp, 0, true /* wait */) == 0)
return (un->un_lowervp);
}
return (NULLVP);
}
#endif
/*
* determine whether a whiteout is needed
* during a remove/rmdir operation.
*/
int
union_dowhiteout(struct union_node *un, kauth_cred_t cred)
{
struct vattr va;
if (un->un_lowervp != NULLVP)
return (1);
if (VOP_GETATTR(un->un_uppervp, &va, cred) == 0 &&
(va.va_flags & OPAQUE))
return (1);
return (0);
}
static void
union_dircache_r(struct vnode *vp, struct vnode ***vppp, int *cntp)
{
struct union_node *un;
if (vp->v_op != union_vnodeop_p) {
if (vppp) {
vref(vp);
*(*vppp)++ = vp;
if (--(*cntp) == 0)
panic("union: dircache table too small");
} else {
(*cntp)++;
}
return;
}
un = VTOUNION(vp);
if (un->un_uppervp != NULLVP)
union_dircache_r(un->un_uppervp, vppp, cntp);
if (un->un_lowervp != NULLVP)
union_dircache_r(un->un_lowervp, vppp, cntp);
}
struct vnode *
union_dircache(struct vnode *vp, struct lwp *l)
{
int cnt;
struct vnode *nvp = NULLVP;
struct vnode **vpp;
struct vnode **dircache;
int error;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
dircache = VTOUNION(vp)->un_dircache;
nvp = NULLVP;
if (dircache == 0) {
cnt = 0;
union_dircache_r(vp, 0, &cnt);
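/* reserve an extra slot for the terminating NULLVP */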
cnt++;
dircache = (struct vnode **)
malloc(cnt * sizeof(struct vnode *),
M_TEMP, M_WAITOK);
vpp = dircache;
union_dircache_r(vp, &vpp, &cnt);
VTOUNION(vp)->un_dircache = dircache;
*vpp = NULLVP;
vpp = dircache + 1;
} else {
vpp = dircache;
do {
if (*vpp++ == VTOUNION(vp)->un_lowervp)
break;
} while (*vpp != NULLVP);
}
if (*vpp == NULLVP)
goto out;
vref(*vpp);
error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0,
NULLVP, *vpp, 0);
if (!error) {
vn_lock(nvp, LK_EXCLUSIVE | LK_RETRY);
VTOUNION(vp)->un_dircache = 0;
VTOUNION(nvp)->un_hooknode = true;
VTOUNION(nvp)->un_dircache = dircache;
}
out:
VOP_UNLOCK(vp);
return (nvp);
}
void
union_diruncache(struct union_node *un)
{
struct vnode **vpp;
KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
if (un->un_dircache != 0) {
for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
vrele(*vpp);
free(un->un_dircache, M_TEMP);
un->un_dircache = 0;
}
}
/*
* Check whether the node can be removed with rmdir (i.e. check that it is empty).
*/
int
union_check_rmdir(struct union_node *un, kauth_cred_t cred)
{
int dirlen, eofflag, error;
char *dirbuf;
struct vattr va;
struct vnode *tvp;
struct dirent *dp, *edp;
struct componentname cn;
struct iovec aiov;
struct uio auio;
KASSERT(un->un_uppervp != NULL);
/* Check upper for being opaque. */
KASSERT(VOP_ISLOCKED(un->un_uppervp));
error = VOP_GETATTR(un->un_uppervp, &va, cred);
if (error || (va.va_flags & OPAQUE))
return error;
if (un->un_lowervp == NULL)
return 0;
/* Check lower for being empty. */
vn_lock(un->un_lowervp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(un->un_lowervp, &va, cred);
if (error) {
VOP_UNLOCK(un->un_lowervp);
return error;
}
dirlen = va.va_blocksize;
dirbuf = kmem_alloc(dirlen, KM_SLEEP);
/* error = 0; */
eofflag = 0;
auio.uio_offset = 0;
do {
aiov.iov_len = dirlen;
aiov.iov_base = dirbuf;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = aiov.iov_len;
auio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&auio);
error = VOP_READDIR(un->un_lowervp, &auio, cred, &eofflag,
NULL, NULL);
if (error)
break;
edp = (struct dirent *)&dirbuf[dirlen - auio.uio_resid];
for (dp = (struct dirent *)dirbuf;
error == 0 && dp < edp;
dp = (struct dirent *)((char *)dp + dp->d_reclen)) {
if (dp->d_reclen == 0) {
error = ENOTEMPTY;
break;
}
if (dp->d_type == DT_WHT ||
(dp->d_namlen == 1 && dp->d_name[0] == '.') ||
(dp->d_namlen == 2 && !memcmp(dp->d_name, "..", 2)))
continue;
/* Check for presence in the upper layer. */
cn.cn_nameiop = LOOKUP;
cn.cn_flags = ISLASTCN | RDONLY;
cn.cn_cred = cred;
cn.cn_nameptr = dp->d_name;
cn.cn_namelen = dp->d_namlen;
error = VOP_LOOKUP(un->un_uppervp, &tvp, &cn);
if (error == ENOENT && (cn.cn_flags & ISWHITEOUT)) {
error = 0;
continue;
}
if (error == 0)
vrele(tvp);
error = ENOTEMPTY;
}
} while (error == 0 && !eofflag);
kmem_free(dirbuf, dirlen);
VOP_UNLOCK(un->un_lowervp);
return error;
}
/*
* This hook is called from vn_readdir() to switch to the lower
* directory after the upper directory has been read.
*/
int
union_readdirhook(struct vnode **vpp, struct file *fp, struct lwp *l)
{
struct vnode *vp = *vpp, *lvp;
struct vattr va;
int error;
if (vp->v_op != union_vnodeop_p)
return (0);
/*
* If the directory is opaque,
* then don't show lower entries
*/
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &va, fp->f_cred);
VOP_UNLOCK(vp);
if (error || (va.va_flags & OPAQUE))
return error;
if ((lvp = union_dircache(vp, l)) == NULLVP)
return (0);
error = VOP_OPEN(lvp, FREAD, fp->f_cred);
if (error) {
vput(lvp);
return (error);
}
VOP_UNLOCK(lvp);
fp->f_vnode = lvp;
fp->f_offset = 0;
error = vn_close(vp, FREAD, fp->f_cred);
if (error)
return (error);
*vpp = lvp;
return (0);
}
/* $NetBSD: vfs_cache.c,v 1.156 2023/10/02 21:50:18 ad Exp $ */
/*-
* Copyright (c) 2008, 2019, 2020, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94
*/
/*
* Name caching:
*
* Names found by directory scans are retained in a cache for future
* reference. It is managed LRU, so frequently used names will hang
* around. The cache is indexed by hash value obtained from the name.
*
* The name cache is the brainchild of Robert Elz and was introduced in
* 4.3BSD. See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk
* McKusick, May 21 1984.
*
* Data structures:
*
* Most Unix namecaches very sensibly use a global hash table to index
* names. The global hash table works well, but can cause concurrency
* headaches for the kernel hacker. In the NetBSD 10.0 implementation
* we are not sensible, and use a per-directory data structure to index
* names, but the cache otherwise functions the same.
*
* The index is a red-black tree. It should not be difficult to
* experiment with other types of index, however note that a tree
* can trivially be made to support lockless lookup.
*
* Each cached name is stored in a struct namecache, along with a
* pointer to the associated vnode (nc_vp). Names longer than a
* maximum length of NCHNAMLEN are allocated with kmem_alloc(); they
* occur infrequently, and names shorter than this are stored directly
* in struct namecache. If it is a "negative" entry (i.e. for a name
* that is known NOT to exist), the vnode pointer will be NULL.
*
* In practice this implementation is not any slower than the hash
* table that preceded it and in some cases it significantly
* outperforms the hash table. Some reasons why this might be:
*
* - natural partitioning provided by the file system structure, which
* the prior implementation discarded (global hash table).
* - worst case tree traversal of O(log n), the hash table could have
* many collisions.
* - minimized cache misses & total L2/L3 CPU cache footprint; struct
* namecache and vnode_impl_t are laid out to keep cache footprint
* minimal in the lookup path; no hash table buckets to cache.
* - minimized number of conditionals & string comparisons.
*
* For a directory with 3 cached names for 3 distinct vnodes, the
* various vnodes and namecache structs would be connected like this
* (the root is at the bottom of the diagram):
*
* ...
* ^
* |- vi_nc_tree
* |
* +----o----+ +---------+ +---------+
* | VDIR | | VCHR | | VREG |
* | vnode o-----+ | vnode o-----+ | vnode o------+
* +---------+ | +---------+ | +---------+ |
* ^ | ^ | ^ |
* |- nc_vp |- vi_nc_list |- nc_vp |- vi_nc_list |- nc_vp |
* | | | | | |
* +----o----+ | +----o----+ | +----o----+ |
* +---onamecache|<----+ +---onamecache|<----+ +---onamecache|<-----+
* | +---------+ | +---------+ | +---------+
* | ^ | ^ | ^
* | | | | | |
* | | +----------------------+ | |
* |-nc_dvp | +-------------------------------------------------+
* | |/- vi_nc_tree | |
* | | |- nc_dvp |- nc_dvp
* | +----o----+ | |
* +-->| VDIR |<----------+ |
* | vnode |<------------------------------------+
* +---------+
*
* START HERE
*
* Replacement:
*
* As the cache becomes full, old and unused entries are purged as new
* entries are added. The synchronization overhead in maintaining a
* strict ordering would be prohibitive, so the VM system's "clock" or
* "second chance" page replacement algorithm is aped here. New
* entries go to the tail of the active list. After they age out and
* reach the head of the list, they are moved to the tail of the
* inactive list. Any use of the deactivated cache entry reactivates
* it, saving it from impending doom; if not reactivated, the entry
* eventually reaches the head of the inactive list and is purged.
*
* Concurrency:
*
* From a performance perspective, cache_lookup(nameiop == LOOKUP) is
* what really matters; insertion of new entries with cache_enter() is
* comparatively infrequent, and overshadowed by the cost of expensive
* file system metadata operations (which may involve disk I/O). We
* therefore want to make everything as simple as possible in the lookup path.
*
* struct namecache is mostly stable except for list and tree related
* entries, changes to which don't affect the cached name or vnode.
* For changes to name+vnode, entries are purged in preference to
* modifying them.
*
* Read access to namecache entries is made via tree, list, or LRU
* list. A lock corresponding to the direction of access should be
* held. See definition of "struct namecache" in src/sys/namei.src,
* and the definition of "struct vnode" for the particulars.
*
* Per-CPU statistics and LRU list totals are read unlocked, since an
* approximate value is OK. We maintain 32-bit sized per-CPU counters
* and 64-bit global counters since 32-bit sized counters can be
* observed locklessly while the global counters are protected by a
* mutex.
*
* The lock order is:
*
* 1) vi->vi_nc_lock (tree or parent -> child direction,
* used during forward lookup)
*
* 2) vi->vi_nc_listlock (list or child -> parent direction,
* used during reverse lookup)
*
* 3) cache_lru_lock (LRU list direction, used during reclaim)
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.156 2023/10/02 21:50:18 ad Exp $");
#define __NAMECACHE_PRIVATE
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_dtrace.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/errno.h>
#include <sys/evcnt.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vnode_impl.h>
#include <miscfs/genfs/genfs.h>
/*
* Assert that data structure layout hasn't changed unintentionally.
*/
#ifdef _LP64
CTASSERT(sizeof(struct namecache) == 128);
#else
CTASSERT(sizeof(struct namecache) == 64);
#endif
CTASSERT(NC_NLEN_MASK >= MAXPATHLEN);
static void cache_activate(struct namecache *);
static void cache_update_stats(void *);
static int cache_compare_nodes(void *, const void *, const void *);
static void cache_deactivate(void);
static void cache_reclaim(void);
static int cache_stat_sysctl(SYSCTLFN_ARGS);
/*
* Global pool cache.
*/
static pool_cache_t cache_pool __read_mostly;
/*
* LRU replacement.
*/
enum cache_lru_id {
LRU_ACTIVE,
LRU_INACTIVE,
LRU_COUNT
};
static struct {
TAILQ_HEAD(, namecache) list[LRU_COUNT];
u_int count[LRU_COUNT];
} cache_lru __cacheline_aligned;
static kmutex_t cache_lru_lock __cacheline_aligned;
/*
* Cache effectiveness statistics. nchstats holds system-wide total.
*/
struct nchstats nchstats;
struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
struct nchcpu {
struct nchstats_percpu cur;
struct nchstats_percpu last;
};
static callout_t cache_stat_callout;
static kmutex_t cache_stat_lock __cacheline_aligned;
#define COUNT(f) do { \
lwp_t *l = curlwp; \
KPREEMPT_DISABLE(l); \
struct nchcpu *nchcpu = curcpu()->ci_data.cpu_nch; \
nchcpu->cur.f++; \
KPREEMPT_ENABLE(l); \
} while (/* CONSTCOND */ 0);
#define UPDATE(nchcpu, f) do { \
uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \
nchstats.f += (uint32_t)(cur - nchcpu->last.f); \
nchcpu->last.f = cur; \
} while (/* CONSTCOND */ 0)
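/*
 * Illustrative sketch (not part of the original source): UPDATE() relies on
 * unsigned 32-bit subtraction being modular, so the delta it folds into the
 * 64-bit global stays correct even if the per-CPU counter has wrapped since
 * the last update.  A hypothetical standalone version of that step:
 */
#if 0
static uint64_t example_total;		/* plays the role of nchstats.f */

static void
example_fold(uint32_t cur, uint32_t *lastp)
{
	/* Correct even if 'cur' wrapped past UINT32_MAX since *lastp. */
	example_total += (uint32_t)(cur - *lastp);
	*lastp = cur;
}
#endif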
/*
* Tunables. cache_maxlen replaces the historical doingcache:
* set it zero to disable caching for debugging purposes.
*/
int cache_lru_maxdeact __read_mostly = 2; /* max # to deactivate */
int cache_lru_maxscan __read_mostly = 64; /* max # to scan/reclaim */
int cache_maxlen __read_mostly = NC_NLEN_MASK; /* max name length to cache */
int cache_stat_interval __read_mostly = 300; /* in seconds */
/*
* sysctl stuff.
*/
static struct sysctllog *cache_sysctllog;
/*
* This is a dummy name that cannot normally occur anywhere in the cache
* or the file system. It's used when caching the root vnode of mounted file
* systems. The name is attached to the directory that the file system is
* mounted on.
*/
static const char cache_mp_name[] = "";
static const int cache_mp_nlen = sizeof(cache_mp_name) - 1;
/*
* Red-black tree stuff.
*/
static const rb_tree_ops_t cache_rbtree_ops = {
.rbto_compare_nodes = cache_compare_nodes,
.rbto_compare_key = cache_compare_nodes,
.rbto_node_offset = offsetof(struct namecache, nc_tree),
.rbto_context = NULL
};
/*
* dtrace probes.
*/
SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t");
SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *",
"char *", "size_t");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *",
"char *", "size_t");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *",
"char *", "size_t");
SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *",
"struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *",
"int");
SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int");
SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *",
"char *", "size_t");
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *",
"char *", "size_t");
/*
* rbtree: compare two nodes.
*/
static int
cache_compare_nodes(void *context, const void *n1, const void *n2)
{
const struct namecache *nc1 = n1;
const struct namecache *nc2 = n2;
if (nc1->nc_key < nc2->nc_key) {
return -1;
}
if (nc1->nc_key > nc2->nc_key) {
return 1;
}
KASSERT(NC_NLEN(nc1) == NC_NLEN(nc2));
return memcmp(nc1->nc_name, nc2->nc_name, NC_NLEN(nc1));
}
/*
* Compute a key value for the given name. The name length is encoded in
* the key value to try and improve uniqueness, and so that length doesn't
* need to be compared separately for string comparisons.
*/
static uintptr_t
cache_key(const char *name, size_t nlen)
{
uintptr_t key;
KASSERT((nlen & ~NC_NLEN_MASK) == 0);
key = hash32_buf(name, nlen, HASH32_STR_INIT);
return (key << NC_NLEN_BITS) | (uintptr_t)nlen;
}
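/*
 * Illustrative sketch (not part of the original source): because the name
 * length occupies the low NC_NLEN_BITS of the key, names of different
 * lengths can never produce equal keys, and NC_NLEN() can recover the
 * length without a separate comparison.  A hypothetical check:
 */
#if 0
static void
example_key_layout(void)
{
	uintptr_t k1 = cache_key("a", 1);
	uintptr_t k2 = cache_key("ab", 2);

	KASSERT((k1 & NC_NLEN_MASK) == 1);
	KASSERT((k2 & NC_NLEN_MASK) == 2);
	KASSERT(k1 != k2);	/* lengths differ, so the keys must differ */
}
#endif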
/*
* Remove an entry from the cache. vi_nc_lock must be held, and if dir2node
* is true, then we're locking in the conventional direction and the list
* lock will be acquired when removing the entry from the vnode list.
*/
static void
cache_remove(struct namecache *ncp, const bool dir2node)
{
struct vnode *vp, *dvp = ncp->nc_dvp;
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
size_t namelen = NC_NLEN(ncp);
KASSERT(rw_write_held(&dvi->vi_nc_lock));
KASSERT(cache_key(ncp->nc_name, namelen) == ncp->nc_key);
KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp);
SDT_PROBE(vfs, namecache, invalidate, done, ncp, 0, 0, 0, 0);
/*
* Remove from the vnode's list. This excludes cache_revlookup(),
* and then it's safe to remove from the LRU lists.
*/
if ((vp = ncp->nc_vp) != NULL) {
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
if (__predict_true(dir2node)) {
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
rw_exit(&vi->vi_nc_listlock);
} else {
TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
}
}
/* Remove from the directory's rbtree. */
rb_tree_remove_node(&dvi->vi_nc_tree, ncp);
/* Remove from the LRU lists. */
mutex_enter(&cache_lru_lock);
TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
cache_lru.count[ncp->nc_lrulist]--;
mutex_exit(&cache_lru_lock);
/* Finally, free it. */
if (namelen > NCHNAMLEN) {
size_t sz = offsetof(struct namecache, nc_name[namelen]);
kmem_free(ncp, sz);
} else {
pool_cache_put(cache_pool, ncp);
}
}
/*
* Find a single cache entry and return it. vi_nc_lock must be held.
*/
static struct namecache * __noinline
cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen,
uintptr_t key)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct rb_node *node = dvi->vi_nc_tree.rbt_root;
struct namecache *ncp;
enum cache_lru_id lrulist;
int diff;
KASSERT(namelen <= MAXPATHLEN);
KASSERT(rw_lock_held(&dvi->vi_nc_lock));
/*
* Search the RB tree for the key. This is an inlined lookup
* tailored for exactly what's needed here that turns out to be
* quite a bit faster than using rb_tree_find_node().
*
* For a matching key memcmp() needs to be called once to confirm
* that the correct name has been found. Very rarely there will be
* a key value collision and the search will continue.
*/
for (;;) {
if (__predict_false(RB_SENTINEL_P(node))) {
return NULL;
}
ncp = (struct namecache *)node;
KASSERT((void *)&ncp->nc_tree == (void *)ncp);
KASSERT(ncp->nc_dvp == dvp);
if (ncp->nc_key == key) {
KASSERT(NC_NLEN(ncp) == namelen);
diff = memcmp(ncp->nc_name, name, namelen);
if (__predict_true(diff == 0)) {
break;
}
node = node->rb_nodes[diff < 0];
} else {
node = node->rb_nodes[ncp->nc_key < key];
}
}
/*
* If the entry is on the wrong LRU list, requeue it. This is an
* unlocked check, but it will rarely be wrong and even then there
* will be no harm caused.
*/
lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
if (__predict_false(lrulist != LRU_ACTIVE)) {
cache_activate(ncp);
}
return ncp;
}
/*
* Look for the name in the cache. We don't do this
* if the segment name is long, simply so the cache can avoid
* holding long names (which would either waste space, or
* add greatly to the complexity).
*
* Lookup is called with DVP pointing to the directory to search,
* and CNP providing the name of the entry being sought: cn_nameptr
* is the name, cn_namelen is its length, and cn_flags is the flags
* word from the namei operation.
*
* DVP must be locked.
*
* There are three possible non-error return states:
* 1. Nothing was found in the cache. Nothing is known about
* the requested name.
* 2. A negative entry was found in the cache, meaning that the
* requested name definitely does not exist.
* 3. A positive entry was found in the cache, meaning that the
* requested name does exist and that we are providing the
* vnode.
* In these cases the results are:
* 1. 0 returned; VN is set to NULL.
* 2. 1 returned; VN is set to NULL.
* 3. 1 returned; VN is set to the vnode found.
*
* The additional result argument ISWHT is set to zero, unless a
* negative entry is found that was entered as a whiteout, in which
* case ISWHT is set to one.
*
* The ISWHT_RET argument pointer may be null. In this case an
* assertion is made that the whiteout flag is not set. File systems
* that do not support whiteouts can/should do this.
*
* Filesystems that do support whiteouts should add ISWHITEOUT to
* cnp->cn_flags if ISWHT comes back nonzero.
*
* When a vnode is returned, it is locked, as per the vnode lookup
* locking protocol.
*
* There is no way for this function to fail, in the sense of
* generating an error that requires aborting the namei operation.
*
* (Prior to October 2012, this function returned an integer status,
* and a vnode, and mucked with the flags word in CNP for whiteouts.
* The integer status was -1 for "nothing found", ENOENT for "a
* negative entry found", 0 for "a positive entry found", and possibly
* other errors, and the value of VN might or might not have been set
* depending on what error occurred.)
*/
bool
cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
uint32_t nameiop, uint32_t cnflags,
int *iswht_ret, struct vnode **vn_ret)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
struct vnode *vp;
uintptr_t key;
int error;
bool hit;
krw_t op;
KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
/* Establish default result values */
if (iswht_ret != NULL) {
*iswht_ret = 0;
}
*vn_ret = NULL;
if (__predict_false(namelen > cache_maxlen)) {
SDT_PROBE(vfs, namecache, lookup, toolong, dvp,
name, namelen, 0, 0);
COUNT(ncs_long);
return false;
}
/* Compute the key up front - don't need the lock. */
key = cache_key(name, namelen);
/* Could the entry be purged below? */
if ((cnflags & ISLASTCN) != 0 && ((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) {
op = RW_WRITER;
} else {
op = RW_READER;
}
/* Now look for the name. */
rw_enter(&dvi->vi_nc_lock, op);
ncp = cache_lookup_entry(dvp, name, namelen, key);
if (__predict_false(ncp == NULL)) {
rw_exit(&dvi->vi_nc_lock);
COUNT(ncs_miss);
SDT_PROBE(vfs, namecache, lookup, miss, dvp,
name, namelen, 0, 0);
return false;
}
if (__predict_false((cnflags & MAKEENTRY) == 0)) {
/*
* Last component and we are renaming or deleting,
* the cache entry is invalid, or otherwise don't
* want cache entry to exist.
*/
KASSERT((cnflags & ISLASTCN) != 0);
cache_remove(ncp, true);
rw_exit(&dvi->vi_nc_lock);
COUNT(ncs_badhits);
return false;
}
if ((vp = ncp->nc_vp) == NULL) {
if (iswht_ret != NULL) {
/*
* Restore the ISWHITEOUT flag saved earlier.
*/
*iswht_ret = ncp->nc_whiteout;
} else {
KASSERT(!ncp->nc_whiteout);
}
if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) {
/*
* Last component and we are preparing to create
* the named object, so flush the negative cache
* entry.
*/
COUNT(ncs_badhits);
cache_remove(ncp, true);
hit = false;
} else {
COUNT(ncs_neghits);
SDT_PROBE(vfs, namecache, lookup, hit, dvp, name,
namelen, 0, 0);
/* found neg entry; vn is already null from above */
hit = true;
}
rw_exit(&dvi->vi_nc_lock);
return hit;
}
error = vcache_tryvget(vp);
rw_exit(&dvi->vi_nc_lock);
if (error) {
KASSERT(error == EBUSY);
/*
* This vnode is being cleaned out.
* XXX badhits?
*/
COUNT(ncs_falsehits);
return false;
}
COUNT(ncs_goodhits);
SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
/* found it */
*vn_ret = vp;
return true;
}
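/*
 * Illustrative sketch (not part of the original source): how a file
 * system's lookup routine might consume the three result states described
 * above.  The function and its fallback are hypothetical; only the
 * cache_lookup() contract is taken from the comment.
 */
#if 0
static int
example_fs_lookup(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp)
{
	int iswht = 0;

	*vpp = NULL;
	if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
	    cnp->cn_nameiop, cnp->cn_flags, &iswht, vpp)) {
		if (iswht)
			cnp->cn_flags |= ISWHITEOUT;
		/* Negative entry -> ENOENT, positive entry -> success. */
		return *vpp == NULL ? ENOENT : 0;
	}
	/* Nothing known: fall back to scanning the directory itself. */
	return EOPNOTSUPP;	/* placeholder for the real scan */
}
#endif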
/*
* Version of the above without the nameiop argument, for NFS.
*/
bool
cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
uint32_t cnflags,
int *iswht_ret, struct vnode **vn_ret)
{
return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY,
iswht_ret, vn_ret);
}
/*
* Used by namei() to walk down a path, component by component by looking up
* names in the cache. The node locks are chained along the way: a parent's
* lock is not dropped until the child's is acquired.
*/
bool
cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen,
struct vnode **vn_ret, krwlock_t **plock,
kauth_cred_t cred)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
krwlock_t *oldlock, *newlock;
struct vnode *vp;
uintptr_t key;
int error;
KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
/* If disabled, or file system doesn't support this, bail out. */
if (__predict_false((dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) {
return false;
}
if (__predict_false(namelen > cache_maxlen)) {
COUNT(ncs_long);
return false;
}
/* Compute the key up front - don't need the lock. */
key = cache_key(name, namelen);
/*
* Acquire the directory lock. Once we have that, we can drop the
* previous one (if any).
*
* The two lock holds mean that the directory can't go away while
* here: the directory must be purged with cache_purge() before
* being freed, and both parent & child's vi_nc_lock must be taken
* before that point is passed.
*
* However if there's no previous lock, like at the root of the
* chain, then "dvp" must be referenced to prevent dvp going away
* before we get its lock.
*
* Note that the two locks can be the same if looking up a dot, for
* example: /usr/bin/. If looking up the parent (..) we can't wait
* on the lock as child -> parent is the wrong direction.
*/
if (*plock != &dvi->vi_nc_lock) {
oldlock = *plock;
newlock = &dvi->vi_nc_lock;
if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) {
return false;
}
} else {
oldlock = NULL;
newlock = NULL;
if (*plock == NULL) {
KASSERT(vrefcnt(dvp) > 0);
}
}
/*
* First up check if the user is allowed to look up files in this
* directory.
*/
if (cred != FSCRED) {
if (dvi->vi_nc_mode == VNOVAL) {
if (newlock != NULL) {
rw_exit(newlock);
}
return false;
}
KASSERT(dvi->vi_nc_uid != VNOVAL);
KASSERT(dvi->vi_nc_gid != VNOVAL);
error = kauth_authorize_vnode(cred,
KAUTH_ACCESS_ACTION(VEXEC,
dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL,
genfs_can_access(dvp, cred, dvi->vi_nc_uid, dvi->vi_nc_gid,
dvi->vi_nc_mode & ALLPERMS, NULL, VEXEC));
if (error != 0) {
if (newlock != NULL) {
rw_exit(newlock);
}
COUNT(ncs_denied);
return false;
}
}
/*
* Now look for a matching cache entry.
*/
ncp = cache_lookup_entry(dvp, name, namelen, key);
if (__predict_false(ncp == NULL)) {
if (newlock != NULL) {
rw_exit(newlock);
}
COUNT(ncs_miss);
SDT_PROBE(vfs, namecache, lookup, miss, dvp,
name, namelen, 0, 0);
return false;
}
if ((vp = ncp->nc_vp) == NULL) {
/* found negative entry; vn is already null from above */
KASSERT(namelen != cache_mp_nlen);
KASSERT(name != cache_mp_name);
COUNT(ncs_neghits);
} else {
COUNT(ncs_goodhits); /* XXX can be "badhits" */
}
SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
/*
* Return with the directory lock still held. It will either be
* returned to us with another call to cache_lookup_linked() when
* looking up the next component, or the caller will release it
* manually when finished.
*/
if (oldlock) {
rw_exit(oldlock);
}
if (newlock) {
*plock = newlock;
}
*vn_ret = vp;
return true;
}
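/*
 * Illustrative sketch (not part of the original source): the lock-chaining
 * pattern described above, seen from the caller's side.  The walk below is
 * hypothetical; it assumes the caller already holds a reference on dvp and
 * releases whichever vi_nc_lock is left when the walk stops.
 */
#if 0
static void
example_walk(struct vnode *dvp, kauth_cred_t cred)
{
	krwlock_t *plock = NULL;
	struct vnode *vp;

	if (cache_lookup_linked(dvp, "usr", 3, &vp, &plock, cred) &&
	    vp != NULL &&
	    cache_lookup_linked(vp, "bin", 3, &vp, &plock, cred) &&
	    vp != NULL) {
		/* vp names "usr/bin" under dvp, valid while plock is held. */
	}
	if (plock != NULL)
		rw_exit(plock);
}
#endif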
/*
* Scan cache looking for name of directory entry pointing at vp.
* Will not search for "." or "..".
*
* If the lookup succeeds the vnode is referenced and stored in dvpp.
*
* If bufp is non-NULL, also place the name in the buffer which starts
* at bufp, immediately before *bpp, and move bpp backwards to point
* at the start of it. (Yes, this is a little baroque, but it's done
* this way to cater to the whims of getcwd).
*
* Returns 0 on success, -1 on cache miss, positive errno on failure.
*/
int
cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp,
bool checkaccess, accmode_t accmode)
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
struct namecache *ncp;
enum cache_lru_id lrulist;
struct vnode *dvp;
int error, nlen;
char *bp;
KASSERT(vp != NULL);
if (cache_maxlen == 0)
goto out;
rw_enter(&vi->vi_nc_listlock, RW_READER);
if (checkaccess) {
/*
* Check if the user is allowed to see. NOTE: this is
* checking for access on the "wrong" directory. getcwd()
* wants to see that there is access on every component
* along the way, not that there is access to any individual
* component. Don't use this to check you can look in vp.
*
* I don't like it, I didn't come up with it, don't blame me!
*/
if (vi->vi_nc_mode == VNOVAL) {
rw_exit(&vi->vi_nc_listlock);
return -1;
}
KASSERT(vi->vi_nc_uid != VNOVAL);
KASSERT(vi->vi_nc_gid != VNOVAL);
error = kauth_authorize_vnode(kauth_cred_get(),
KAUTH_ACCESS_ACTION(VEXEC, vp->v_type, vi->vi_nc_mode &
ALLPERMS), vp, NULL, genfs_can_access(vp, curlwp->l_cred,
vi->vi_nc_uid, vi->vi_nc_gid, vi->vi_nc_mode & ALLPERMS,
NULL, accmode));
if (error != 0) {
rw_exit(&vi->vi_nc_listlock);
COUNT(ncs_denied);
return EACCES;
}
}
TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) {
KASSERT(ncp->nc_vp == vp);
KASSERT(ncp->nc_dvp != NULL);
nlen = NC_NLEN(ncp);
/*
* Ignore mountpoint entries.
*/
if (nlen == cache_mp_nlen) {
continue;
}
/*
* The queue is partially sorted. Once we hit dots, nothing
* else remains but dots and dotdots, so bail out.
*/
if (ncp->nc_name[0] == '.') {
if (nlen == 1 ||
(nlen == 2 && ncp->nc_name[1] == '.')) {
break;
}
}
/*
* Record a hit on the entry. This is an unlocked read but
* even if wrong it doesn't matter too much.
*/
lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
if (lrulist != LRU_ACTIVE) {
cache_activate(ncp);
}
if (bufp) {
bp = *bpp;
bp -= nlen;
if (bp <= bufp) {
*dvpp = NULL;
rw_exit(&vi->vi_nc_listlock);
SDT_PROBE(vfs, namecache, revlookup,
fail, vp, ERANGE, 0, 0, 0);
return (ERANGE);
}
memcpy(bp, ncp->nc_name, nlen);
*bpp = bp;
}
dvp = ncp->nc_dvp;
error = vcache_tryvget(dvp);
rw_exit(&vi->vi_nc_listlock);
if (error) {
KASSERT(error == EBUSY);
if (bufp)
(*bpp) += nlen;
*dvpp = NULL;
SDT_PROBE(vfs, namecache, revlookup, fail, vp,
error, 0, 0, 0);
return -1;
}
*dvpp = dvp;
SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp,
0, 0, 0);
COUNT(ncs_revhits);
return (0);
}
rw_exit(&vi->vi_nc_listlock);
COUNT(ncs_revmiss);
out:
*dvpp = NULL;
return (-1);
}
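/*
 * Illustrative sketch (not part of the original source): the backwards
 * buffer convention described above, as a getcwd-style caller might use it.
 * The wrapper and its separator handling are hypothetical.
 */
#if 0
static int
example_prepend_name(struct vnode *vp, char *buf, char **bpp,
    struct vnode **dvpp)
{
	int error;

	/* On entry *bpp points just past where the name should end. */
	error = cache_revlookup(vp, dvpp, bpp, buf, false, 0);
	if (error == 0 && *bpp > buf)
		*--(*bpp) = '/';	/* prepend a separator */
	return error;
}
#endif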
/*
* Add an entry to the cache.
*/
void
cache_enter(struct vnode *dvp, struct vnode *vp,
const char *name, size_t namelen, uint32_t cnflags)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp, *oncp;
int total;
KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
/* First, check whether we can/should add a cache entry. */
if ((cnflags & MAKEENTRY) == 0 ||
__predict_false(namelen > cache_maxlen)) {
SDT_PROBE(vfs, namecache, enter, toolong, vp, name, namelen,
0, 0);
return;
}
SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0);
/*
* Reclaim some entries if over budget. This is an unlocked check,
* but that's fine; we just need to catch up with things eventually,
* and going over budget temporarily does no harm.
*/
total = atomic_load_relaxed(&cache_lru.count[LRU_ACTIVE]);
total += atomic_load_relaxed(&cache_lru.count[LRU_INACTIVE]);
if (__predict_false(total > desiredvnodes)) {
cache_reclaim();
}
/* Now allocate a fresh entry. */
if (__predict_true(namelen <= NCHNAMLEN)) {
ncp = pool_cache_get(cache_pool, PR_WAITOK);
} else {
size_t sz = offsetof(struct namecache, nc_name[namelen]);
ncp = kmem_alloc(sz, KM_SLEEP);
}
/*
* Fill in cache info. For negative hits, save the ISWHITEOUT flag
* so we can restore it later when the cache entry is used again.
*/
ncp->nc_vp = vp;
ncp->nc_dvp = dvp;
ncp->nc_key = cache_key(name, namelen);
ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0);
memcpy(ncp->nc_name, name, namelen);
/*
* Insert to the directory. Concurrent lookups may race for a cache
* entry. If there's an entry there already, purge it.
*/
rw_enter(&dvi->vi_nc_lock, RW_WRITER);
oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
if (oncp != ncp) {
KASSERT(oncp->nc_key == ncp->nc_key);
KASSERT(NC_NLEN(oncp) == NC_NLEN(ncp));
KASSERT(memcmp(oncp->nc_name, name, namelen) == 0);
cache_remove(oncp, true);
oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
KASSERT(oncp == ncp);
}
/*
* With the directory lock still held, insert to the tail of the
* ACTIVE LRU list (new) and take the opportunity to incrementally
* balance the lists.
*/
mutex_enter(&cache_lru_lock);
ncp->nc_lrulist = LRU_ACTIVE;
cache_lru.count[LRU_ACTIVE]++;
TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
cache_deactivate();
mutex_exit(&cache_lru_lock);
/*
* Finally, insert to the vnode and unlock. With everything set up
* it's safe to let cache_revlookup() see the entry. Partially sort
* the per-vnode list: dots go to back so cache_revlookup() doesn't
* have to consider them.
*/
if (vp != NULL) {
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
if ((namelen == 1 && name[0] == '.') ||
(namelen == 2 && name[0] == '.' && name[1] == '.')) {
TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list);
} else {
TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list);
}
rw_exit(&vi->vi_nc_listlock);
}
rw_exit(&dvi->vi_nc_lock);
}
/*
* Set identity info in cache for a vnode. We only care about directories
* so ignore other updates. The cached info may be marked invalid if the
* inode has an ACL.
*/
void
cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid, bool valid)
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);

if (vp->v_type == VDIR) {
/* Grab both locks, for forward & reverse lookup. */
rw_enter(&vi->vi_nc_lock, RW_WRITER);
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
if (valid) {
vi->vi_nc_mode = mode;
vi->vi_nc_uid = uid;
vi->vi_nc_gid = gid;
} else {
vi->vi_nc_mode = VNOVAL;
vi->vi_nc_uid = VNOVAL;
vi->vi_nc_gid = VNOVAL;
}
rw_exit(&vi->vi_nc_listlock);
rw_exit(&vi->vi_nc_lock);
}
}
/*
* Return true if we have identity for the given vnode, and use this as an
* opportunity to confirm that everything squares up.
*
* Because of shared code, some file systems could provide partial
* information, missing some updates, so check the mount flag too.
*/
bool
cache_have_id(struct vnode *vp)
{
if (vp->v_type == VDIR &&
(vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0 &&
atomic_load_relaxed(&VNODE_TO_VIMPL(vp)->vi_nc_mode) != VNOVAL) {
return true;
} else {
return false;
}
}
/*
* Enter a mount point. cvp is the covered vnode, and rvp is the root of
* the mounted file system.
*/
void
cache_enter_mount(struct vnode *cvp, struct vnode *rvp)
{
KASSERT(vrefcnt(cvp) > 0);
KASSERT(vrefcnt(rvp) > 0);
KASSERT(cvp->v_type == VDIR);
KASSERT((rvp->v_vflag & VV_ROOT) != 0);
if (rvp->v_type == VDIR) {
cache_enter(cvp, rvp, cache_mp_name, cache_mp_nlen, MAKEENTRY);
}
}
/*
* Look up a cached mount point. Used in the strongly locked path.
*/
bool
cache_lookup_mount(struct vnode *dvp, struct vnode **vn_ret)
{
bool ret;
ret = cache_lookup(dvp, cache_mp_name, cache_mp_nlen, LOOKUP,
MAKEENTRY, NULL, vn_ret);
KASSERT((*vn_ret != NULL) == ret);
return ret;
}
/*
* Try to cross a mount point. For use with cache_lookup_linked().
*/
bool
cache_cross_mount(struct vnode **dvp, krwlock_t **plock)
{
return cache_lookup_linked(*dvp, cache_mp_name, cache_mp_nlen,
dvp, plock, FSCRED);
}
/*
* Name cache initialization, from vfs_init() when the system is booting.
*/
void
nchinit(void)
{
cache_pool = pool_cache_init(sizeof(struct namecache),
coherency_unit, 0, 0, "namecache", NULL, IPL_NONE, NULL,
NULL, NULL);
KASSERT(cache_pool != NULL);
mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE);
TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]);
TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]);
mutex_init(&cache_stat_lock, MUTEX_DEFAULT, IPL_NONE);
callout_init(&cache_stat_callout, CALLOUT_MPSAFE);
callout_setfunc(&cache_stat_callout, cache_update_stats, NULL);
callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
KASSERT(cache_sysctllog == NULL);
sysctl_createv(&cache_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "namecache_stats",
SYSCTL_DESCR("namecache statistics"),
cache_stat_sysctl, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
}
/*
* Called once for each CPU in the system as attached.
*/
void
cache_cpu_init(struct cpu_info *ci)
{
size_t sz;
sz = roundup2(sizeof(struct nchcpu), coherency_unit);
ci->ci_data.cpu_nch = kmem_zalloc(sz, KM_SLEEP);
KASSERT(((uintptr_t)ci->ci_data.cpu_nch & (coherency_unit - 1)) == 0);
}
/*
* A vnode is being allocated: set up cache structures.
*/
void
cache_vnode_init(struct vnode *vp)
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
rw_init(&vi->vi_nc_lock);
rw_init(&vi->vi_nc_listlock);
rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops);
TAILQ_INIT(&vi->vi_nc_list);
vi->vi_nc_mode = VNOVAL;
vi->vi_nc_uid = VNOVAL;
vi->vi_nc_gid = VNOVAL;
}
/*
* A vnode is being freed: finish cache structures.
*/
void
cache_vnode_fini(struct vnode *vp)
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL);
KASSERT(TAILQ_EMPTY(&vi->vi_nc_list));
rw_destroy(&vi->vi_nc_lock);
rw_destroy(&vi->vi_nc_listlock);
}
/*
* Helper for cache_purge1(): purge cache entries for the given vnode from
* all directories that the vnode is cached in.
*/
static void
cache_purge_parents(struct vnode *vp)
{
vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp);
struct vnode *dvp, *blocked;
struct namecache *ncp;
SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0);
blocked = NULL;
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) {
/*
* Locking in the wrong direction. Try for a hold on the
* directory node's lock, and if we get it then all good,
* nuke the entry and move on to the next.
*/
dvp = ncp->nc_dvp;
dvi = VNODE_TO_VIMPL(dvp);
if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
cache_remove(ncp, false);
rw_exit(&dvi->vi_nc_lock);
blocked = NULL;
continue;
}
/*
* We can't wait on the directory node's lock with our list
* lock held or the system could deadlock.
*
* Take a hold on the directory vnode to prevent it from
* being freed (taking the vnode & lock with it). Then
* wait for the lock to become available with no other locks
* held, and retry.
*
* If this happens twice in a row, give the other side a
* breather; we can do nothing until it lets go.
*/
vhold(dvp);
rw_exit(&vi->vi_nc_listlock);
rw_enter(&dvi->vi_nc_lock, RW_WRITER);
/* Do nothing. */
rw_exit(&dvi->vi_nc_lock);
holdrele(dvp);
if (blocked == dvp) {
kpause("ncpurge", false, 1, NULL);
}
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
blocked = dvp;
}
rw_exit(&vi->vi_nc_listlock);
}
/*
* Helper for cache_purge1(): purge all cache entries hanging off the given
* directory vnode.
*/
static void
cache_purge_children(struct vnode *dvp)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0);
rw_enter(&dvi->vi_nc_lock, RW_WRITER);
while ((ncp = RB_TREE_MIN(&dvi->vi_nc_tree)) != NULL) {
cache_remove(ncp, true);
}
rw_exit(&dvi->vi_nc_lock);
}
/*
* Helper for cache_purge1(): purge cache entry from the given vnode,
* finding it by name.
*/
static void
cache_purge_name(struct vnode *dvp, const char *name, size_t namelen)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
uintptr_t key;
SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0);
key = cache_key(name, namelen);
rw_enter(&dvi->vi_nc_lock, RW_WRITER);
ncp = cache_lookup_entry(dvp, name, namelen, key);
if (ncp) {
cache_remove(ncp, true);
}
rw_exit(&dvi->vi_nc_lock);
}
/*
* Cache flush, a particular vnode; called when a vnode is renamed to
* hide entries that would now be invalid.
*/
void
cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
{
if (flags & PURGE_PARENTS) {
cache_purge_parents(vp);
}
if (flags & PURGE_CHILDREN) {
cache_purge_children(vp);
}
if (name != NULL) {
cache_purge_name(vp, name, namelen);
}
}
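/*
 * Illustrative sketch (not part of the original source): the common "forget
 * everything about this vnode" case built from the flags above; the tree's
 * cache_purge() convenience wrapper in sys/namei.h is along these lines.
 */
#if 0
static void
example_purge_all(struct vnode *vp)
{
	cache_purge1(vp, NULL, 0, PURGE_PARENTS | PURGE_CHILDREN);
}
#endif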
/*
* vnode filter for cache_purgevfs().
*/
static bool
cache_vdir_filter(void *cookie, vnode_t *vp)
{
return vp->v_type == VDIR;
}
/*
* Cache flush, a whole filesystem; called when filesys is umounted to
* remove entries that would now be invalid.
*/
void
cache_purgevfs(struct mount *mp)
{
struct vnode_iterator *iter;
vnode_t *dvp;
vfs_vnode_iterator_init(mp, &iter);
for (;;) {
dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL);
if (dvp == NULL) {
break;
}
cache_purge_children(dvp);
vrele(dvp);
}
vfs_vnode_iterator_destroy(iter);
}
/*
* Re-queue an entry onto the tail of the active LRU list, after it has
* scored a hit.
*/
static void
cache_activate(struct namecache *ncp)
{
mutex_enter(&cache_lru_lock);
TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
cache_lru.count[ncp->nc_lrulist]--;
cache_lru.count[LRU_ACTIVE]++;
ncp->nc_lrulist = LRU_ACTIVE;
mutex_exit(&cache_lru_lock);
}
/*
* Try to balance the LRU lists. Pick some victim entries, and re-queue
* them from the head of the active list to the tail of the inactive list.
*/
static void
cache_deactivate(void)
{
struct namecache *ncp;
int total, i;
KASSERT(mutex_owned(&cache_lru_lock));
/* If we're nowhere near budget yet, don't bother. */
total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE];
if (total < (desiredvnodes >> 1)) {
return;
}
/*
* Aim for a 1:1 ratio of active to inactive. This is to allow each
* potential victim a reasonable amount of time to cycle through the
* inactive list in order to score a hit and be reactivated, while
* trying not to cause reactivations too frequently.
*/
if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) {
return;
}
/* Move only a few at a time; will catch up eventually. */
for (i = 0; i < cache_lru_maxdeact; i++) {
ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]);
if (ncp == NULL) {
break;
}
KASSERT(ncp->nc_lrulist == LRU_ACTIVE);
ncp->nc_lrulist = LRU_INACTIVE;
TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru);
cache_lru.count[LRU_ACTIVE]--;
cache_lru.count[LRU_INACTIVE]++;
}
}
/*
* Free some entries from the cache, when we have gone over budget.
*
* We don't want to cause too much work for any individual caller, and it
* doesn't matter if we temporarily go over budget. This is also "just a
* cache" so it's not a big deal if we screw up and throw out something we
* shouldn't. So we take a relaxed attitude to this process to reduce its
* impact.
*/
static void
cache_reclaim(void)
{
struct namecache *ncp;
vnode_impl_t *dvi;
int toscan;
/*
* Scan up to a preset maximum number of entries, but no more than
* 0.8% of the total at once (to allow for very small systems).
*
* On bigger systems, do a larger chunk of work to reduce the number
* of times that cache_lru_lock is held for any length of time.
*/
mutex_enter(&cache_lru_lock);
toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7);
toscan = MAX(toscan, 1);
SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] +
cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0);
while (toscan-- != 0) {
/* First try to balance the lists. */
cache_deactivate();
/* Now look for a victim on head of inactive list (old). */
ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]);
if (ncp == NULL) {
break;
}
dvi = VNODE_TO_VIMPL(ncp->nc_dvp);
KASSERT(ncp->nc_lrulist == LRU_INACTIVE);
KASSERT(dvi != NULL);
/*
* Locking in the wrong direction. If we can't get the
* lock, the directory is actively busy, and it could also
* cause problems for the next guy in here, so send the
* entry to the back of the list.
*/
if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE],
ncp, nc_lru);
TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE],
ncp, nc_lru);
continue;
}
/*
* Now have the victim entry locked. Drop the LRU list
* lock, purge the entry, and start over. The hold on
* vi_nc_lock will prevent the vnode from vanishing until
* finished (cache_purge() will be called on dvp before it
* disappears, and that will wait on vi_nc_lock).
*/
mutex_exit(&cache_lru_lock);
cache_remove(ncp, true);
rw_exit(&dvi->vi_nc_lock);
mutex_enter(&cache_lru_lock);
}
mutex_exit(&cache_lru_lock);
}
/*
* For file system code: count a lookup that required a full re-scan of
* directory metadata.
*/
void
namecache_count_pass2(void)
{
COUNT(ncs_pass2);
}
/*
* For file system code: count a lookup that scored a hit in the directory
* metadata near the location of the last lookup.
*/
void
namecache_count_2passes(void)
{
COUNT(ncs_2passes);
}
/*
* Sum the stats from all CPUs into nchstats. This needs to run at least
* once within every window where a 32-bit counter could roll over. It's
* called regularly by timer to ensure this.
*/
static void
cache_update_stats(void *cookie)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
mutex_enter(&cache_stat_lock);
for (CPU_INFO_FOREACH(cii, ci)) {
struct nchcpu *nchcpu = ci->ci_data.cpu_nch;
UPDATE(nchcpu, ncs_goodhits);
UPDATE(nchcpu, ncs_neghits);
UPDATE(nchcpu, ncs_badhits);
UPDATE(nchcpu, ncs_falsehits);
UPDATE(nchcpu, ncs_miss);
UPDATE(nchcpu, ncs_long);
UPDATE(nchcpu, ncs_pass2);
UPDATE(nchcpu, ncs_2passes);
UPDATE(nchcpu, ncs_revhits);
UPDATE(nchcpu, ncs_revmiss);
UPDATE(nchcpu, ncs_denied);
}
if (cookie != NULL) {
memcpy(cookie, &nchstats, sizeof(nchstats));
}
/* Reset the timer; arrive back here in N minutes at latest. */
callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
mutex_exit(&cache_stat_lock);
}
/*
* Fetch the current values of the stats for sysctl.
*/
static int
cache_stat_sysctl(SYSCTLFN_ARGS)
{
struct nchstats stats;
if (oldp == NULL) {
*oldlenp = sizeof(nchstats);
return 0;
}
if (*oldlenp <= 0) {
*oldlenp = 0;
return 0;
}
/* Refresh the global stats. */
sysctl_unlock();
cache_update_stats(&stats);
sysctl_relock();
*oldlenp = MIN(sizeof(stats), *oldlenp);
return sysctl_copyout(l, &stats, oldp, *oldlenp);
}
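/*
 * Illustrative sketch (not part of the original source): the node created
 * in nchinit() is assumed to surface as "vfs.namecache_stats", so a
 * userland program could read the exported struct roughly like this
 * (hypothetical userland example, not kernel code):
 */
#if 0
#include <sys/sysctl.h>
#include <stdio.h>

static void
example_print_nchstats(void)
{
	struct nchstats ns;
	size_t len = sizeof(ns);

	if (sysctlbyname("vfs.namecache_stats", &ns, &len, NULL, 0) == 0)
		printf("good hits: %llu\n",
		    (unsigned long long)ns.ncs_goodhits);
}
#endif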
/*
* For the debugger, given the address of a vnode, print all associated
* names in the cache.
*/
#ifdef DDB
void
namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
{
struct vnode *dvp = NULL;
struct namecache *ncp;
enum cache_lru_id id;
for (id = 0; id < LRU_COUNT; id++) {
TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
if (ncp->nc_vp == vp) {
(*pr)("name %.*s\n", NC_NLEN(ncp),
ncp->nc_name);
dvp = ncp->nc_dvp;
}
}
}
if (dvp == NULL) {
(*pr)("name not found\n");
return;
}
for (id = 0; id < LRU_COUNT; id++) {
TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
if (ncp->nc_vp == dvp) {
(*pr)("parent %.*s\n", NC_NLEN(ncp),
ncp->nc_name);
}
}
}
}
#endif
/* $NetBSD: rb.c,v 1.16 2021/09/16 21:29:41 andvar Exp $ */
/*-
* Copyright (c) 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Matt Thomas <matt@3am-software.com>.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <sys/types.h>
#include <stddef.h>
#include <assert.h>
#include <stdbool.h>
#ifdef RBDEBUG
#define KASSERT(s) assert(s)
#define __rbt_unused
#else
#define KASSERT(s) do { } while (/*CONSTCOND*/ 0)
#define __rbt_unused __unused
#endif
__RCSID("$NetBSD: rb.c,v 1.16 2021/09/16 21:29:41 andvar Exp $");
#else
#include <lib/libkern/libkern.h>
__KERNEL_RCSID(0, "$NetBSD: rb.c,v 1.16 2021/09/16 21:29:41 andvar Exp $");
#ifndef DIAGNOSTIC
#define __rbt_unused __unused
#else
#define __rbt_unused
#endif
#endif
#ifdef _LIBC
__weak_alias(rb_tree_init, _rb_tree_init)
__weak_alias(rb_tree_find_node, _rb_tree_find_node)
__weak_alias(rb_tree_find_node_geq, _rb_tree_find_node_geq)
__weak_alias(rb_tree_find_node_leq, _rb_tree_find_node_leq)
__weak_alias(rb_tree_insert_node, _rb_tree_insert_node)
__weak_alias(rb_tree_remove_node, _rb_tree_remove_node)
__weak_alias(rb_tree_iterate, _rb_tree_iterate)
#ifdef RBDEBUG
__weak_alias(rb_tree_check, _rb_tree_check)
__weak_alias(rb_tree_depths, _rb_tree_depths)
#endif
#include "namespace.h"
#endif
#ifdef RBTEST
#include "rbtree.h"
#else
#include <sys/rbtree.h>
#endif
static void rb_tree_insert_rebalance(struct rb_tree *, struct rb_node *);
static void rb_tree_removal_rebalance(struct rb_tree *, struct rb_node *,
unsigned int);
#ifdef RBDEBUG
static const struct rb_node *rb_tree_iterate_const(const struct rb_tree *,
const struct rb_node *, const unsigned int);
static bool rb_tree_check_node(const struct rb_tree *, const struct rb_node *,
const struct rb_node *, bool);
#else
#define rb_tree_check_node(a, b, c, d) true
#endif
#define RB_NODETOITEM(rbto, rbn) \
((void *)((uintptr_t)(rbn) - (rbto)->rbto_node_offset))
#define RB_ITEMTONODE(rbto, rbn) \
((rb_node_t *)((uintptr_t)(rbn) + (rbto)->rbto_node_offset))
#define RB_SENTINEL_NODE NULL
void
rb_tree_init(struct rb_tree *rbt, const rb_tree_ops_t *ops)
{
rbt->rbt_ops = ops;
rbt->rbt_root = RB_SENTINEL_NODE;
RB_TAILQ_INIT(&rbt->rbt_nodes);
#ifndef RBSMALL
rbt->rbt_minmax[RB_DIR_LEFT] = rbt->rbt_root; /* minimum node */
rbt->rbt_minmax[RB_DIR_RIGHT] = rbt->rbt_root; /* maximum node */
#endif
#ifdef RBSTATS
rbt->rbt_count = 0;
rbt->rbt_insertions = 0;
rbt->rbt_removals = 0;
rbt->rbt_insertion_rebalance_calls = 0;
rbt->rbt_insertion_rebalance_passes = 0;
rbt->rbt_removal_rebalance_calls = 0;
rbt->rbt_removal_rebalance_passes = 0;
#endif
}
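/*
 * Illustrative sketch (not part of the original source): what a client of
 * this API supplies, mirroring the cache_rbtree_ops example in vfs_cache.c
 * above.  The struct and comparators are hypothetical; the key point is
 * that rbto_node_offset lets RB_NODETOITEM()/RB_ITEMTONODE() convert
 * between a user object and its embedded rb_node_t.
 */
#if 0
struct example_item {
	int		ei_key;
	rb_node_t	ei_node;	/* embedded tree linkage */
};

static signed int
example_compare_nodes(void *context, const void *n1, const void *n2)
{
	const struct example_item *i1 = n1, *i2 = n2;

	return (i1->ei_key > i2->ei_key) - (i1->ei_key < i2->ei_key);
}

static signed int
example_compare_key(void *context, const void *n, const void *keyp)
{
	const struct example_item *i = n;
	const int key = *(const int *)keyp;

	return (i->ei_key > key) - (i->ei_key < key);
}

static const rb_tree_ops_t example_ops = {
	.rbto_compare_nodes = example_compare_nodes,
	.rbto_compare_key = example_compare_key,
	.rbto_node_offset = offsetof(struct example_item, ei_node),
	.rbto_context = NULL
};
#endif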
void *
rb_tree_find_node(struct rb_tree *rbt, const void *key)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_key_fn compare_key = rbto->rbto_compare_key;
struct rb_node *parent = rbt->rbt_root;
while (!RB_SENTINEL_P(parent)) {
void *pobj = RB_NODETOITEM(rbto, parent);
const signed int diff = (*compare_key)(rbto->rbto_context,
pobj, key);
if (diff == 0)
return pobj;
parent = parent->rb_nodes[diff < 0];
}
return NULL;
}
void *
rb_tree_find_node_geq(struct rb_tree *rbt, const void *key)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_key_fn compare_key = rbto->rbto_compare_key;
struct rb_node *parent = rbt->rbt_root, *last = NULL;
while (!RB_SENTINEL_P(parent)) {
void *pobj = RB_NODETOITEM(rbto, parent);
const signed int diff = (*compare_key)(rbto->rbto_context,
pobj, key);
if (diff == 0)
return pobj;
if (diff > 0)
last = parent;
parent = parent->rb_nodes[diff < 0];
}
return last == NULL ? NULL : RB_NODETOITEM(rbto, last);
}
void *
rb_tree_find_node_leq(struct rb_tree *rbt, const void *key)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_key_fn compare_key = rbto->rbto_compare_key;
struct rb_node *parent = rbt->rbt_root, *last = NULL;
while (!RB_SENTINEL_P(parent)) {
void *pobj = RB_NODETOITEM(rbto, parent);
const signed int diff = (*compare_key)(rbto->rbto_context,
pobj, key);
if (diff == 0)
return pobj;
if (diff < 0)
last = parent;
parent = parent->rb_nodes[diff < 0];
}
return last == NULL ? NULL : RB_NODETOITEM(rbto, last);
}
void *
rb_tree_insert_node(struct rb_tree *rbt, void *object)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_nodes_fn compare_nodes = rbto->rbto_compare_nodes;
struct rb_node *parent, *tmp, *self = RB_ITEMTONODE(rbto, object);
unsigned int position;
bool rebalance;
RBSTAT_INC(rbt->rbt_insertions);
tmp = rbt->rbt_root;
/*
* This is a hack. Because rbt->rbt_root is just a struct rb_node *,
* just like rb_node->rb_nodes[RB_DIR_LEFT], we can use this fact to
* avoid a lot of tests for root and know that even at root,
* updating RB_FATHER(rb_node)->rb_nodes[RB_POSITION(rb_node)] will
* update rbt->rbt_root.
*/
parent = (struct rb_node *)(void *)&rbt->rbt_root;
position = RB_DIR_LEFT;
/*
* Find out where to place this new leaf.
*/
while (!RB_SENTINEL_P(tmp)) {
void *tobj = RB_NODETOITEM(rbto, tmp);
const signed int diff = (*compare_nodes)(rbto->rbto_context,
tobj, object);
if (__predict_false(diff == 0)) {
/*
* Node already exists; return it.
*/
return tobj;
}
parent = tmp;
position = (diff < 0);
tmp = parent->rb_nodes[position];
}
#ifdef RBDEBUG
{
struct rb_node *prev = NULL, *next = NULL;
if (position == RB_DIR_RIGHT)
prev = parent;
else if (tmp != rbt->rbt_root)
next = parent;
/*
* Verify our sequential position
*/
KASSERT(prev == NULL || !RB_SENTINEL_P(prev));
KASSERT(next == NULL || !RB_SENTINEL_P(next));
if (prev != NULL && next == NULL)
next = TAILQ_NEXT(prev, rb_link);
if (prev == NULL && next != NULL)
prev = TAILQ_PREV(next, rb_node_qh, rb_link);
KASSERT(prev == NULL || !RB_SENTINEL_P(prev));
KASSERT(next == NULL || !RB_SENTINEL_P(next));
KASSERT(prev == NULL || (*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, prev), RB_NODETOITEM(rbto, self)) < 0);
KASSERT(next == NULL || (*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, self), RB_NODETOITEM(rbto, next)) < 0);
}
#endif
/*
* Initialize the node and insert as a leaf into the tree.
*/
RB_SET_FATHER(self, parent);
RB_SET_POSITION(self, position);
if (__predict_false(parent == (struct rb_node *)(void *)&rbt->rbt_root)) {
RB_MARK_BLACK(self); /* root is always black */
#ifndef RBSMALL
rbt->rbt_minmax[RB_DIR_LEFT] = self;
rbt->rbt_minmax[RB_DIR_RIGHT] = self;
#endif
rebalance = false;
} else {
KASSERT(position == RB_DIR_LEFT || position == RB_DIR_RIGHT);
#ifndef RBSMALL
/*
* Keep track of the minimum and maximum nodes. If our
* parent is a minmax node and we are on their min/max side,
* we must be the new min/max node.
*/
if (parent == rbt->rbt_minmax[position])
rbt->rbt_minmax[position] = self;
#endif /* !RBSMALL */
/*
* All new nodes are colored red. We only need to rebalance
* if our parent is also red.
*/
RB_MARK_RED(self);
rebalance = RB_RED_P(parent);
}
KASSERT(RB_SENTINEL_P(parent->rb_nodes[position]));
self->rb_left = parent->rb_nodes[position];
self->rb_right = parent->rb_nodes[position];
parent->rb_nodes[position] = self;
KASSERT(RB_CHILDLESS_P(self));
/*
* Insert the new node into a sorted list for easy sequential access
*/
RBSTAT_INC(rbt->rbt_count);
#ifdef RBDEBUG
if (RB_ROOT_P(rbt, self)) {
RB_TAILQ_INSERT_HEAD(&rbt->rbt_nodes, self, rb_link);
} else if (position == RB_DIR_LEFT) {
KASSERT((*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, self),
RB_NODETOITEM(rbto, RB_FATHER(self))) < 0);
RB_TAILQ_INSERT_BEFORE(RB_FATHER(self), self, rb_link);
} else {
KASSERT((*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, RB_FATHER(self)),
RB_NODETOITEM(rbto, self)) < 0);
RB_TAILQ_INSERT_AFTER(&rbt->rbt_nodes, RB_FATHER(self),
self, rb_link);
}
#endif
KASSERT(rb_tree_check_node(rbt, self, NULL, !rebalance));
/*
* Rebalance tree after insertion
*/
if (rebalance) {
rb_tree_insert_rebalance(rbt, self);
KASSERT(rb_tree_check_node(rbt, self, NULL, true));
}
/* Successfully inserted, return our node pointer. */
return object;
}
/*
* Swap the location and colors of 'self' and its child @ which. The child
* cannot be a sentinel node. This is our rotation function. However,
* since it preserves coloring, it greatly simplifies both insertion and
* removal since rotation almost always involves the exchanging of colors
* as a separate step.
*/
static void
rb_tree_reparent_nodes(__rbt_unused struct rb_tree *rbt,
struct rb_node *old_father, const unsigned int which)
{
const unsigned int other = which ^ RB_DIR_OTHER;
struct rb_node * const grandpa = RB_FATHER(old_father);
struct rb_node * const old_child = old_father->rb_nodes[which];
struct rb_node * const new_father = old_child;
struct rb_node * const new_child = old_father;
KASSERT(which == RB_DIR_LEFT || which == RB_DIR_RIGHT);
KASSERT(!RB_SENTINEL_P(old_child));
KASSERT(RB_FATHER(old_child) == old_father);
KASSERT(rb_tree_check_node(rbt, old_father, NULL, false));
KASSERT(rb_tree_check_node(rbt, old_child, NULL, false));
KASSERT(RB_ROOT_P(rbt, old_father) ||
rb_tree_check_node(rbt, grandpa, NULL, false));
/*
* Exchange descendant linkages.
*/
grandpa->rb_nodes[RB_POSITION(old_father)] = new_father;
new_child->rb_nodes[which] = old_child->rb_nodes[other];
new_father->rb_nodes[other] = new_child;
/*
* Update ancestor linkages
*/
RB_SET_FATHER(new_father, grandpa);
RB_SET_FATHER(new_child, new_father);
/*
* Exchange properties between new_father and new_child. The only
* change is that new_child's position is now on the other side.
*/
#if 0
{
struct rb_node tmp;
tmp.rb_info = 0;
RB_COPY_PROPERTIES(&tmp, old_child);
RB_COPY_PROPERTIES(new_father, old_father);
RB_COPY_PROPERTIES(new_child, &tmp);
}
#else
RB_SWAP_PROPERTIES(new_father, new_child);
#endif
RB_SET_POSITION(new_child, other);
/*
* Make sure to reparent the new child to ourself.
*/
if (!RB_SENTINEL_P(new_child->rb_nodes[which])) {
RB_SET_FATHER(new_child->rb_nodes[which], new_child);
RB_SET_POSITION(new_child->rb_nodes[which], which);
}
KASSERT(rb_tree_check_node(rbt, new_father, NULL, false));
KASSERT(rb_tree_check_node(rbt, new_child, NULL, false));
KASSERT(RB_ROOT_P(rbt, new_father) ||
rb_tree_check_node(rbt, grandpa, NULL, false));
}
static void
rb_tree_insert_rebalance(struct rb_tree *rbt, struct rb_node *self)
{
struct rb_node * father = RB_FATHER(self);
struct rb_node * grandpa = RB_FATHER(father);
struct rb_node * uncle;
unsigned int which;
unsigned int other;
KASSERT(!RB_ROOT_P(rbt, self));
KASSERT(RB_RED_P(self));
KASSERT(RB_RED_P(father));
RBSTAT_INC(rbt->rbt_insertion_rebalance_calls);
for (;;) {
KASSERT(!RB_SENTINEL_P(self));
KASSERT(RB_RED_P(self));
KASSERT(RB_RED_P(father));
/*
* We are red and our parent is red, therefore we must have a
* grandfather and he must be black.
*/
grandpa = RB_FATHER(father);
KASSERT(RB_BLACK_P(grandpa));
KASSERT(RB_DIR_RIGHT == 1 && RB_DIR_LEFT == 0);
which = (father == grandpa->rb_right);
other = which ^ RB_DIR_OTHER;
uncle = grandpa->rb_nodes[other];
if (RB_BLACK_P(uncle))
break;
RBSTAT_INC(rbt->rbt_insertion_rebalance_passes);
/*
* Case 1: our uncle is red
* Simply invert the colors of our parent and
* uncle and make our grandparent red. And
* then solve the problem up at his level.
*/
RB_MARK_BLACK(uncle);
RB_MARK_BLACK(father);
if (__predict_false(RB_ROOT_P(rbt, grandpa))) {
/*
* If our grandpa is root, don't bother
* setting him to red, just return.
*/
KASSERT(RB_BLACK_P(grandpa));
return;
}
RB_MARK_RED(grandpa);
self = grandpa;
father = RB_FATHER(self);
KASSERT(RB_RED_P(self));
if (RB_BLACK_P(father)) {
/*
* If our greatgrandpa is black, we're done.
*/
KASSERT(RB_BLACK_P(rbt->rbt_root));
return;
}
}
KASSERT(!RB_ROOT_P(rbt, self));
KASSERT(RB_RED_P(self));
KASSERT(RB_RED_P(father));
KASSERT(RB_BLACK_P(uncle));
KASSERT(RB_BLACK_P(grandpa));
/*
* Case 2&3: our uncle is black.
*/
if (self == father->rb_nodes[other]) {
/*
* Case 2: we are on the same side as our uncle
* Swap ourselves with our parent so this case
* becomes case 3. Basically our parent becomes our
* child.
*/
rb_tree_reparent_nodes(rbt, father, other);
KASSERT(RB_FATHER(father) == self);
KASSERT(self->rb_nodes[which] == father);
KASSERT(RB_FATHER(self) == grandpa);
self = father;
father = RB_FATHER(self);
}
KASSERT(RB_RED_P(self) && RB_RED_P(father));
KASSERT(grandpa->rb_nodes[which] == father);
/*
* Case 3: we are opposite a child of a black uncle.
* Swap our parent and grandparent. Since our grandfather
* is black, our father will become black and our new sibling
* (former grandparent) will become red.
*/
rb_tree_reparent_nodes(rbt, grandpa, which);
KASSERT(RB_FATHER(self) == father);
KASSERT(RB_FATHER(self)->rb_nodes[RB_POSITION(self) ^ RB_DIR_OTHER] == grandpa);
KASSERT(RB_RED_P(self));
KASSERT(RB_BLACK_P(father));
KASSERT(RB_RED_P(grandpa));
/*
* Final step: Set the root to black.
*/
RB_MARK_BLACK(rbt->rbt_root);
}
static void
rb_tree_prune_node(struct rb_tree *rbt, struct rb_node *self, bool rebalance)
{
const unsigned int which = RB_POSITION(self);
struct rb_node *father = RB_FATHER(self);
#ifndef RBSMALL
const bool was_root = RB_ROOT_P(rbt, self);
#endif
KASSERT(rebalance || (RB_ROOT_P(rbt, self) || RB_RED_P(self)));
KASSERT(!rebalance || RB_BLACK_P(self));
KASSERT(RB_CHILDLESS_P(self));
KASSERT(rb_tree_check_node(rbt, self, NULL, false));
/*
* Since we are childless, we know that self->rb_left is pointing
* to the sentinel node.
*/
father->rb_nodes[which] = self->rb_left;
/*
* Remove ourselves from the node list, decrement the count,
* and update min/max.
*/
RB_TAILQ_REMOVE(&rbt->rbt_nodes, self, rb_link);
RBSTAT_DEC(rbt->rbt_count);
#ifndef RBSMALL
if (__predict_false(rbt->rbt_minmax[RB_POSITION(self)] == self)) {
rbt->rbt_minmax[RB_POSITION(self)] = father;
/*
* When removing the root, rbt->rbt_minmax[RB_DIR_LEFT] is
* updated automatically, but we also need to update
* rbt->rbt_minmax[RB_DIR_RIGHT];
*/
if (__predict_false(was_root)) {
rbt->rbt_minmax[RB_DIR_RIGHT] = father;
}
}
RB_SET_FATHER(self, NULL);
#endif
/*
* Rebalance if requested.
*/
if (rebalance)
rb_tree_removal_rebalance(rbt, father, which);
KASSERT(was_root || rb_tree_check_node(rbt, father, NULL, true));
}
/*
* When deleting an interior node
*/
static void
rb_tree_swap_prune_and_rebalance(struct rb_tree *rbt, struct rb_node *self,
struct rb_node *standin)
{
const unsigned int standin_which = RB_POSITION(standin);
unsigned int standin_other = standin_which ^ RB_DIR_OTHER;
struct rb_node *standin_son;
struct rb_node *standin_father = RB_FATHER(standin);
bool rebalance = RB_BLACK_P(standin);
if (standin_father == self) {
/*
* As a child of self, any children would be opposite of
* our parent.
*/
KASSERT(RB_SENTINEL_P(standin->rb_nodes[standin_other]));
standin_son = standin->rb_nodes[standin_which];
} else {
/*
* Since we aren't a child of self, any children would be
* on the same side as our parent.
*/
KASSERT(RB_SENTINEL_P(standin->rb_nodes[standin_which]));
standin_son = standin->rb_nodes[standin_other];
}
/*
* the node we are removing must have two children.
*/
KASSERT(RB_TWOCHILDREN_P(self));
/*
* If standin has a child, it must be red.
*/
KASSERT(RB_SENTINEL_P(standin_son) || RB_RED_P(standin_son));
/*
* Verify things are sane.
*/
KASSERT(rb_tree_check_node(rbt, self, NULL, false));
KASSERT(rb_tree_check_node(rbt, standin, NULL, false));
if (__predict_false(RB_RED_P(standin_son))) {
/*
* We know we have a red child so if we flip it to black
* we don't have to rebalance.
*/
KASSERT(rb_tree_check_node(rbt, standin_son, NULL, true));
RB_MARK_BLACK(standin_son);
rebalance = false;
if (standin_father == self) {
KASSERT(RB_POSITION(standin_son) == standin_which);
} else {
KASSERT(RB_POSITION(standin_son) == standin_other);
/*
* Change the son's parentage to point to his grandpa.
*/
RB_SET_FATHER(standin_son, standin_father);
RB_SET_POSITION(standin_son, standin_which);
}
}
if (standin_father == self) {
/*
* If we are about to delete the standin's father, then when
* we call rebalance, we need to use ourselves as our father.
* Otherwise remember our original father. Also, since we are
* our standin's father we only need to reparent the standin's
* brother.
*
* | R --> S |
* | Q S --> Q T |
* | t --> |
*/
KASSERT(RB_SENTINEL_P(standin->rb_nodes[standin_other]));
KASSERT(!RB_SENTINEL_P(self->rb_nodes[standin_other]));
KASSERT(self->rb_nodes[standin_which] == standin);
/*
* Have our son/standin adopt his brother as his new son.
*/
standin_father = standin;
} else {
/*
* | R --> S . |
* | / \ | T --> / \ | / |
* | ..... | S --> ..... | T |
*
* Sever standin's connection to his father.
*/
standin_father->rb_nodes[standin_which] = standin_son;
/*
* Adopt the far son.
*/
standin->rb_nodes[standin_other] = self->rb_nodes[standin_other];
RB_SET_FATHER(standin->rb_nodes[standin_other], standin);
KASSERT(RB_POSITION(self->rb_nodes[standin_other]) == standin_other);
/*
* Use standin_other because we need to preserve standin_which
* for the removal_rebalance.
*/
standin_other = standin_which;
}
/*
* Move the only remaining son to our standin. If our standin is our
* son, this will be the only son needed to be moved.
*/
KASSERT(standin->rb_nodes[standin_other] != self->rb_nodes[standin_other]);
standin->rb_nodes[standin_other] = self->rb_nodes[standin_other];
RB_SET_FATHER(standin->rb_nodes[standin_other], standin);
/*
* Now copy the result of self to standin and then replace
* self with standin in the tree.
*/
RB_COPY_PROPERTIES(standin, self);
RB_SET_FATHER(standin, RB_FATHER(self));
RB_FATHER(standin)->rb_nodes[RB_POSITION(standin)] = standin;
/*
* Remove ourselves from the node list, decrement the count,
* and update min/max.
*/
RB_TAILQ_REMOVE(&rbt->rbt_nodes, self, rb_link);
RBSTAT_DEC(rbt->rbt_count);
#ifndef RBSMALL
if (__predict_false(rbt->rbt_minmax[RB_POSITION(self)] == self))
rbt->rbt_minmax[RB_POSITION(self)] = RB_FATHER(self);
RB_SET_FATHER(self, NULL);
#endif
KASSERT(rb_tree_check_node(rbt, standin, NULL, false));
KASSERT(RB_FATHER_SENTINEL_P(standin)
|| rb_tree_check_node(rbt, standin_father, NULL, false));
KASSERT(RB_LEFT_SENTINEL_P(standin)
|| rb_tree_check_node(rbt, standin->rb_left, NULL, false));
KASSERT(RB_RIGHT_SENTINEL_P(standin)
|| rb_tree_check_node(rbt, standin->rb_right, NULL, false));
if (!rebalance)
return;
rb_tree_removal_rebalance(rbt, standin_father, standin_which);
KASSERT(rb_tree_check_node(rbt, standin, NULL, true));
}
/*
* We could do this by doing
* rb_tree_node_swap(rbt, self, which);
* rb_tree_prune_node(rbt, self, false);
*
* But it's more efficient to just evaluate and recolor the child.
*/
static void
rb_tree_prune_blackred_branch(struct rb_tree *rbt, struct rb_node *self,
unsigned int which)
{
struct rb_node *father = RB_FATHER(self);
struct rb_node *son = self->rb_nodes[which];
#ifndef RBSMALL
const bool was_root = RB_ROOT_P(rbt, self);
#endif
KASSERT(which == RB_DIR_LEFT || which == RB_DIR_RIGHT);
KASSERT(RB_BLACK_P(self) && RB_RED_P(son));
KASSERT(!RB_TWOCHILDREN_P(son));
KASSERT(RB_CHILDLESS_P(son));
KASSERT(rb_tree_check_node(rbt, self, NULL, false));
KASSERT(rb_tree_check_node(rbt, son, NULL, false));
/*
* Remove ourselves from the tree and give our former child our
* properties (position, color, root).
*/
RB_COPY_PROPERTIES(son, self);
father->rb_nodes[RB_POSITION(son)] = son;
RB_SET_FATHER(son, father);
/*
* Remove ourselves from the node list, decrement the count,
* and update minmax.
*/
RB_TAILQ_REMOVE(&rbt->rbt_nodes, self, rb_link);
RBSTAT_DEC(rbt->rbt_count);
#ifndef RBSMALL
if (__predict_false(was_root)) {
KASSERT(rbt->rbt_minmax[which] == son);
rbt->rbt_minmax[which ^ RB_DIR_OTHER] = son;
} else if (rbt->rbt_minmax[RB_POSITION(self)] == self) {
rbt->rbt_minmax[RB_POSITION(self)] = son;
}
RB_SET_FATHER(self, NULL);
#endif
KASSERT(was_root || rb_tree_check_node(rbt, father, NULL, true));
KASSERT(rb_tree_check_node(rbt, son, NULL, true));
}
void
rb_tree_remove_node(struct rb_tree *rbt, void *object)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
struct rb_node *standin, *self = RB_ITEMTONODE(rbto, object);
unsigned int which;
KASSERT(!RB_SENTINEL_P(self));
RBSTAT_INC(rbt->rbt_removals);
/*
* In the following diagrams, we (the node to be removed) are S. Red
* nodes are lowercase. T could be either red or black.
*
* Remember the major axiom of the red-black tree: the number of
* black nodes from the root to each leaf is constant across all
* leaves, only the number of red nodes varies.
*
* Thus removing a red leaf doesn't require any other changes to a
* red-black tree. So if we must remove a node, attempt to rearrange
* the tree so we can remove a red node.
*
* The simplest case is a childless red node or a childless root node:
*
* | T --> T | or | R --> * |
* | s --> * |
*/
if (RB_CHILDLESS_P(self)) {
const bool rebalance = RB_BLACK_P(self) && !RB_ROOT_P(rbt, self);
rb_tree_prune_node(rbt, self, rebalance);
return;
}
KASSERT(!RB_CHILDLESS_P(self));
if (!RB_TWOCHILDREN_P(self)) {
/*
* The next simplest case is when the node we are deleting is
* black and has one red child.
*
* | T --> T --> T |
* | S --> R --> R |
* | r --> s --> * |
*/
which = RB_LEFT_SENTINEL_P(self) ? RB_DIR_RIGHT : RB_DIR_LEFT;
KASSERT(RB_BLACK_P(self));
KASSERT(RB_RED_P(self->rb_nodes[which]));
KASSERT(RB_CHILDLESS_P(self->rb_nodes[which]));
rb_tree_prune_blackred_branch(rbt, self, which);
return;
}
KASSERT(RB_TWOCHILDREN_P(self));
/*
* We invert these because we prefer to remove from the inside of
* the tree.
*/
which = RB_POSITION(self) ^ RB_DIR_OTHER;
/*
* Let's find the node closest to us opposite of our parent.
* Now swap it with ourself, "prune" it, and rebalance, if needed.
*/
standin = RB_ITEMTONODE(rbto, rb_tree_iterate(rbt, object, which));
rb_tree_swap_prune_and_rebalance(rbt, self, standin);
}
static void
rb_tree_removal_rebalance(struct rb_tree *rbt, struct rb_node *parent,
unsigned int which)
{
KASSERT(!RB_SENTINEL_P(parent));
KASSERT(RB_SENTINEL_P(parent->rb_nodes[which]));
KASSERT(which == RB_DIR_LEFT || which == RB_DIR_RIGHT);
RBSTAT_INC(rbt->rbt_removal_rebalance_calls);
while (RB_BLACK_P(parent->rb_nodes[which])) {
unsigned int other = which ^ RB_DIR_OTHER;
struct rb_node *brother = parent->rb_nodes[other];
RBSTAT_INC(rbt->rbt_removal_rebalance_passes);
KASSERT(!RB_SENTINEL_P(brother));
/*
* For cases 1, 2a, and 2b, our brother's children must
* be black and our father must be black
*/
if (RB_BLACK_P(parent) && RB_BLACK_P(brother->rb_left) && RB_BLACK_P(brother->rb_right)) {
if (RB_RED_P(brother)) {
/*
* Case 1: Our brother is red, swap its
* position (and colors) with our parent.
* This should now be case 2b (unless C or E
* has a red child which is case 3; thus no
* explicit branch to case 2b).
*
* B -> D
* A d -> b E
* C E -> A C
*/
KASSERT(RB_BLACK_P(parent));
rb_tree_reparent_nodes(rbt, parent, other);
brother = parent->rb_nodes[other];
KASSERT(!RB_SENTINEL_P(brother));
KASSERT(RB_RED_P(parent));
KASSERT(RB_BLACK_P(brother));
KASSERT(rb_tree_check_node(rbt, brother, NULL, false));
KASSERT(rb_tree_check_node(rbt, parent, NULL, false));
} else {
/*
* Both our parent and brother are black.
* Change our brother to red, advance up rank
* and go through the loop again.
*
* B -> *B
* *A D -> A d
* C E -> C E
*/
RB_MARK_RED(brother);
KASSERT(RB_BLACK_P(brother->rb_left));
KASSERT(RB_BLACK_P(brother->rb_right));
if (RB_ROOT_P(rbt, parent))
return; /* root == parent == black */
KASSERT(rb_tree_check_node(rbt, brother, NULL, false));
KASSERT(rb_tree_check_node(rbt, parent, NULL, false));
which = RB_POSITION(parent);
parent = RB_FATHER(parent);
continue;
}
}
/*
* Avoid an else here so that case 2a above can hit either
* case 2b, 3, or 4.
*/
if (RB_RED_P(parent) && RB_BLACK_P(brother) && RB_BLACK_P(brother->rb_left) && RB_BLACK_P(brother->rb_right)) {
KASSERT(RB_RED_P(parent));
KASSERT(RB_BLACK_P(brother));
KASSERT(RB_BLACK_P(brother->rb_left));
KASSERT(RB_BLACK_P(brother->rb_right));
/*
* We are black, our father is red, our brother and
* both nephews are black. Simply invert/exchange the
* colors of our father and brother (to black and red
* respectively).
*
* | f --> F |
* | * B --> * b |
* | N N --> N N |
*/
RB_MARK_BLACK(parent);
RB_MARK_RED(brother);
KASSERT(rb_tree_check_node(rbt, brother, NULL, true));
break; /* We're done! */
} else {
/*
* Our brother must be black and have at least one
* red child (it may have two).
*/
KASSERT(RB_BLACK_P(brother));
KASSERT(RB_RED_P(brother->rb_nodes[which]) ||
RB_RED_P(brother->rb_nodes[other]));
if (RB_BLACK_P(brother->rb_nodes[other])) {
/*
* Case 3: our brother is black, our near
* nephew is red, and our far nephew is black.
* Swap our brother with our near nephew.
* This results in a tree that matches case 4.
* (Our father could be red or black).
*
* | F --> F |
* | x B --> x B |
* | n --> n |
*/
KASSERT(RB_RED_P(brother->rb_nodes[which]));
rb_tree_reparent_nodes(rbt, brother, which);
KASSERT(RB_FATHER(brother) == parent->rb_nodes[other]);
brother = parent->rb_nodes[other];
KASSERT(RB_RED_P(brother->rb_nodes[other]));
}
/*
* Case 4: our brother is black and our far nephew
* is red. Swap our father and brother locations and
* change our far nephew to black. (these can be
* done in either order so we change the color first).
* The result is a valid red-black tree and is a
* terminal case. (again we don't care about the
* father's color)
*
* If the father is red, we will get a red-black-black
* tree:
* | f -> f --> b |
* | B -> B --> F N |
* | n -> N --> |
*
* If the father is black, we will get an all black
* tree:
* | F -> F --> B |
* | B -> B --> F N |
* | n -> N --> |
*
* If we had two red nephews, then after the swap,
* our former father would have a red grandson.
*/
KASSERT(RB_BLACK_P(brother));
KASSERT(RB_RED_P(brother->rb_nodes[other]));
RB_MARK_BLACK(brother->rb_nodes[other]);
rb_tree_reparent_nodes(rbt, parent, other);
break; /* We're done! */
}
}
KASSERT(rb_tree_check_node(rbt, parent, NULL, true));
}
void *
rb_tree_iterate(struct rb_tree *rbt, void *object, const unsigned int direction)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
const unsigned int other = direction ^ RB_DIR_OTHER;
struct rb_node *self;
KASSERT(direction == RB_DIR_LEFT || direction == RB_DIR_RIGHT);
if (object == NULL) {
#ifndef RBSMALL
if (RB_SENTINEL_P(rbt->rbt_root))
return NULL;
return RB_NODETOITEM(rbto, rbt->rbt_minmax[direction]);
#else
self = rbt->rbt_root;
if (RB_SENTINEL_P(self))
return NULL;
while (!RB_SENTINEL_P(self->rb_nodes[direction]))
self = self->rb_nodes[direction];
return RB_NODETOITEM(rbto, self);
#endif /* !RBSMALL */
}
self = RB_ITEMTONODE(rbto, object);
KASSERT(!RB_SENTINEL_P(self));
/*
* We can't go any further in this direction. We proceed up in the
* opposite direction until our parent is in the direction we want to go.
*/
if (RB_SENTINEL_P(self->rb_nodes[direction])) {
while (!RB_ROOT_P(rbt, self)) {
if (other == RB_POSITION(self))
return RB_NODETOITEM(rbto, RB_FATHER(self));
self = RB_FATHER(self);
}
return NULL;
}
/*
* Advance down one in current direction and go down as far as possible
* in the opposite direction.
*/
self = self->rb_nodes[direction];
KASSERT(!RB_SENTINEL_P(self));
while (!RB_SENTINEL_P(self->rb_nodes[other]))
self = self->rb_nodes[other];
return RB_NODETOITEM(rbto, self);
}
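/*
 * Illustrative only: a minimal sketch, kept out of the build with #if 0,
 * of walking a tree in ascending key order with rb_tree_iterate().  It
 * assumes the usual rbtree(9) setup (an embedded struct rb_node and an
 * rb_tree_ops_t registered via rb_tree_init()); the "struct example"
 * container, its "ex_key" member, and example_walk() are hypothetical
 * names, not part of this file.
 */
#if 0
struct example {
	struct rb_node ex_node;		/* embedded tree linkage */
	int ex_key;			/* sort key */
};

static void
example_walk(struct rb_tree *rbt)
{
	struct example *ex;

	/*
	 * Passing NULL returns the leftmost (minimum) node; each further
	 * call with RB_DIR_RIGHT returns the in-order successor.
	 */
	for (ex = rb_tree_iterate(rbt, NULL, RB_DIR_LEFT);
	    ex != NULL;
	    ex = rb_tree_iterate(rbt, ex, RB_DIR_RIGHT))
		printf("key %d\n", ex->ex_key);
}
#endif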
#ifdef RBDEBUG
static const struct rb_node *
rb_tree_iterate_const(const struct rb_tree *rbt, const struct rb_node *self,
const unsigned int direction)
{
const unsigned int other = direction ^ RB_DIR_OTHER;
KASSERT(direction == RB_DIR_LEFT || direction == RB_DIR_RIGHT);
if (self == NULL) {
#ifndef RBSMALL
if (RB_SENTINEL_P(rbt->rbt_root))
return NULL;
return rbt->rbt_minmax[direction];
#else
self = rbt->rbt_root;
if (RB_SENTINEL_P(self))
return NULL;
while (!RB_SENTINEL_P(self->rb_nodes[direction]))
self = self->rb_nodes[direction];
return self;
#endif /* !RBSMALL */
}
KASSERT(!RB_SENTINEL_P(self));
/*
* We can't go any further in this direction. We proceed up in the
* opposite direction until our parent is in the direction we want to go.
*/
if (RB_SENTINEL_P(self->rb_nodes[direction])) {
while (!RB_ROOT_P(rbt, self)) {
if (other == RB_POSITION(self))
return RB_FATHER(self);
self = RB_FATHER(self);
}
return NULL;
}
/*
* Advance down one in current direction and go down as far as possible
* in the opposite direction.
*/
self = self->rb_nodes[direction];
KASSERT(!RB_SENTINEL_P(self));
while (!RB_SENTINEL_P(self->rb_nodes[other]))
self = self->rb_nodes[other];
return self;
}
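/*
 * Verify that the left and right subtrees of "self" carry the same
 * number of black nodes, and return that black height (counting
 * "self" itself if it is black).
 */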
static unsigned int
rb_tree_count_black(const struct rb_node *self)
{
unsigned int left, right;
if (RB_SENTINEL_P(self))
return 0;
left = rb_tree_count_black(self->rb_left);
right = rb_tree_count_black(self->rb_right);
KASSERT(left == right);
return left + RB_BLACK_P(self);
}
static bool
rb_tree_check_node(const struct rb_tree *rbt, const struct rb_node *self,
const struct rb_node *prev, bool red_check)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_nodes_fn compare_nodes = rbto->rbto_compare_nodes;
KASSERT(!RB_SENTINEL_P(self));
KASSERT(prev == NULL || (*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, prev), RB_NODETOITEM(rbto, self)) < 0);
/*
* Verify our relationship to our parent.
*/
if (RB_ROOT_P(rbt, self)) {
KASSERT(self == rbt->rbt_root);
KASSERT(RB_POSITION(self) == RB_DIR_LEFT);
KASSERT(RB_FATHER(self)->rb_nodes[RB_DIR_LEFT] == self);
KASSERT(RB_FATHER(self) == (const struct rb_node *) &rbt->rbt_root);
} else {
int diff = (*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, self),
RB_NODETOITEM(rbto, RB_FATHER(self)));
KASSERT(self != rbt->rbt_root);
KASSERT(!RB_FATHER_SENTINEL_P(self));
if (RB_POSITION(self) == RB_DIR_LEFT) {
KASSERT(diff < 0);
KASSERT(RB_FATHER(self)->rb_nodes[RB_DIR_LEFT] == self);
} else {
KASSERT(diff > 0);
KASSERT(RB_FATHER(self)->rb_nodes[RB_DIR_RIGHT] == self);
}
}
/*
* Verify our position in the linked list against the tree itself.
*/
{
const struct rb_node *prev0 = rb_tree_iterate_const(rbt, self, RB_DIR_LEFT);
const struct rb_node *next0 = rb_tree_iterate_const(rbt, self, RB_DIR_RIGHT);
KASSERT(prev0 == TAILQ_PREV(self, rb_node_qh, rb_link));
KASSERT(next0 == TAILQ_NEXT(self, rb_link));
#ifndef RBSMALL
KASSERT(prev0 != NULL || self == rbt->rbt_minmax[RB_DIR_LEFT]);
KASSERT(next0 != NULL || self == rbt->rbt_minmax[RB_DIR_RIGHT]);
#endif
}
/*
* The root must be black.
* There can never be two adjacent red nodes.
*/
if (red_check) {
KASSERT(!RB_ROOT_P(rbt, self) || RB_BLACK_P(self));
(void) rb_tree_count_black(self);
if (RB_RED_P(self)) {
const struct rb_node *brother;
KASSERT(!RB_ROOT_P(rbt, self));
brother = RB_FATHER(self)->rb_nodes[RB_POSITION(self) ^ RB_DIR_OTHER];
KASSERT(RB_BLACK_P(RB_FATHER(self)));
/*
* If I'm red and have no children, then I must either
* have no brother, or my brother must also be red and
* also have no children. (black count == 0)
*/
KASSERT(!RB_CHILDLESS_P(self)
|| RB_SENTINEL_P(brother)
|| RB_RED_P(brother)
|| RB_CHILDLESS_P(brother));
/*
* If I'm not childless, I must have two children
* and they must both be black.
*/
KASSERT(RB_CHILDLESS_P(self)
|| (RB_TWOCHILDREN_P(self)
&& RB_BLACK_P(self->rb_left)
&& RB_BLACK_P(self->rb_right)));
/*
* If I'm not childless (and thus have black children),
* then my brother must either be black or have two
* black children.
*/
KASSERT(RB_CHILDLESS_P(self)
|| RB_BLACK_P(brother)
|| (RB_TWOCHILDREN_P(brother)
&& RB_BLACK_P(brother->rb_left)
&& RB_BLACK_P(brother->rb_right)));
} else {
/*
* If I'm black and have one child, that child must
* be red and childless.
*/
KASSERT(RB_CHILDLESS_P(self)
|| RB_TWOCHILDREN_P(self)
|| (!RB_LEFT_SENTINEL_P(self)
&& RB_RIGHT_SENTINEL_P(self)
&& RB_RED_P(self->rb_left)
&& RB_CHILDLESS_P(self->rb_left))
|| (!RB_RIGHT_SENTINEL_P(self)
&& RB_LEFT_SENTINEL_P(self)
&& RB_RED_P(self->rb_right)
&& RB_CHILDLESS_P(self->rb_right)));
/*
* If I'm a childless black node and my parent is
* black, my 2nd closest relative away from my parent
* is either red or has a red parent or red children.
*/
if (!RB_ROOT_P(rbt, self)
&& RB_CHILDLESS_P(self)
&& RB_BLACK_P(RB_FATHER(self))) {
const unsigned int which = RB_POSITION(self);
const unsigned int other = which ^ RB_DIR_OTHER;
const struct rb_node *relative0, *relative;
relative0 = rb_tree_iterate_const(rbt,
self, other);
KASSERT(relative0 != NULL);
relative = rb_tree_iterate_const(rbt,
relative0, other);
KASSERT(relative != NULL);
KASSERT(RB_SENTINEL_P(relative->rb_nodes[which]));
#if 0
KASSERT(RB_RED_P(relative)
|| RB_RED_P(relative->rb_left)
|| RB_RED_P(relative->rb_right)
|| RB_RED_P(RB_FATHER(relative)));
#endif
}
}
/*
* A grandparent's children must be real nodes and not
* sentinels. First check our grandparent.
*/
KASSERT(RB_ROOT_P(rbt, self)
|| RB_ROOT_P(rbt, RB_FATHER(self))
|| RB_TWOCHILDREN_P(RB_FATHER(RB_FATHER(self))));
/*
* If we have grandchildren on our left, then
* we must have a child on our right.
*/
KASSERT(RB_LEFT_SENTINEL_P(self)
|| RB_CHILDLESS_P(self->rb_left)
|| !RB_RIGHT_SENTINEL_P(self));
/*
* If we have grandchildren on our right, then
* we must have a child on our left.
*/
KASSERT(RB_RIGHT_SENTINEL_P(self)
|| RB_CHILDLESS_P(self->rb_right)
|| !RB_LEFT_SENTINEL_P(self));
/*
* If we have a child on the left and it doesn't have two
* children make sure we don't have great-great-grandchildren on
* the right.
*/
KASSERT(RB_TWOCHILDREN_P(self->rb_left)
|| RB_CHILDLESS_P(self->rb_right)
|| RB_CHILDLESS_P(self->rb_right->rb_left)
|| RB_CHILDLESS_P(self->rb_right->rb_left->rb_left)
|| RB_CHILDLESS_P(self->rb_right->rb_left->rb_right)
|| RB_CHILDLESS_P(self->rb_right->rb_right)
|| RB_CHILDLESS_P(self->rb_right->rb_right->rb_left)
|| RB_CHILDLESS_P(self->rb_right->rb_right->rb_right));
/*
* If we have a child on the right and it doesn't have two
* children make sure we don't have great-great-grandchildren on
* the left.
*/
KASSERT(RB_TWOCHILDREN_P(self->rb_right)
|| RB_CHILDLESS_P(self->rb_left)
|| RB_CHILDLESS_P(self->rb_left->rb_left)
|| RB_CHILDLESS_P(self->rb_left->rb_left->rb_left)
|| RB_CHILDLESS_P(self->rb_left->rb_left->rb_right)
|| RB_CHILDLESS_P(self->rb_left->rb_right)
|| RB_CHILDLESS_P(self->rb_left->rb_right->rb_left)
|| RB_CHILDLESS_P(self->rb_left->rb_right->rb_right));
/*
* If we are a fully interior node, then our predecessors and
* successors must have no children in our direction.
*/
if (RB_TWOCHILDREN_P(self)) {
const struct rb_node *prev0;
const struct rb_node *next0;
prev0 = rb_tree_iterate_const(rbt, self, RB_DIR_LEFT);
KASSERT(prev0 != NULL);
KASSERT(RB_RIGHT_SENTINEL_P(prev0));
next0 = rb_tree_iterate_const(rbt, self, RB_DIR_RIGHT);
KASSERT(next0 != NULL);
KASSERT(RB_LEFT_SENTINEL_P(next0));
}
}
return true;
}
void
rb_tree_check(const struct rb_tree *rbt, bool red_check)
{
const struct rb_node *self;
const struct rb_node *prev;
#ifdef RBSTATS
unsigned int count = 0;
#endif
KASSERT(rbt->rbt_root != NULL);
KASSERT(RB_LEFT_P(rbt->rbt_root));
#if defined(RBSTATS) && !defined(RBSMALL)
KASSERT(rbt->rbt_count > 1
|| rbt->rbt_minmax[RB_DIR_LEFT] == rbt->rbt_minmax[RB_DIR_RIGHT]);
#endif
prev = NULL;
TAILQ_FOREACH(self, &rbt->rbt_nodes, rb_link) {
rb_tree_check_node(rbt, self, prev, false);
#ifdef RBSTATS
count++;
#endif
}
#ifdef RBSTATS
KASSERT(rbt->rbt_count == count);
#endif
if (red_check) {
KASSERT(RB_BLACK_P(rbt->rbt_root));
KASSERT(RB_SENTINEL_P(rbt->rbt_root)
|| rb_tree_count_black(rbt->rbt_root));
/*
* The root must be black.
* There can never be two adjacent red nodes.
*/
TAILQ_FOREACH(self, &rbt->rbt_nodes, rb_link) {
rb_tree_check_node(rbt, self, NULL, true);
}
}
}
#endif /* RBDEBUG */
#ifdef RBSTATS
static void
rb_tree_mark_depth(const struct rb_tree *rbt, const struct rb_node *self,
size_t *depths, size_t depth)
{
if (RB_SENTINEL_P(self))
return;
if (RB_TWOCHILDREN_P(self)) {
rb_tree_mark_depth(rbt, self->rb_left, depths, depth + 1);
rb_tree_mark_depth(rbt, self->rb_right, depths, depth + 1);
return;
}
depths[depth]++;
if (!RB_LEFT_SENTINEL_P(self)) {
rb_tree_mark_depth(rbt, self->rb_left, depths, depth + 1);
}
if (!RB_RIGHT_SENTINEL_P(self)) {
rb_tree_mark_depth(rbt, self->rb_right, depths, depth + 1);
}
}
void
rb_tree_depths(const struct rb_tree *rbt, size_t *depths)
{
rb_tree_mark_depth(rbt, rbt->rbt_root, depths, 1);
}
#endif /* RBSTATS */
/* $NetBSD: bufq_fcfs.c,v 1.13 2017/05/04 11:03:27 kamil Exp $ */
/* NetBSD: subr_disk.c,v 1.61 2004/09/25 03:30:44 thorpej Exp */
/*-
* Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bufq_fcfs.c,v 1.13 2017/05/04 11:03:27 kamil Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/module.h>
/*
* First-come first-served sort for disks.
*
* Requests are appended to the queue without any reordering.
*/
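/*
 * Illustrative only: a minimal sketch, kept out of the build with #if 0,
 * of how a disk driver might select and drive this strategy through the
 * bufq(9) interface.  The softc member "sc_bufq" and the example_*()
 * helpers are hypothetical; only bufq_alloc()/bufq_put()/bufq_get() are
 * assumed from the real interface.
 */
#if 0
static struct bufq_state *sc_bufq;

static void
example_attach(void)
{
	/* Select the "fcfs" strategy by name. */
	(void)bufq_alloc(&sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
}

static void
example_strategy(struct buf *bp)
{
	bufq_put(sc_bufq, bp);		/* append; FCFS never reorders */
}

static void
example_drain(void)
{
	struct buf *bp;

	while ((bp = bufq_get(sc_bufq)) != NULL)
		biodone(bp);		/* complete in arrival order */
}
#endif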
struct bufq_fcfs {
TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */
};
static void bufq_fcfs_init(struct bufq_state *);
static void bufq_fcfs_put(struct bufq_state *, struct buf *);
static struct buf *bufq_fcfs_get(struct bufq_state *, int);
BUFQ_DEFINE(fcfs, 10, bufq_fcfs_init);
static void
bufq_fcfs_put(struct bufq_state *bufq, struct buf *bp)
{
struct bufq_fcfs *fcfs = bufq_private(bufq);
TAILQ_INSERT_TAIL(&fcfs->bq_head, bp, b_actq);
}
static struct buf *
bufq_fcfs_get(struct bufq_state *bufq, int remove)
{
struct bufq_fcfs *fcfs = bufq_private(bufq);
struct buf *bp;
bp = TAILQ_FIRST(&fcfs->bq_head);
if (bp != NULL && remove)
TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq);
return (bp);
}
static struct buf *
bufq_fcfs_cancel(struct bufq_state *bufq, struct buf *buf)
{
struct bufq_fcfs *fcfs = bufq_private(bufq);
struct buf *bp;
TAILQ_FOREACH(bp, &fcfs->bq_head, b_actq) {
if (bp == buf) {
TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq);
return buf;
}
}
return NULL;
}
static void
bufq_fcfs_fini(struct bufq_state *bufq)
{
KASSERT(bufq->bq_private != NULL);
kmem_free(bufq->bq_private, sizeof(struct bufq_fcfs));
}
static void
bufq_fcfs_init(struct bufq_state *bufq)
{
struct bufq_fcfs *fcfs;
bufq->bq_get = bufq_fcfs_get;
bufq->bq_put = bufq_fcfs_put;
bufq->bq_cancel = bufq_fcfs_cancel;
bufq->bq_fini = bufq_fcfs_fini;
bufq->bq_private = kmem_zalloc(sizeof(struct bufq_fcfs), KM_SLEEP);
fcfs = (struct bufq_fcfs *)bufq->bq_private;
TAILQ_INIT(&fcfs->bq_head);
}
MODULE(MODULE_CLASS_BUFQ, bufq_fcfs, NULL);
static int
bufq_fcfs_modcmd(modcmd_t cmd, void *opaque)
{
switch (cmd) {
case MODULE_CMD_INIT:
return bufq_register(&bufq_strat_fcfs);
case MODULE_CMD_FINI:
return bufq_unregister(&bufq_strat_fcfs);
default:
return ENOTTY;
}
}
/* $NetBSD: ip6_input.c,v 1.227 2022/10/28 05:18:39 ozaki-r Exp $ */
/* $KAME: ip6_input.c,v 1.188 2001/03/29 05:34:31 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_input.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip6_input.c,v 1.227 2022/10/28 05:18:39 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_gateway.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/cprng.h>
#include <sys/percpu.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/pktqueue.h>
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#ifdef INET
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#endif /* INET */
#include <netinet/ip6.h>
#include <netinet/portalgo.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet/icmp6.h>
#include <netinet6/scope6_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/nd6.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <netinet6/ip6protosw.h>
#include "faith.h"
extern struct domain inet6domain;
u_char ip6_protox[IPPROTO_MAX];
pktqueue_t *ip6_pktq __read_mostly;
pfil_head_t *inet6_pfil_hook;
percpu_t *ip6stat_percpu;
percpu_t *ip6_forward_rt_percpu __cacheline_aligned;
static void ip6intr(void *);
static void ip6_input(struct mbuf *, struct ifnet *);
static bool ip6_badaddr(struct ip6_hdr *);
static struct m_tag *ip6_setdstifaddr(struct mbuf *, const struct in6_ifaddr *);
static struct m_tag *ip6_addaux(struct mbuf *);
static struct m_tag *ip6_findaux(struct mbuf *);
static void ip6_delaux(struct mbuf *);
static int ip6_process_hopopts(struct mbuf *, u_int8_t *, int, u_int32_t *,
u_int32_t *);
static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int);
static void sysctl_net_inet6_ip6_setup(struct sysctllog **);
#ifdef NET_MPSAFE
#define SOFTNET_LOCK() mutex_enter(softnet_lock)
#define SOFTNET_UNLOCK() mutex_exit(softnet_lock)
#else
#define SOFTNET_LOCK() KASSERT(mutex_owned(softnet_lock))
#define SOFTNET_UNLOCK() KASSERT(mutex_owned(softnet_lock))
#endif
/* Ensure that non packed structures are the desired size. */
__CTASSERT(sizeof(struct ip6_hdr) == 40);
__CTASSERT(sizeof(struct ip6_ext) == 2);
__CTASSERT(sizeof(struct ip6_hbh) == 2);
__CTASSERT(sizeof(struct ip6_dest) == 2);
__CTASSERT(sizeof(struct ip6_opt) == 2);
__CTASSERT(sizeof(struct ip6_opt_jumbo) == 6);
__CTASSERT(sizeof(struct ip6_opt_nsap) == 4);
__CTASSERT(sizeof(struct ip6_opt_tunnel) == 3);
__CTASSERT(sizeof(struct ip6_opt_router) == 4);
__CTASSERT(sizeof(struct ip6_rthdr) == 4);
__CTASSERT(sizeof(struct ip6_rthdr0) == 8);
__CTASSERT(sizeof(struct ip6_frag) == 8);
/*
* IP6 initialization: fill in IP6 protocol switch table.
* All protocols not implemented in kernel go to raw IP6 protocol handler.
*/
void
ip6_init(void)
{
const struct ip6protosw *pr;
int i;
in6_init();
ip6_pktq = pktq_create(IFQ_MAXLEN, ip6intr, NULL);
KASSERT(ip6_pktq != NULL);
sysctl_net_inet6_ip6_setup(NULL);
pr = (const struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
if (pr == 0)
panic("ip6_init");
for (i = 0; i < IPPROTO_MAX; i++)
ip6_protox[i] = pr - inet6sw;
for (pr = (const struct ip6protosw *)inet6domain.dom_protosw;
pr < (const struct ip6protosw *)inet6domain.dom_protoswNPROTOSW; pr++)
if (pr->pr_domain->dom_family == PF_INET6 &&
pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
ip6_protox[pr->pr_protocol] = pr - inet6sw;
scope6_init();
addrsel_policy_init();
nd6_init();
frag6_init();
#ifdef GATEWAY
ip6flow_init(ip6_hashsize);
#endif
/* Register our Packet Filter hook. */
inet6_pfil_hook = pfil_head_create(PFIL_TYPE_AF, (void *)AF_INET6);
KASSERT(inet6_pfil_hook != NULL);
ip6stat_percpu = percpu_alloc(sizeof(uint64_t) * IP6_NSTATS);
ip6_forward_rt_percpu = rtcache_percpu_alloc();
}
/*
* IP6 input interrupt handling. Just pass the packet to ip6_input.
*/
static void
ip6intr(void *arg __unused)
{
struct mbuf *m;
SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE();
while ((m = pktq_dequeue(ip6_pktq)) != NULL) {
struct psref psref;
struct ifnet *rcvif = m_get_rcvif_psref(m, &psref);
if (rcvif == NULL) {
IP6_STATINC(IP6_STAT_IFDROP);
m_freem(m);
continue;
}
/*
* Drop the packet if IPv6 is disabled on the interface.
*/
if ((ND_IFINFO(rcvif)->flags & ND6_IFF_IFDISABLED)) {
m_put_rcvif_psref(rcvif, &psref);
IP6_STATINC(IP6_STAT_IFDROP);
m_freem(m);
continue;
}
ip6_input(m, rcvif);
m_put_rcvif_psref(rcvif, &psref);
}
SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
static void
ip6_input(struct mbuf *m, struct ifnet *rcvif)
{
struct ip6_hdr *ip6;
int hit, off = sizeof(struct ip6_hdr), nest;
u_int32_t plen;
u_int32_t rtalert = ~0;
int nxt, ours = 0, rh_present = 0, frg_present;
struct ifnet *deliverifp = NULL;
int srcrt = 0;
struct rtentry *rt = NULL;
union {
struct sockaddr dst;
struct sockaddr_in6 dst6;
} u;
struct route *ro;
KASSERT(rcvif != NULL);
/*
* make sure we don't have onion peering information in m_tag.
*/
ip6_delaux(m);
/*
* mbuf statistics
*/
if (m->m_flags & M_EXT) {
if (m->m_next)
IP6_STATINC(IP6_STAT_MEXT2M);
else
IP6_STATINC(IP6_STAT_MEXT1);
} else {
#define M2MMAX 32
if (m->m_next) {
if (m->m_flags & M_LOOP)
/*XXX*/ IP6_STATINC(IP6_STAT_M2M + lo0ifp->if_index);
else if (rcvif->if_index < M2MMAX)
IP6_STATINC(IP6_STAT_M2M + rcvif->if_index);
else
IP6_STATINC(IP6_STAT_M2M);
} else
IP6_STATINC(IP6_STAT_M1);
#undef M2MMAX
}
in6_ifstat_inc(rcvif, ifs6_in_receive);
IP6_STATINC(IP6_STAT_TOTAL);
/*
* If the IPv6 header is not aligned, slurp it up into a new
* mbuf with space for link headers, in the event we forward
* it. Otherwise, if it is aligned, make sure the entire base
* IPv6 header is in the first mbuf of the chain.
*/
if (M_GET_ALIGNED_HDR(&m, struct ip6_hdr, true) != 0) {
/* XXXJRT new stat, please */
IP6_STATINC(IP6_STAT_TOOSMALL);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
return;
}
ip6 = mtod(m, struct ip6_hdr *);
if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
IP6_STATINC(IP6_STAT_BADVERS);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
goto bad;
}
if (ip6_badaddr(ip6)) {
IP6_STATINC(IP6_STAT_BADSCOPE);
in6_ifstat_inc(rcvif, ifs6_in_addrerr);
goto bad;
}
/*
* Assume that we can create a fast-forward IP flow entry
* based on this packet.
*/
m->m_flags |= M_CANFASTFWD;
/*
* Run through list of hooks for input packets. If there are any
* filters which require that additional packets in the flow are
* not fast-forwarded, they must clear the M_CANFASTFWD flag.
* Note that filters must _never_ set this flag, as another filter
* in the list may have previously cleared it.
*
* Don't call hooks if the packet has already been processed by
* IPsec (encapsulated, tunnel mode).
*/
#if defined(IPSEC)
if (!ipsec_used || !ipsec_skip_pfil(m))
#else
if (1)
#endif
{
struct in6_addr odst;
int error;
odst = ip6->ip6_dst;
error = pfil_run_hooks(inet6_pfil_hook, &m, rcvif, PFIL_IN);
if (error != 0 || m == NULL) {
IP6_STATINC(IP6_STAT_PFILDROP_IN);
return;
}
if (m->m_len < sizeof(struct ip6_hdr)) {
if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
IP6_STATINC(IP6_STAT_TOOSMALL);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
return;
}
}
ip6 = mtod(m, struct ip6_hdr *);
srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst);
}
IP6_STATINC(IP6_STAT_NXTHIST + ip6->ip6_nxt);
#ifdef ALTQ
if (altq_input != NULL) {
SOFTNET_LOCK();
if ((*altq_input)(m, AF_INET6) == 0) {
SOFTNET_UNLOCK();
/* packet is dropped by traffic conditioner */
return;
}
SOFTNET_UNLOCK();
}
#endif
/*
* Disambiguate address scope zones (if there is ambiguity).
* We first make sure that the original source or destination address
* is not in our internal form for scoped addresses. Such addresses
* are not necessarily invalid spec-wise, but we cannot accept them due
* to the usage conflict.
* in6_setscope() then also checks and rejects the cases where src or
* dst are the loopback address and the receiving interface
* is not loopback.
*/
if (__predict_false(
m_makewritable(&m, 0, sizeof(struct ip6_hdr), M_DONTWAIT))) {
IP6_STATINC(IP6_STAT_IDROPPED);
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) {
IP6_STATINC(IP6_STAT_BADSCOPE); /* XXX */
goto bad;
}
if (in6_setscope(&ip6->ip6_src, rcvif, NULL) ||
in6_setscope(&ip6->ip6_dst, rcvif, NULL)) {
IP6_STATINC(IP6_STAT_BADSCOPE);
goto bad;
}
ro = rtcache_percpu_getref(ip6_forward_rt_percpu);
/*
* Multicast check
*/
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
bool ingroup;
in6_ifstat_inc(rcvif, ifs6_in_mcast);
/*
* See if we belong to the destination multicast group on the
* arrival interface.
*/
ingroup = in6_multi_group(&ip6->ip6_dst, rcvif);
if (ingroup) {
ours = 1;
} else if (!ip6_mrouter) {
uint64_t *ip6s = IP6_STAT_GETREF();
ip6s[IP6_STAT_NOTMEMBER]++;
ip6s[IP6_STAT_CANTFORWARD]++;
IP6_STAT_PUTREF();
in6_ifstat_inc(rcvif, ifs6_in_discard);
goto bad_unref;
}
deliverifp = rcvif;
goto hbhcheck;
}
sockaddr_in6_init(&u.dst6, &ip6->ip6_dst, 0, 0, 0);
/*
* Unicast check
*/
rt = rtcache_lookup2(ro, &u.dst, 1, &hit);
if (hit)
IP6_STATINC(IP6_STAT_FORWARD_CACHEHIT);
else
IP6_STATINC(IP6_STAT_FORWARD_CACHEMISS);
/*
* Accept the packet if the forwarding interface to the destination
* (according to the routing table) is the loopback interface,
* unless the associated route has a gateway.
*
* We don't explicitly match ip6_dst against an interface here. It
* is already done in rtcache_lookup2: rt->rt_ifp->if_type will be
* IFT_LOOP if the packet is for us.
*
* Note that this approach causes us to accept a packet if there is a
* route to the loopback interface for the destination of the packet.
* But we think it's even useful in some situations, e.g. when using
* a special daemon which wants to intercept the packet.
*/
if (rt != NULL &&
(rt->rt_flags & (RTF_HOST|RTF_GATEWAY)) == RTF_HOST &&
rt->rt_ifp->if_type == IFT_LOOP) {
struct in6_ifaddr *ia6 = (struct in6_ifaddr *)rt->rt_ifa;
int addrok;
if (ia6->ia6_flags & IN6_IFF_ANYCAST)
m->m_flags |= M_ANYCAST6;
/*
* packets to a tentative, duplicated, or somehow invalid
* address must not be accepted.
*/
if (ia6->ia6_flags & IN6_IFF_NOTREADY)
addrok = 0;
else if (ia6->ia6_flags & IN6_IFF_DETACHED &&
!IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src))
{
/* Allow internal traffic to DETACHED addresses */
struct sockaddr_in6 sin6;
int s;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(sin6);
sin6.sin6_addr = ip6->ip6_src;
s = pserialize_read_enter();
addrok = (ifa_ifwithaddr(sin6tosa(&sin6)) != NULL);
pserialize_read_exit(s);
} else
addrok = 1;
if (addrok) {
/* this address is ready */
ours = 1;
deliverifp = ia6->ia_ifp; /* correct? */
goto hbhcheck;
} else {
/* address is not ready, so discard the packet. */
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
nd6log(LOG_INFO, "packet to an unready address %s->%s\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst));
IP6_STATINC(IP6_STAT_IDROPPED);
goto bad_unref;
}
}
/*
* FAITH (Firewall Aided Internet Translator)
*/
#if defined(NFAITH) && 0 < NFAITH
if (ip6_keepfaith) {
if (rt != NULL && rt->rt_ifp != NULL &&
rt->rt_ifp->if_type == IFT_FAITH) {
/* XXX do we need more sanity checks? */
ours = 1;
deliverifp = rt->rt_ifp; /* faith */
goto hbhcheck;
}
}
#endif
/*
* Now there is no reason to process the packet if it's not our own
* and we're not a router.
*/
if (!ip6_forwarding) {
IP6_STATINC(IP6_STAT_CANTFORWARD);
in6_ifstat_inc(rcvif, ifs6_in_discard);
goto bad_unref;
}
hbhcheck:
/*
* Record address information into m_tag, if we don't have one yet.
* Note that we are unable to record it, if the address is not listed
* as our interface address (e.g. multicast addresses, addresses
* within FAITH prefixes and such).
*/
if (deliverifp && ip6_getdstifaddr(m) == NULL) {
struct in6_ifaddr *ia6;
int s = pserialize_read_enter();
ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst);
/* Depends on ip6_setdstifaddr never sleeping */
if (ia6 != NULL && ip6_setdstifaddr(m, ia6) == NULL) {
/*
* XXX maybe we should drop the packet here,
* as we could not provide enough information
* to the upper layers.
*/
}
pserialize_read_exit(s);
}
/*
* Process Hop-by-Hop options header if it's contained.
* m may be modified in ip6_hopopts_input().
* If a JumboPayload option is included, plen will also be modified.
*/
plen = (u_int32_t)ntohs(ip6->ip6_plen);
if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
struct ip6_hbh *hbh;
if (ip6_hopopts_input(&plen, &rtalert, &m, &off)) {
/* m already freed */
in6_ifstat_inc(rcvif, ifs6_in_discard);
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
return;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
/*
* if the payload length field is 0 and the next header field
* indicates Hop-by-Hop Options header, then a Jumbo Payload
* option MUST be included.
*/
if (ip6->ip6_plen == 0 && plen == 0) {
/*
* Note that if a valid jumbo payload option is
* contained, ip6_hopopts_input() must set a valid
* (non-zero) payload length to the variable plen.
*/
IP6_STATINC(IP6_STAT_BADOPTIONS);
in6_ifstat_inc(rcvif, ifs6_in_discard);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
(char *)&ip6->ip6_plen - (char *)ip6);
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
return;
}
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
sizeof(struct ip6_hbh));
if (hbh == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
return;
}
KASSERT(ACCESSIBLE_POINTER(hbh, struct ip6_hdr));
nxt = hbh->ip6h_nxt;
/*
* accept the packet if a router alert option is included
* and we act as an IPv6 router.
*/
if (rtalert != ~0 && ip6_forwarding)
ours = 1;
} else
nxt = ip6->ip6_nxt;
/*
* Check that the amount of data in the buffers is at least as much as
* the IPv6 header would have us expect. Trim mbufs if longer than we
* expect. Drop packet if shorter than we expect.
*/
if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) {
IP6_STATINC(IP6_STAT_TOOSHORT);
in6_ifstat_inc(rcvif, ifs6_in_truncated);
goto bad_unref;
}
if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) {
if (m->m_len == m->m_pkthdr.len) {
m->m_len = sizeof(struct ip6_hdr) + plen;
m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
} else
m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len);
}
/*
* Forward if desirable.
*/
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
/*
* If we are acting as a multicast router, all
* incoming multicast packets are passed to the
* kernel-level multicast forwarding function.
* The packet is returned (relatively) intact; if
* ip6_mforward() returns a non-zero value, the packet
* must be discarded, else it may be accepted below.
*/
if (ip6_mrouter != NULL) {
int error;
SOFTNET_LOCK();
error = ip6_mforward(ip6, rcvif, m);
SOFTNET_UNLOCK();
if (error != 0) {
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
IP6_STATINC(IP6_STAT_CANTFORWARD);
goto bad;
}
}
if (!ours) {
IP6_STATINC(IP6_STAT_CANTFORWARD);
goto bad_unref;
}
} else if (!ours) {
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
ip6_forward(m, srcrt, rcvif);
return;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Malicious party may be able to use IPv4 mapped addr to confuse
* tcp/udp stack and bypass security checks (act as if it was from
* 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious.
*
* For SIIT end node behavior, you may want to disable the check.
* However, you will become vulnerable to attacks using IPv4 mapped
* source.
*/
if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
IP6_STATINC(IP6_STAT_BADSCOPE);
in6_ifstat_inc(rcvif, ifs6_in_addrerr);
goto bad_unref;
}
#ifdef IFA_STATS
if (deliverifp != NULL) {
struct in6_ifaddr *ia6;
int s = pserialize_read_enter();
ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst);
if (ia6)
ia6->ia_ifa.ifa_data.ifad_inbytes += m->m_pkthdr.len;
pserialize_read_exit(s);
}
#endif
IP6_STATINC(IP6_STAT_DELIVERED);
in6_ifstat_inc(deliverifp, ifs6_in_deliver);
nest = 0;
if (rt != NULL) {
rtcache_unref(rt, ro);
rt = NULL;
}
rtcache_percpu_putref(ip6_forward_rt_percpu);
rh_present = 0;
frg_present = 0;
while (nxt != IPPROTO_DONE) {
if (ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) {
IP6_STATINC(IP6_STAT_TOOMANYHDR);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
goto bad;
}
M_VERIFY_PACKET(m);
/*
* protection against faulty packet - there should be
* more sanity checks in header chain processing.
*/
if (m->m_pkthdr.len < off) {
IP6_STATINC(IP6_STAT_TOOSHORT);
in6_ifstat_inc(rcvif, ifs6_in_truncated);
goto bad;
}
if (nxt == IPPROTO_ROUTING) {
if (rh_present++) {
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
IP6_STATINC(IP6_STAT_BADOPTIONS);
goto bad;
}
} else if (nxt == IPPROTO_FRAGMENT) {
if (frg_present++) {
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
IP6_STATINC(IP6_STAT_BADOPTIONS);
goto bad;
}
}
#ifdef IPSEC
if (ipsec_used) {
/*
* Enforce IPsec policy checking if we are seeing last
* header. Note that we do not visit this with
* protocols with pcb layer code - like udp/tcp/raw ip.
*/
if ((inet6sw[ip6_protox[nxt]].pr_flags
& PR_LASTHDR) != 0) {
int error;
error = ipsec_ip_input_checkpolicy(m, false);
if (error) {
IP6_STATINC(IP6_STAT_IPSECDROP_IN);
goto bad;
}
}
}
#endif
nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
}
return;
bad_unref:
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
bad:
m_freem(m);
return;
}
static bool
ip6_badaddr(struct ip6_hdr *ip6)
{
/* Check against address spoofing/corruption. */
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) ||
IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) {
return true;
}
/*
* The following check is not documented in specs. A malicious
* party may be able to use IPv4 mapped addr to confuse tcp/udp stack
* and bypass security checks (act as if it was from 127.0.0.1 by using
* IPv6 src ::ffff:127.0.0.1). Be cautious.
*
* This check chokes if we are in an SIIT cloud. As none of the BSDs
* support IPv4-less kernel compilation, we cannot support SIIT
* environment at all. So, it makes more sense for us to reject any
* malicious packets for non-SIIT environment, than try to do a
* partial support for SIIT environment.
*/
if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
return true;
}
/*
* Reject packets with IPv4-compatible IPv6 addresses (RFC4291).
*/
if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) ||
IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) {
return true;
}
return false;
}
/*
* set/grab in6_ifaddr correspond to IPv6 destination address.
*/
static struct m_tag *
ip6_setdstifaddr(struct mbuf *m, const struct in6_ifaddr *ia)
{
struct m_tag *mtag;
struct ip6aux *ip6a;
mtag = ip6_addaux(m);
if (mtag == NULL)
return NULL;
ip6a = (struct ip6aux *)(mtag + 1);
if (in6_setscope(&ip6a->ip6a_src, ia->ia_ifp, &ip6a->ip6a_scope_id)) {
IP6_STATINC(IP6_STAT_BADSCOPE);
return NULL;
}
ip6a->ip6a_src = ia->ia_addr.sin6_addr;
ip6a->ip6a_flags = ia->ia6_flags;
return mtag;
}
const struct ip6aux *
ip6_getdstifaddr(struct mbuf *m)
{
struct m_tag *mtag;
mtag = ip6_findaux(m);
if (mtag != NULL)
return (struct ip6aux *)(mtag + 1);
else
return NULL;
}
/*
* Hop-by-Hop options header processing. If a valid jumbo payload option is
* included, the real payload length will be stored in plenp.
*
* rtalertp - XXX: should be stored in a smarter way
*/
int
ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp,
struct mbuf **mp, int *offp)
{
struct mbuf *m = *mp;
int off = *offp, hbhlen;
struct ip6_hbh *hbh;
/* validation of the length of the header */
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m,
sizeof(struct ip6_hdr), sizeof(struct ip6_hbh));
if (hbh == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return -1;
}
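/*
 * ip6h_len counts the header length in 8-octet units, not including
 * the first 8 octets, so the full Hop-by-Hop header occupies
 * (ip6h_len + 1) * 8 bytes.
 */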
hbhlen = (hbh->ip6h_len + 1) << 3;
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
hbhlen);
if (hbh == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return -1;
}
KASSERT(ACCESSIBLE_POINTER(hbh, struct ip6_hdr));
off += hbhlen;
hbhlen -= sizeof(struct ip6_hbh);
if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh),
hbhlen, rtalertp, plenp) < 0)
return -1;
*offp = off;
*mp = m;
return 0;
}
/*
* Search header for all Hop-by-hop options and process each option.
* This function is separate from ip6_hopopts_input() in order to
* handle a case where the sending node itself processes its hop-by-hop
* options header. In such a case, the function is called from ip6_output().
*
* The function assumes that the hbh header is located right after the IPv6
* header (RFC2460 p7), that opthead is a pointer into the data content of m,
* and that the region from opthead to opthead + hbhlen is in contiguous memory.
*/
static int
ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen,
u_int32_t *rtalertp, u_int32_t *plenp)
{
struct ip6_hdr *ip6;
int optlen = 0;
u_int8_t *opt = opthead;
u_int16_t rtalert_val;
u_int32_t jumboplen;
const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh);
for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) {
switch (*opt) {
case IP6OPT_PAD1:
optlen = 1;
break;
case IP6OPT_PADN:
if (hbhlen < IP6OPT_MINLEN) {
IP6_STATINC(IP6_STAT_TOOSMALL);
goto bad;
}
optlen = *(opt + 1) + 2;
break;
case IP6OPT_RTALERT:
/* XXX may need check for alignment */
if (hbhlen < IP6OPT_RTALERT_LEN) {
IP6_STATINC(IP6_STAT_TOOSMALL);
goto bad;
}
if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 1 - opthead);
return (-1);
}
optlen = IP6OPT_RTALERT_LEN;
memcpy((void *)&rtalert_val, (void *)(opt + 2), 2);
*rtalertp = ntohs(rtalert_val);
break;
case IP6OPT_JUMBO:
/* XXX may need check for alignment */
if (hbhlen < IP6OPT_JUMBO_LEN) {
IP6_STATINC(IP6_STAT_TOOSMALL);
goto bad;
}
if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 1 - opthead);
return (-1);
}
optlen = IP6OPT_JUMBO_LEN;
/*
* IPv6 packets that have non 0 payload length
* must not contain a jumbo payload option.
*/
ip6 = mtod(m, struct ip6_hdr *);
if (ip6->ip6_plen) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt - opthead);
return (-1);
}
/*
* We may see jumbolen in unaligned location, so
* we'd need to perform memcpy().
*/
memcpy(&jumboplen, opt + 2, sizeof(jumboplen));
jumboplen = (u_int32_t)htonl(jumboplen);
#if 1
/*
* if there are multiple jumbo payload options,
* *plenp will be non-zero and the packet will be
* rejected.
* The behavior may need some debate in ipngwg -
* multiple options do not make sense; however,
* there's no explicit mention in the specification.
*/
if (*plenp != 0) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 2 - opthead);
return (-1);
}
#endif
/*
* jumbo payload length must be larger than 65535.
*/
if (jumboplen <= IPV6_MAXPACKET) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 2 - opthead);
return (-1);
}
*plenp = jumboplen;
break;
default: /* unknown option */
if (hbhlen < IP6OPT_MINLEN) {
IP6_STATINC(IP6_STAT_TOOSMALL);
goto bad;
}
optlen = ip6_unknown_opt(opt, m,
erroff + opt - opthead);
if (optlen == -1)
return (-1);
optlen += 2;
break;
}
}
return (0);
bad:
m_freem(m);
return (-1);
}
/*
* Unknown option processing.
* The third argument `off' is the offset from the IPv6 header to the option,
* which is necessary to return an ICMPv6 error when the IPv6 header and
* the option header are not contiguous.
*/
int
ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off)
{
struct ip6_hdr *ip6;
switch (IP6OPT_TYPE(*optp)) {
case IP6OPT_TYPE_SKIP: /* ignore the option */
return ((int)*(optp + 1));
case IP6OPT_TYPE_DISCARD: /* silently discard */
m_freem(m);
return (-1);
case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off);
return (-1);
case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */
IP6_STATINC(IP6_STAT_BADOPTIONS);
ip6 = mtod(m, struct ip6_hdr *);
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
(m->m_flags & (M_BCAST|M_MCAST)))
m_freem(m);
else
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_OPTION, off);
return (-1);
}
m_freem(m); /* XXX: NOTREACHED */
return (-1);
}
void
ip6_savecontrol(struct inpcb *inp, struct mbuf **mp,
struct ip6_hdr *ip6, struct mbuf *m)
{
struct socket *so = inp->inp_socket;
#ifdef RFC2292
#define IS2292(x, y) ((inp->inp_flags & IN6P_RFC2292) ? (x) : (y))
#else
#define IS2292(x, y) (y)
#endif
KASSERT(m->m_flags & M_PKTHDR);
if (SOOPT_TIMESTAMP(so->so_options))
mp = sbsavetimestamp(so->so_options, mp);
/* some OSes call this logic with an IPv4 packet, for SO_TIMESTAMP */
if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION)
return;
/* RFC 2292 sec. 5 */
if ((inp->inp_flags & IN6P_PKTINFO) != 0) {
struct in6_pktinfo pi6;
memcpy(&pi6.ipi6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
in6_clearscope(&pi6.ipi6_addr); /* XXX */
pi6.ipi6_ifindex = m->m_pkthdr.rcvif_index;
*mp = sbcreatecontrol(&pi6, sizeof(pi6),
IS2292(IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
if (inp->inp_flags & IN6P_HOPLIMIT) {
int hlim = ip6->ip6_hlim & 0xff;
*mp = sbcreatecontrol(&hlim, sizeof(hlim),
IS2292(IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
if ((inp->inp_flags & IN6P_TCLASS) != 0) {
u_int32_t flowinfo;
int tclass;
flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK);
flowinfo >>= 20;
tclass = flowinfo & 0xff;
*mp = sbcreatecontrol(&tclass, sizeof(tclass),
IPV6_TCLASS, IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
/*
* IPV6_HOPOPTS socket option. Recall that we required super-user
* privilege for the option (see ip6_ctloutput), but it might be too
* strict, since there might be some hop-by-hop options which could
* safely be returned to a normal user.
* See also RFC3542 section 8 (or RFC2292 section 6).
*/
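/*
* A minimal userland sketch of the RFC3542 consumer side (hedged;
* error handling omitted): the application enables delivery with
*
* int on = 1;
* setsockopt(s, IPPROTO_IPV6, IPV6_RECVHOPOPTS, &on, sizeof(on));
*
* and the hop-by-hop header stored below is then returned by recvmsg()
* as a control message with cmsg_level IPPROTO_IPV6 and cmsg_type
* IPV6_HOPOPTS.
*/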
if ((inp->inp_flags & IN6P_HOPOPTS) != 0) {
/*
* Check if a hop-by-hop options header is contained in the
* received packet, and if so, store the options as ancillary
* data. Note that a hop-by-hop options header must be
* just after the IPv6 header, which is ensured by the
* IPv6 input processing.
*/
struct ip6_hdr *xip6 = mtod(m, struct ip6_hdr *);
if (xip6->ip6_nxt == IPPROTO_HOPOPTS) {
struct ip6_hbh *hbh;
int hbhlen;
struct mbuf *ext;
ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr),
xip6->ip6_nxt);
if (ext == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return;
}
hbh = mtod(ext, struct ip6_hbh *);
hbhlen = (hbh->ip6h_len + 1) << 3;
if (hbhlen != ext->m_len) {
m_freem(ext);
IP6_STATINC(IP6_STAT_TOOSHORT);
return;
}
/*
* XXX: We copy the whole header even if a jumbo
* payload option is included, although RFC 2292 says
* that option should be removed before returning.
* Note: this constraint was removed in RFC3542.
*/
*mp = sbcreatecontrol(hbh, hbhlen,
IS2292(IPV6_2292HOPOPTS, IPV6_HOPOPTS),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
m_freem(ext);
}
}
/* IPV6_DSTOPTS and IPV6_RTHDR socket options */
if (inp->inp_flags & (IN6P_DSTOPTS | IN6P_RTHDR)) {
struct ip6_hdr *xip6 = mtod(m, struct ip6_hdr *);
int nxt = xip6->ip6_nxt, off = sizeof(struct ip6_hdr);
/*
* Search for destination options headers or routing
* header(s) through the header chain, and store each
* header as ancillary data.
* Note that the order of the headers is preserved in
* the chain of ancillary data.
*/
for (;;) { /* is explicit loop prevention necessary? */
struct ip6_ext *ip6e = NULL;
int elen;
struct mbuf *ext = NULL;
/*
* if it is not an extension header, don't try to
* pull it from the chain.
*/
switch (nxt) {
case IPPROTO_DSTOPTS:
case IPPROTO_ROUTING:
case IPPROTO_HOPOPTS:
case IPPROTO_AH: /* is it possible? */
break;
default:
goto loopend;
}
ext = ip6_pullexthdr(m, off, nxt);
if (ext == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return;
}
ip6e = mtod(ext, struct ip6_ext *);
if (nxt == IPPROTO_AH)
elen = (ip6e->ip6e_len + 2) << 2;
else
elen = (ip6e->ip6e_len + 1) << 3;
if (elen != ext->m_len) {
m_freem(ext);
IP6_STATINC(IP6_STAT_TOOSHORT);
return;
}
KASSERT(ACCESSIBLE_POINTER(ip6e, struct ip6_hdr));
switch (nxt) {
case IPPROTO_DSTOPTS:
if (!(inp->inp_flags & IN6P_DSTOPTS))
break;
*mp = sbcreatecontrol(ip6e, elen,
IS2292(IPV6_2292DSTOPTS, IPV6_DSTOPTS),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
break;
case IPPROTO_ROUTING:
if (!(inp->inp_flags & IN6P_RTHDR))
break;
*mp = sbcreatecontrol(ip6e, elen,
IS2292(IPV6_2292RTHDR, IPV6_RTHDR),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
break;
case IPPROTO_HOPOPTS:
case IPPROTO_AH: /* is it possible? */
break;
default:
/*
* Other cases have been filtered out above, so none
* should reach this point. The code is supplied
* just in case (e.g. nxt gets overwritten or some
* other unexpected condition).
*/
m_freem(ext);
goto loopend;
}
/* proceed with the next header. */
off += elen;
nxt = ip6e->ip6e_nxt;
ip6e = NULL;
m_freem(ext);
ext = NULL;
}
loopend:
;
}
}
#undef IS2292
void
ip6_notify_pmtu(struct inpcb *inp, const struct sockaddr_in6 *dst,
uint32_t *mtu)
{
struct socket *so;
struct mbuf *m_mtu;
struct ip6_mtuinfo mtuctl;
so = inp->inp_socket;
if (mtu == NULL)
return;
KASSERT(so != NULL);
memset(&mtuctl, 0, sizeof(mtuctl)); /* zero-clear for safety */
mtuctl.ip6m_mtu = *mtu;
mtuctl.ip6m_addr = *dst;
if (sa6_recoverscope(&mtuctl.ip6m_addr))
return;
if ((m_mtu = sbcreatecontrol(&mtuctl, sizeof(mtuctl),
IPV6_PATHMTU, IPPROTO_IPV6)) == NULL)
return;
if (sbappendaddr(&so->so_rcv, (const struct sockaddr *)dst, NULL, m_mtu)
== 0) {
soroverflow(so);
m_freem(m_mtu);
} else
sorwakeup(so);
return;
}
/*
* Pull a single extension header from the mbuf chain. Returns a single mbuf
* that contains the result, or NULL on error.
*/
static struct mbuf *
ip6_pullexthdr(struct mbuf *m, size_t off, int nxt)
{
struct ip6_ext ip6e;
size_t elen;
struct mbuf *n;
if (off + sizeof(ip6e) > m->m_pkthdr.len)
return NULL;
m_copydata(m, off, sizeof(ip6e), (void *)&ip6e);
if (nxt == IPPROTO_AH)
elen = (ip6e.ip6e_len + 2) << 2;
else
elen = (ip6e.ip6e_len + 1) << 3;
if (off + elen > m->m_pkthdr.len)
return NULL;
MGET(n, M_DONTWAIT, MT_DATA);
if (n && elen >= MLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
n = NULL;
}
}
if (!n)
return NULL;
n->m_len = 0;
if (elen >= M_TRAILINGSPACE(n)) {
m_free(n);
return NULL;
}
m_copydata(m, off, elen, mtod(n, void *));
n->m_len = elen;
return n;
}
/*
* Get the offset of the header that immediately precedes the header
* currently being processed.
*/
int
ip6_get_prevhdr(struct mbuf *m, int off)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
if (off == sizeof(struct ip6_hdr)) {
return offsetof(struct ip6_hdr, ip6_nxt);
} else if (off < sizeof(struct ip6_hdr)) {
panic("%s: off < sizeof(struct ip6_hdr)", __func__);
} else {
int len, nlen, nxt;
struct ip6_ext ip6e;
nxt = ip6->ip6_nxt;
len = sizeof(struct ip6_hdr);
nlen = 0;
while (len < off) {
m_copydata(m, len, sizeof(ip6e), &ip6e);
switch (nxt) {
case IPPROTO_FRAGMENT:
nlen = sizeof(struct ip6_frag);
break;
case IPPROTO_AH:
nlen = (ip6e.ip6e_len + 2) << 2;
break;
default:
nlen = (ip6e.ip6e_len + 1) << 3;
break;
}
len += nlen;
nxt = ip6e.ip6e_nxt;
}
return (len - nlen);
}
}
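/*
* Worked example (hedged): for a packet consisting of the IPv6 header,
* an 8-byte hop-by-hop header and a fragment header, calling this with
* off == sizeof(struct ip6_hdr) + 8 steps len from 40 to 48 and returns
* 40, i.e. the offset of the hop-by-hop header that immediately precedes
* the fragment header.
*/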
/*
* get next header offset. m will be retained.
*/
int
ip6_nexthdr(struct mbuf *m, int off, int proto, int *nxtp)
{
struct ip6_hdr ip6;
struct ip6_ext ip6e;
struct ip6_frag fh;
/* just in case */
if (m == NULL)
panic("%s: m == NULL", __func__); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off)
return -1;
switch (proto) {
case IPPROTO_IPV6:
/* do not chase beyond intermediate IPv6 headers */
if (off != 0)
return -1;
if (m->m_pkthdr.len < off + sizeof(ip6))
return -1;
m_copydata(m, off, sizeof(ip6), (void *)&ip6);
if (nxtp) *nxtp = ip6.ip6_nxt;
off += sizeof(ip6);
return off;
case IPPROTO_FRAGMENT:
/*
* terminate parsing if it is not the first fragment;
* it does not make sense to parse through it.
*/
if (m->m_pkthdr.len < off + sizeof(fh))
return -1;
m_copydata(m, off, sizeof(fh), (void *)&fh);
if ((fh.ip6f_offlg & IP6F_OFF_MASK) != 0)
return -1;
if (nxtp) *nxtp = fh.ip6f_nxt;
off += sizeof(struct ip6_frag);
return off;
case IPPROTO_AH:
if (m->m_pkthdr.len < off + sizeof(ip6e))
return -1;
m_copydata(m, off, sizeof(ip6e), (void *)&ip6e);
if (nxtp) *nxtp = ip6e.ip6e_nxt;
off += (ip6e.ip6e_len + 2) << 2;
if (m->m_pkthdr.len < off)
return -1;
return off;
case IPPROTO_HOPOPTS:
case IPPROTO_ROUTING:
case IPPROTO_DSTOPTS:
if (m->m_pkthdr.len < off + sizeof(ip6e))
return -1;
m_copydata(m, off, sizeof(ip6e), (void *)&ip6e);
if (nxtp) *nxtp = ip6e.ip6e_nxt;
off += (ip6e.ip6e_len + 1) << 3;
if (m->m_pkthdr.len < off)
return -1;
return off;
case IPPROTO_NONE:
case IPPROTO_ESP:
case IPPROTO_IPCOMP:
/* give up */
return -1;
default:
return -1;
}
}
/*
* get offset for the last header in the chain. m will be kept untainted.
*/
int
ip6_lasthdr(struct mbuf *m, int off, int proto, int *nxtp)
{
int newoff;
int nxt;
if (!nxtp) {
nxt = -1;
nxtp = &nxt;
}
for (;;) {
newoff = ip6_nexthdr(m, off, proto, nxtp);
if (newoff < 0)
return off;
else if (newoff < off)
return -1; /* invalid */
else if (newoff == off)
return newoff;
off = newoff;
proto = *nxtp;
}
}
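/*
* A minimal usage sketch (hedged, assuming a packet header mbuf that
* starts with the IPv6 header):
*
* int nxt;
* int off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
* if (off >= 0 && nxt == IPPROTO_TCP)
* ... the TCP header starts at offset off ...
*
* Since ip6_lasthdr() iterates ip6_nexthdr() until it stops making
* progress, off ends up at the first header that is not an extension
* header known to the parser, and nxt holds its protocol number.
*/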
static struct m_tag *
ip6_addaux(struct mbuf *m)
{
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_INET6);
if (!mtag) {
mtag = m_tag_get(PACKET_TAG_INET6, sizeof(struct ip6aux),
M_NOWAIT);
if (mtag) {
m_tag_prepend(m, mtag);
memset(mtag + 1, 0, sizeof(struct ip6aux));
}
}
return mtag;
}
static struct m_tag *
ip6_findaux(struct mbuf *m)
{
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_INET6);
return mtag;
}
static void
ip6_delaux(struct mbuf *m)
{
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_INET6);
if (mtag)
m_tag_delete(m, mtag);
}
/*
* System control for IP6
*/
const u_char inet6ctlerrmap[PRC_NCMDS] = {
0, 0, 0, 0,
0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
EMSGSIZE, EHOSTUNREACH, 0, 0,
0, 0, 0, 0,
ENOPROTOOPT
};
extern int sysctl_net_inet6_addrctlpolicy(SYSCTLFN_ARGS);
static int
sysctl_net_inet6_ip6_stats(SYSCTLFN_ARGS)
{
return (NETSTAT_SYSCTL(ip6stat_percpu, IP6_NSTATS));
}
static void
sysctl_net_inet6_ip6_setup(struct sysctllog **clog)
{
const struct sysctlnode *ip6_node;
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6",
SYSCTL_DESCR("PF_INET6 related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, &ip6_node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "ip6",
SYSCTL_DESCR("IPv6 related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "forwarding",
SYSCTL_DESCR("Enable forwarding of INET6 datagrams"),
NULL, 0, &ip6_forwarding, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_FORWARDING, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "redirect",
SYSCTL_DESCR("Enable sending of ICMPv6 redirect messages"),
NULL, 0, &ip6_sendredirects, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_SENDREDIRECTS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "hlim",
SYSCTL_DESCR("Hop limit for an INET6 datagram"),
NULL, 0, &ip6_defhlim, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_DEFHLIM, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxfragpackets",
SYSCTL_DESCR("Maximum number of fragments to buffer "
"for reassembly"),
NULL, 0, &ip6_maxfragpackets, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_MAXFRAGPACKETS, CTL_EOL);
pktq_sysctl_setup(ip6_pktq, clog, ip6_node, IPV6CTL_IFQ);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "keepfaith",
SYSCTL_DESCR("Activate faith interface"),
NULL, 0, &ip6_keepfaith, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_KEEPFAITH, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "log_interval",
SYSCTL_DESCR("Minimum interval between logging "
"unroutable packets"),
NULL, 0, &ip6_log_interval, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_LOG_INTERVAL, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "hdrnestlimit",
SYSCTL_DESCR("Maximum number of nested IPv6 headers"),
NULL, 0, &ip6_hdrnestlimit, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_HDRNESTLIMIT, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "dad_count",
SYSCTL_DESCR("Number of Duplicate Address Detection "
"probes to send"),
NULL, 0, &ip6_dad_count, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_DAD_COUNT, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "auto_flowlabel",
SYSCTL_DESCR("Assign random IPv6 flow labels"),
NULL, 0, &ip6_auto_flowlabel, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_AUTO_FLOWLABEL, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "defmcasthlim",
SYSCTL_DESCR("Default multicast hop limit"),
NULL, 0, &ip6_defmcasthlim, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_DEFMCASTHLIM, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "kame_version",
SYSCTL_DESCR("KAME Version"),
NULL, 0, __UNCONST(__KAME_VERSION), 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_KAME_VERSION, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "use_deprecated",
SYSCTL_DESCR("Allow use of deprecated addresses as "
"source addresses"),
NULL, 0, &ip6_use_deprecated, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_USE_DEPRECATED, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT
#ifndef INET6_BINDV6ONLY
|CTLFLAG_READWRITE,
#endif
CTLTYPE_INT, "v6only",
SYSCTL_DESCR("Disallow PF_INET6 sockets from connecting "
"to PF_INET sockets"),
NULL, 0, &ip6_v6only, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_V6ONLY, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "anonportmin",
SYSCTL_DESCR("Lowest ephemeral port number to assign"),
sysctl_net_inet_ip_ports, 0, &ip6_anonportmin, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_ANONPORTMIN, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "anonportmax",
SYSCTL_DESCR("Highest ephemeral port number to assign"),
sysctl_net_inet_ip_ports, 0, &ip6_anonportmax, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_ANONPORTMAX, CTL_EOL);
#ifndef IPNOPRIVPORTS
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "lowportmin",
SYSCTL_DESCR("Lowest privileged ephemeral port number "
"to assign"),
sysctl_net_inet_ip_ports, 0, &ip6_lowportmin, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_LOWPORTMIN, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "lowportmax",
SYSCTL_DESCR("Highest privileged ephemeral port number "
"to assign"),
sysctl_net_inet_ip_ports, 0, &ip6_lowportmax, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_LOWPORTMAX, CTL_EOL);
#endif /* IPNOPRIVPORTS */
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "auto_linklocal",
SYSCTL_DESCR("Default value of per-interface flag for "
"adding an IPv6 link-local address to "
"interfaces when attached"),
NULL, 0, &ip6_auto_linklocal, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_AUTO_LINKLOCAL, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_STRUCT, "addctlpolicy",
SYSCTL_DESCR("Return the current address control"
" policy"),
sysctl_net_inet6_addrctlpolicy, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_ADDRCTLPOLICY, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "prefer_tempaddr",
SYSCTL_DESCR("Prefer temporary address as source "
"address"),
NULL, 0, &ip6_prefer_tempaddr, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxfrags",
SYSCTL_DESCR("Maximum fragments in reassembly queue"),
NULL, 0, &ip6_maxfrags, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_MAXFRAGS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("IPv6 statistics"),
sysctl_net_inet6_ip6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_STATS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "use_defaultzone",
SYSCTL_DESCR("Whether to use the default scope zones"),
NULL, 0, &ip6_use_defzone, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_USE_DEFAULTZONE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "mcast_pmtu",
SYSCTL_DESCR("Enable pMTU discovery for multicast packet"),
NULL, 0, &ip6_mcast_pmtu, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
/* anonportalgo RFC6056 subtree */
const struct sysctlnode *portalgo_node;
sysctl_createv(clog, 0, NULL, &portalgo_node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "anonportalgo",
SYSCTL_DESCR("Anonymous port algorithm selection (RFC 6056)"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &portalgo_node, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "available",
SYSCTL_DESCR("available algorithms"),
sysctl_portalgo_available, 0, NULL, PORTALGO_MAXLEN,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &portalgo_node, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRING, "selected",
SYSCTL_DESCR("selected algorithm"),
sysctl_portalgo_selected6, 0, NULL, PORTALGO_MAXLEN,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &portalgo_node, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT, "reserve",
SYSCTL_DESCR("bitmap of reserved ports"),
sysctl_portalgo_reserve6, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "neighborgcthresh",
SYSCTL_DESCR("Maximum number of entries in neighbor"
" cache"),
NULL, 1, &ip6_neighborgcthresh, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxdynroutes",
SYSCTL_DESCR("Maximum number of routes created via"
" redirect"),
NULL, 1, &ip6_maxdynroutes, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "param_rt_msg",
SYSCTL_DESCR("How to send parameter changing"
" routing message"),
NULL, 0, &ip6_param_rt_msg, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
}
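/*
* The nodes created above live under net.inet6.ip6; a hedged example of
* reaching them from userland with sysctl(8):
*
* sysctl net.inet6.ip6.forwarding
* sysctl -w net.inet6.ip6.hlim=64 (read-write nodes only)
*/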
void
ip6_statinc(u_int stat)
{
KASSERT(stat < IP6_NSTATS);
IP6_STATINC(stat);
}
/* $NetBSD: ptyfs_subr.c,v 1.34 2020/11/27 14:43:57 christos Exp $ */
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ptyfs_subr.c 8.6 (Berkeley) 5/14/95
*/
/*
* Copyright (c) 1994 Christopher G. Demetriou. All rights reserved.
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_subr.c 8.6 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ptyfs_subr.c,v 1.34 2020/11/27 14:43:57 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/select.h>
#include <sys/tty.h>
#include <sys/pty.h>
#include <sys/kauth.h>
#include <sys/lwp.h>
#include <fs/ptyfs/ptyfs.h>
static kmutex_t ptyfs_hashlock;
static SLIST_HEAD(ptyfs_hashhead, ptyfsnode) *ptyfs_node_tbl;
static u_long ptyfs_node_mask; /* size of hash table - 1 */
/*
* allocate a ptyfsnode/vnode pair. the vnode is referenced.
*
* the pty, ptyfs_type, and mount point uniquely
* identify a ptyfsnode. the mount point is needed
* because someone might mount this filesystem
* twice.
*/
int
ptyfs_allocvp(struct mount *mp, struct vnode **vpp, ptyfstype type, int pty)
{
struct ptyfskey key;
memset(&key, 0, sizeof(key));
key.ptk_pty = pty;
key.ptk_type = type;
return vcache_get(mp, &key, sizeof(key), vpp);
}
/*
* Initialize ptyfsnode hash table.
*/
void
ptyfs_hashinit(void)
{
ptyfs_node_tbl = hashinit(16, HASH_SLIST, true, &ptyfs_node_mask);
mutex_init(&ptyfs_hashlock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* Free ptyfsnode hash table.
*/
void
ptyfs_hashdone(void)
{
mutex_destroy(&ptyfs_hashlock);
hashdone(ptyfs_node_tbl, HASH_SLIST, ptyfs_node_mask);
}
/*
* Get a ptyfsnode from the hash table, or allocate one.
*/
struct ptyfsnode *
ptyfs_get_node(ptyfstype type, int pty)
{
struct ptyfs_hashhead *ppp;
struct ptyfsnode *pp;
ppp = &ptyfs_node_tbl[PTYFS_FILENO(type, pty) & ptyfs_node_mask];
mutex_enter(&ptyfs_hashlock);
SLIST_FOREACH(pp, ppp, ptyfs_hash) {
if (pty == pp->ptyfs_pty && pp->ptyfs_type == type) {
mutex_exit(&ptyfs_hashlock);
return pp;
}
}
mutex_exit(&ptyfs_hashlock);
pp = malloc(sizeof(struct ptyfsnode), M_TEMP, M_WAITOK);
pp->ptyfs_pty = pty;
pp->ptyfs_type = type;
pp->ptyfs_fileno = PTYFS_FILENO(type, pty);
if (pp->ptyfs_type == PTYFSroot)
pp->ptyfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|
S_IROTH|S_IXOTH;
else
pp->ptyfs_mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|
S_IROTH|S_IWOTH;
pp->ptyfs_uid = pp->ptyfs_gid = 0;
pp->ptyfs_status = PTYFS_CHANGE;
PTYFS_ITIMES(pp, NULL, NULL, NULL);
pp->ptyfs_birthtime = pp->ptyfs_mtime =
pp->ptyfs_atime = pp->ptyfs_ctime;
pp->ptyfs_flags = 0;
mutex_enter(&ptyfs_hashlock);
/*
* XXX There is a small race condition when the master side is opened
* for the first time, if other threads, through other mount points,
* try to open the same device. As a result there is a small chance
* of ending up with unused list entries.
*/
SLIST_INSERT_HEAD(ppp, pp, ptyfs_hash);
mutex_exit(&ptyfs_hashlock);
return pp;
}
/*
* Mark this controlling pty as active.
*/
void
ptyfs_set_active(struct mount *mp, int pty)
{
struct ptyfsmount *pmnt = VFSTOPTY(mp);
KASSERT(pty >= 0);
/* Reallocate map if needed. */
if (pty >= pmnt->pmnt_bitmap_size * NBBY) {
int osize, nsize;
uint8_t *obitmap, *nbitmap;
nsize = roundup(howmany(pty + 1, NBBY), 64);
nbitmap = kmem_alloc(nsize, KM_SLEEP);
mutex_enter(&pmnt->pmnt_lock);
if (pty < pmnt->pmnt_bitmap_size * NBBY) {
mutex_exit(&pmnt->pmnt_lock);
kmem_free(nbitmap, nsize);
} else {
osize = pmnt->pmnt_bitmap_size;
obitmap = pmnt->pmnt_bitmap;
pmnt->pmnt_bitmap_size = nsize;
pmnt->pmnt_bitmap = nbitmap;
if (osize > 0)
memcpy(pmnt->pmnt_bitmap, obitmap, osize);
memset(pmnt->pmnt_bitmap + osize, 0, nsize - osize);
mutex_exit(&pmnt->pmnt_lock);
if (osize > 0)
kmem_free(obitmap, osize);
}
}
mutex_enter(&pmnt->pmnt_lock);
setbit(pmnt->pmnt_bitmap, pty);
mutex_exit(&pmnt->pmnt_lock);
}
/*
* Mark this controlling pty as inactive.
*/
void
ptyfs_clr_active(struct mount *mp, int pty)
{
struct ptyfsmount *pmnt = VFSTOPTY(mp);
KASSERT(pty >= 0);
mutex_enter(&pmnt->pmnt_lock);
if (pty >= 0 && pty < pmnt->pmnt_bitmap_size * NBBY)
clrbit(pmnt->pmnt_bitmap, pty);
mutex_exit(&pmnt->pmnt_lock);
}
/*
* Look up the next active controlling pty greater than or equal to "pty".
* Return -1 if none is found.
*/
int
ptyfs_next_active(struct mount *mp, int pty)
{
struct ptyfsmount *pmnt = VFSTOPTY(mp);
KASSERT(pty >= 0);
mutex_enter(&pmnt->pmnt_lock);
while (pty < pmnt->pmnt_bitmap_size * NBBY) {
if (isset(pmnt->pmnt_bitmap, pty)) {
mutex_exit(&pmnt->pmnt_lock);
return pty;
}
pty++;
}
mutex_exit(&pmnt->pmnt_lock);
return -1;
}
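/*
* A hedged usage sketch: a readdir-style walk over all active ptys on a
* mount would look like
*
* for (pty = ptyfs_next_active(mp, 0); pty != -1;
* pty = ptyfs_next_active(mp, pty + 1))
* ... emit an entry for pty ...
*/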
/* $NetBSD: procfs_subr.c,v 1.117 2024/01/17 10:20:12 hannken Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_subr.c 8.6 (Berkeley) 5/14/95
*/
/*
* Copyright (c) 1994 Christopher G. Demetriou. All rights reserved.
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_subr.c 8.6 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: procfs_subr.c,v 1.117 2024/01/17 10:20:12 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/fstrans.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <miscfs/procfs/procfs.h>
/*
* Allocate a pfsnode/vnode pair. The vnode is referenced.
* The pid, type, and file descriptor uniquely identify a pfsnode.
*/
int
procfs_allocvp(struct mount *mp, struct vnode **vpp, pid_t pid,
pfstype type, int fd)
{
struct pfskey key;
memset(&key, 0, sizeof(key));
key.pk_type = type;
key.pk_pid = pid;
key.pk_fd = fd;
return vcache_get(mp, &key, sizeof(key), vpp);
}
int
procfs_rw(void *v)
{
struct vop_read_args *ap = v;
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *curl;
struct lwp *l;
struct pfsnode *pfs = VTOPFS(vp);
struct proc *p;
int error;
if (uio->uio_offset < 0)
return EINVAL;
if ((error =
procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH)) != 0)
return error;
curl = curlwp;
/*
* Do not allow init to be modified while in secure mode; it
* could be duped into changing the security level.
*/
#define M2K(m) ((m) == UIO_READ ? KAUTH_REQ_PROCESS_PROCFS_READ : \
KAUTH_REQ_PROCESS_PROCFS_WRITE)
mutex_enter(p->p_lock);
error = kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_PROCFS,
p, pfs, KAUTH_ARG(M2K(uio->uio_rw)), NULL);
mutex_exit(p->p_lock);
if (error) {
procfs_proc_unlock(p);
return (error);
}
#undef M2K
mutex_enter(p->p_lock);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (l->l_stat != LSZOMB)
break;
}
/* Process is exiting if there are no LWPs or all LWPs are LSZOMB */
if (l == NULL) {
mutex_exit(p->p_lock);
procfs_proc_unlock(p);
return ESRCH;
}
lwp_addref(l);
mutex_exit(p->p_lock);
switch (pfs->pfs_type) {
case PFSnote:
case PFSnotepg:
error = procfs_donote(curl, p, pfs, uio);
break;
case PFSregs:
error = procfs_doregs(curl, l, pfs, uio);
break;
case PFSfpregs:
error = procfs_dofpregs(curl, l, pfs, uio);
break;
case PFSstatus:
error = procfs_dostatus(curl, l, pfs, uio);
break;
case PFSstat:
error = procfs_do_pid_stat(curl, l, pfs, uio);
break;
case PFSlimit:
error = procfs_dolimit(curl, p, pfs, uio);
break;
case PFSmap:
error = procfs_domap(curl, p, pfs, uio, 0);
break;
case PFSmaps:
error = procfs_domap(curl, p, pfs, uio, 1);
break;
case PFSmem:
error = procfs_domem(curl, l, pfs, uio);
break;
case PFScmdline:
error = procfs_doprocargs(curl, p, pfs, uio, KERN_PROC_ARGV);
break;
case PFSenviron:
error = procfs_doprocargs(curl, p, pfs, uio, KERN_PROC_ENV);
break;
case PFSmeminfo:
error = procfs_domeminfo(curl, p, pfs, uio);
break;
case PFSdevices:
error = procfs_dodevices(curl, p, pfs, uio);
break;
case PFScpuinfo:
error = procfs_docpuinfo(curl, p, pfs, uio);
break;
case PFScpustat:
error = procfs_docpustat(curl, p, pfs, uio);
break;
case PFSloadavg:
error = procfs_doloadavg(curl, p, pfs, uio);
break;
case PFSstatm:
error = procfs_do_pid_statm(curl, l, pfs, uio);
break;
case PFSfd:
error = procfs_dofd(curl, p, pfs, uio);
break;
case PFSuptime:
error = procfs_douptime(curl, p, pfs, uio);
break;
case PFSmounts:
error = procfs_domounts(curl, p, pfs, uio);
break;
case PFSemul:
error = procfs_doemul(curl, p, pfs, uio);
break;
case PFSversion:
error = procfs_doversion(curl, p, pfs, uio);
break;
case PFSauxv:
error = procfs_doauxv(curl, p, pfs, uio);
break;
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODETYPE_CASES
error = procfs_machdep_rw(curl, l, pfs, uio);
break;
#endif
default:
error = EOPNOTSUPP;
break;
}
/*
* Release the references that we acquired earlier.
*/
lwp_delref(l);
procfs_proc_unlock(p);
return (error);
}
/*
* Get a string from userland into (bf). Strip a trailing
* nl character (to allow easy access from the shell).
* The buffer should be *buflenp + 1 chars long. vfs_getuserstr
* will automatically add a nul char at the end.
*
* Returns 0 on success, or one of the following errors:
*
* EINVAL: the file offset is non-zero.
* EMSGSIZE: the message is longer than the kernel buffer.
* EFAULT: the user i/o buffer is not addressable.
*/
int
vfs_getuserstr(struct uio *uio, char *bf, int *buflenp)
{
size_t xlen;
int error;
if (uio->uio_offset != 0)
return (EINVAL);
xlen = *buflenp;
/* must be able to read the whole string in one go */
if (xlen < uio->uio_resid)
return (EMSGSIZE);
xlen = uio->uio_resid;
if ((error = uiomove(bf, xlen, uio)) != 0)
return (error);
/* allow multiple writes without seeks */
uio->uio_offset = 0;
/* cleanup string and remove trailing newline */
bf[xlen] = '\0';
xlen = strlen(bf);
if (xlen > 0 && bf[xlen-1] == '\n')
bf[--xlen] = '\0';
*buflenp = xlen;
return (0);
}
const vfs_namemap_t *
vfs_findname(const vfs_namemap_t *nm, const char *bf, int buflen)
{
for (; nm->nm_name; nm++)
if (memcmp(bf, nm->nm_name, buflen+1) == 0)
return (nm);
return (0);
}
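/*
* A minimal sketch of how the two helpers above are meant to be combined
* (hedged; namemap, msg and its size are placeholders supplied by the
* caller):
*
* char msg[64];
* int xlen = sizeof(msg) - 1;
* const vfs_namemap_t *nm;
*
* if ((error = vfs_getuserstr(uio, msg, &xlen)) != 0)
* return error;
* if ((nm = vfs_findname(namemap, msg, xlen)) == NULL)
* return EOPNOTSUPP;
* ... act on the matched entry nm ...
*/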
bool
procfs_use_linux_compat(struct mount *mp)
{
const int flags = VFSTOPROC(mp)->pmnt_flags;
return (flags & PROCFSMNT_LINUXCOMPAT) ? true : false;
}
struct proc *
procfs_proc_find(struct mount *mp, pid_t pid)
{
KASSERT(mutex_owned(&proc_lock));
return procfs_use_linux_compat(mp) ? proc_find_lwpid(pid) : proc_find(pid);
}
int
procfs_proc_lock(struct mount *mp, int pid, struct proc **bunghole,
int notfound)
{
struct proc *tp;
int error = 0;
mutex_enter(&proc_lock);
if (pid == 0)
tp = &proc0;
else if ((tp = procfs_proc_find(mp, pid)) == NULL)
error = notfound;
if (tp != NULL && !rw_tryenter(&tp->p_reflock, RW_READER))
error = EBUSY;
mutex_exit(&proc_lock);
*bunghole = tp;
return error;
}
void
procfs_proc_unlock(struct proc *p)
{
rw_exit(&p->p_reflock);
}
int
procfs_doemul(struct lwp *curl, struct proc *p,
struct pfsnode *pfs, struct uio *uio)
{
const char *ename = p->p_emul->e_name;
return uiomove_frombuf(__UNCONST(ename), strlen(ename), uio);
}
/* $NetBSD: cpu.h,v 1.72 2023/09/04 20:58:52 mrg Exp $ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)cpu.h 5.4 (Berkeley) 5/9/91
*/
#ifndef _AMD64_CPU_H_
#define _AMD64_CPU_H_
#ifdef __x86_64__
#include <x86/cpu.h>
#ifdef _KERNEL
#if defined(__GNUC__) && !defined(_MODULE)
static struct cpu_info *x86_curcpu(void);
static lwp_t *x86_curlwp(void);
/*
* XXXGCC12 has:
* ./machine/cpu.h:57:9: error: array subscript 0 is outside array bounds of 'struct cpu_info * const[0]' [-Werror=array-bounds]
* 56 | __asm("movq %%gs:%1, %0" :
*/
#pragma GCC push_options
#pragma GCC diagnostic ignored "-Warray-bounds"
__inline __always_inline static struct cpu_info * __unused __nomsan
x86_curcpu(void)
{
struct cpu_info *ci;
__asm("movq %%gs:%1, %0" :
"=r" (ci) :
"m"
(*(struct cpu_info * const *)offsetof(struct cpu_info, ci_self)));
return ci;
}
__inline static lwp_t * __unused __nomsan __attribute__ ((const))
x86_curlwp(void)
{
lwp_t *l;
__asm("movq %%gs:%1, %0" :
"=r" (l) :
"m"
(*(struct cpu_info * const *)offsetof(struct cpu_info, ci_curlwp)));
return l;
}
#pragma GCC pop_options
#endif /* __GNUC__ && !_MODULE */
#ifdef XENPV
#define CLKF_USERMODE(frame) (curcpu()->ci_xen_clockf_usermode)
#define CLKF_PC(frame) (curcpu()->ci_xen_clockf_pc)
#else /* XENPV */
#define CLKF_USERMODE(frame) USERMODE((frame)->cf_if.if_tf.tf_cs)
#define CLKF_PC(frame) ((frame)->cf_if.if_tf.tf_rip)
#endif /* XENPV */
#define CLKF_INTR(frame) (curcpu()->ci_idepth > 0)
#define LWP_PC(l) ((l)->l_md.md_regs->tf_rip)
void *cpu_uarea_alloc(bool);
bool cpu_uarea_free(void *);
#endif /* _KERNEL */
#else /* __x86_64__ */
#include <i386/cpu.h>
#endif /* __x86_64__ */
#endif /* !_AMD64_CPU_H_ */
/* $NetBSD: fpu.c,v 1.87 2023/07/18 12:34:25 riastradh Exp $ */
/*
* Copyright (c) 2008, 2019 The NetBSD Foundation, Inc. All
* rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran and Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1991 The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)npx.c 7.2 (Berkeley) 5/12/91
*/
/*
* Copyright (c) 1994, 1995, 1998 Charles M. Hannum. All rights reserved.
* Copyright (c) 1990 William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)npx.c 7.2 (Berkeley) 5/12/91
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: fpu.c,v 1.87 2023/07/18 12:34:25 riastradh Exp $");
#include "opt_ddb.h"
#include "opt_multiprocessor.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/xcall.h>
#include <machine/cpu.h>
#include <machine/cpuvar.h>
#include <machine/cputypes.h>
#include <machine/intr.h>
#include <machine/cpufunc.h>
#include <machine/pcb.h>
#include <machine/trap.h>
#include <machine/specialreg.h>
#include <x86/cpu.h>
#include <x86/fpu.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef XENPV
#define clts() HYPERVISOR_fpu_taskswitch(0)
#define stts() HYPERVISOR_fpu_taskswitch(1)
#endif
void fpu_handle_deferred(void);
void fpu_switch(struct lwp *, struct lwp *);
uint32_t x86_fpu_mxcsr_mask __read_mostly = 0;
static inline union savefpu *
fpu_lwp_area(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
union savefpu *area = &pcb->pcb_savefpu;
KASSERT((l->l_flag & LW_SYSTEM) == 0);
if (l == curlwp) {
fpu_save();
}
KASSERT(!(l->l_md.md_flags & MDL_FPU_IN_CPU));
return area;
}
static inline void
fpu_save_lwp(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
union savefpu *area = &pcb->pcb_savefpu;
int s;
s = splvm();
if (l->l_md.md_flags & MDL_FPU_IN_CPU) {
KASSERT((l->l_flag & LW_SYSTEM) == 0);
fpu_area_save(area, x86_xsave_features,
!(l->l_proc->p_flag & PK_32));
l->l_md.md_flags &= ~MDL_FPU_IN_CPU;
}
splx(s);
}
/*
* Bring curlwp's FPU state in memory. It will get installed back in the CPU
* when returning to userland.
*/
void
fpu_save(void)
{
fpu_save_lwp(curlwp);
}
void
fpuinit(struct cpu_info *ci)
{
/*
* This might not be strictly necessary since it will be initialized
* for each process. However it does no harm.
*/
clts();
fninit();
stts();
}
void
fpuinit_mxcsr_mask(void)
{
#ifndef XENPV
union savefpu fpusave __aligned(64);
u_long psl;
memset(&fpusave, 0, sizeof(fpusave));
/* Disable interrupts, and enable FPU */
psl = x86_read_psl();
x86_disable_intr();
clts();
/* Fill in the FPU area */
fxsave(&fpusave);
/* Restore previous state */
stts();
x86_write_psl(psl);
if (fpusave.sv_xmm.fx_mxcsr_mask == 0) {
x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__;
} else {
x86_fpu_mxcsr_mask = fpusave.sv_xmm.fx_mxcsr_mask;
}
#else
/*
* XXX XXX XXX: On Xen the FXSAVE above faults. That's because
* &fpusave is not 16-byte aligned. Stack alignment problem
* somewhere, it seems.
*/
x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__;
#endif
}
static inline void
fpu_errata_amd(void)
{
uint16_t sw;
/*
* AMD FPUs do not restore FIP, FDP, and FOP on fxrstor and xrstor
* when FSW.ES=0, leaking other threads' execution history.
*
* Clear them manually by loading a zero (fldummy). We do this
* unconditionally, regardless of FSW.ES.
*
* Before that, clear the ES bit in the x87 status word if it is
* currently set, in order to avoid causing a fault in the
* upcoming load.
*
* Newer generations of AMD CPUs have CPUID_Fn80000008_EBX[2],
* which indicates that FIP/FDP/FOP are restored (same behavior
* as Intel). We're not using it though.
*/
fnstsw(&sw);
if (sw & 0x80) fnclex();
fldummy();
}
#ifdef __x86_64__
#define XS64(x) (is_64bit ? x##64 : x)
#else
#define XS64(x) x
#endif
void
fpu_area_save(void *area, uint64_t xsave_features, bool is_64bit)
{
switch (x86_fpu_save) {
case FPU_SAVE_FSAVE:
fnsave(area);
break;
case FPU_SAVE_FXSAVE:
XS64(fxsave)(area);
break;
case FPU_SAVE_XSAVE:
XS64(xsave)(area, xsave_features);
break;
case FPU_SAVE_XSAVEOPT:
XS64(xsaveopt)(area, xsave_features);
break;
}
stts();
}
void
fpu_area_restore(const void *area, uint64_t xsave_features, bool is_64bit)
{
clts();
switch (x86_fpu_save) {
case FPU_SAVE_FSAVE:
frstor(area);
break;
case FPU_SAVE_FXSAVE:
if (cpu_vendor == CPUVENDOR_AMD) fpu_errata_amd();
XS64(fxrstor)(area);
break;
case FPU_SAVE_XSAVE:
case FPU_SAVE_XSAVEOPT:
if (cpu_vendor == CPUVENDOR_AMD) fpu_errata_amd();
XS64(xrstor)(area, xsave_features);
break;
}
}
void
fpu_handle_deferred(void)
{
struct pcb *pcb = lwp_getpcb(curlwp);
fpu_area_restore(&pcb->pcb_savefpu, x86_xsave_features,
!(curlwp->l_proc->p_flag & PK_32));
}
void
fpu_switch(struct lwp *oldlwp, struct lwp *newlwp)
{
struct cpu_info *ci __diagused = curcpu();
struct pcb *pcb;
KASSERTMSG(ci->ci_ilevel >= IPL_SCHED, "cpu%d ilevel=%d",
cpu_index(ci), ci->ci_ilevel);
if (oldlwp->l_md.md_flags & MDL_FPU_IN_CPU) {
KASSERT(!(oldlwp->l_flag & LW_SYSTEM));
pcb = lwp_getpcb(oldlwp);
fpu_area_save(&pcb->pcb_savefpu, x86_xsave_features,
!(oldlwp->l_proc->p_flag & PK_32));
oldlwp->l_md.md_flags &= ~MDL_FPU_IN_CPU;
}
KASSERT(!(newlwp->l_md.md_flags & MDL_FPU_IN_CPU));
}
void
fpu_lwp_fork(struct lwp *l1, struct lwp *l2)
{
struct pcb *pcb2 = lwp_getpcb(l2);
union savefpu *fpu_save;
/* Kernel threads have no FPU. */
if (__predict_false(l2->l_flag & LW_SYSTEM)) {
return;
}
/* For init(8). */
if (__predict_false(l1->l_flag & LW_SYSTEM)) {
memset(&pcb2->pcb_savefpu, 0, x86_fpu_save_size);
return;
}
fpu_save = fpu_lwp_area(l1);
memcpy(&pcb2->pcb_savefpu, fpu_save, x86_fpu_save_size);
l2->l_md.md_flags &= ~MDL_FPU_IN_CPU;
}
void
fpu_lwp_abandon(struct lwp *l)
{
int s;
KASSERT(l == curlwp);
s = splvm();
l->l_md.md_flags &= ~MDL_FPU_IN_CPU;
stts();
splx(s);
}
/* -------------------------------------------------------------------------- */
/*
* fpu_kern_enter()
*
* Begin using the FPU. Raises to splvm, disabling most
* interrupts and rendering the thread non-preemptible; caller
* should not use this for long periods of time, and must call
* fpu_kern_leave() afterward. Non-recursive -- you cannot call
* fpu_kern_enter() again without calling fpu_kern_leave() first.
*
* Must be used only at IPL_VM or below -- never in IPL_SCHED or
* IPL_HIGH interrupt handlers.
*/
void
fpu_kern_enter(void)
{
static const union savefpu safe_fpu __aligned(64) = {
.sv_xmm = {
.fx_mxcsr = __SAFE_MXCSR__,
},
};
struct lwp *l = curlwp;
struct cpu_info *ci;
int s;
s = splvm();
ci = curcpu();
#if 0
/*
* Can't assert this because if the caller holds a spin lock at
* IPL_VM, and previously held and released a spin lock at
* higher IPL, the IPL remains raised above IPL_VM.
*/
KASSERTMSG(ci->ci_ilevel <= IPL_VM || cold, "ilevel=%d",
ci->ci_ilevel);
#endif
KASSERT(ci->ci_kfpu_spl == -1);
ci->ci_kfpu_spl = s;
/*
* If we are in a softint and have a pinned lwp, the fpu state is that
* of the pinned lwp, so save it there.
*/
while ((l->l_pflag & LP_INTR) && (l->l_switchto != NULL))
l = l->l_switchto;
fpu_save_lwp(l);
/*
* Clear CR0_TS, which fpu_save_lwp set if it saved anything --
* otherwise the CPU will trap if we try to use the FPU under
* the false impression that there has been a task switch since
* the last FPU usage requiring that we save the FPU state.
*/
clts();
/*
* Zero the FPU registers and install safe control words.
*/
fpu_area_restore(&safe_fpu, x86_xsave_features, /*is_64bit*/false);
}
/*
* fpu_kern_leave()
*
* End using the FPU after fpu_kern_enter().
*/
void
fpu_kern_leave(void)
{
static const union savefpu zero_fpu __aligned(64);
struct cpu_info *ci = curcpu();
int s;
#if 0
/*
* Can't assert this because if the caller holds a spin lock at
* IPL_VM, and previously held and released a spin lock at
* higher IPL, the IPL remains raised above IPL_VM.
*/
KASSERT(ci->ci_ilevel == IPL_VM || cold);
#endif
KASSERT(ci->ci_kfpu_spl != -1);
/*
* Zero the fpu registers; otherwise we might leak secrets
* through Spectre-class attacks to userland, even if there are
* no bugs in fpu state management.
*/
fpu_area_restore(&zero_fpu, x86_xsave_features, /*is_64bit*/false);
/*
* Set CR0_TS again so that the kernel can't accidentally use
* the FPU.
*/
stts();
s = ci->ci_kfpu_spl;
ci->ci_kfpu_spl = -1;
splx(s);
}
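/*
* A hedged usage sketch of the pair above: kernel code that needs a
* short burst of FPU/SIMD work brackets it like this, and must not
* sleep in between:
*
* fpu_kern_enter();
* ... use FPU/SIMD registers ...
* fpu_kern_leave();
*/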
/* -------------------------------------------------------------------------- */
/*
* The following table is used to ensure that the FPE_... value
* that is passed as a trapcode to the signal handler of the user
* process does not have more than one bit set.
*
* Multiple bits may be set if SSE simd instructions generate errors
* on more than one value, or if the user process modifies the control
* word while a status word bit is already set (which is a sign
* of bad coding).
* We have no choice but to narrow them down to one bit, since we must
* not send a trapcode that is not exactly one of the FPE_ macros.
*
* The mechanism has a static table with 128 entries. Each combination
* of the 7 FPU status word exception bits directly translates to a
* position in this table, where a single FPE_... value is stored.
* This FPE_... value stored there is considered the "most important"
* of the exception bits and will be sent as the signal code. The
* precedence of the bits is based upon Intel Document "Numerical
* Applications", Chapter "Special Computational Situations".
*
* The code to choose one of these values does these steps:
* 1) Throw away status word bits that cannot be masked.
* 2) Throw away the bits currently masked in the control word,
* assuming the user isn't interested in them anymore.
* 3) Reinsert status word bit 7 (stack fault) if it is set, which
* cannot be masked but must be preserved.
* 'Stack fault' is a sub-class of 'invalid operation'.
* 4) Use the remaining bits to point into the trapcode table.
*
* The 6 maskable bits in order of their preference, as stated in the
* above referenced Intel manual:
* 1 Invalid operation (FP_X_INV)
* 1a Stack underflow
* 1b Stack overflow
* 1c Operand of unsupported format
* 1d SNaN operand.
* 2 QNaN operand (not an exception, irrelevant here)
* 3 Any other invalid-operation not mentioned above or zero divide
* (FP_X_INV, FP_X_DZ)
* 4 Denormal operand (FP_X_DNML)
* 5 Numeric over/underflow (FP_X_OFL, FP_X_UFL)
* 6 Inexact result (FP_X_IMP)
*
* NB: the above seems to mix up the mxcsr error bits and the x87 ones.
* They are in the same order, but there is no EN_SW_STACK_FAULT in the mmx
* status.
*
* The table is nearly, but not quite, in bit order (ZERODIV and DENORM
* are swapped).
*
* This table assumes that any stack fault is cleared - so that an INVOP
* fault will only be reported as FLTSUB once.
* This might not happen if the mask is being changed.
*/
#define FPE_xxx1(f) (f & EN_SW_INVOP \
? (f & EN_SW_STACK_FAULT ? FPE_FLTSUB : FPE_FLTINV) \
: f & EN_SW_ZERODIV ? FPE_FLTDIV \
: f & EN_SW_DENORM ? FPE_FLTUND \
: f & EN_SW_OVERFLOW ? FPE_FLTOVF \
: f & EN_SW_UNDERFLOW ? FPE_FLTUND \
: f & EN_SW_PRECLOSS ? FPE_FLTRES \
: f & EN_SW_STACK_FAULT ? FPE_FLTSUB : 0)
#define FPE_xxx2(f) FPE_xxx1(f), FPE_xxx1((f + 1))
#define FPE_xxx4(f) FPE_xxx2(f), FPE_xxx2((f + 2))
#define FPE_xxx8(f) FPE_xxx4(f), FPE_xxx4((f + 4))
#define FPE_xxx16(f) FPE_xxx8(f), FPE_xxx8((f + 8))
#define FPE_xxx32(f) FPE_xxx16(f), FPE_xxx16((f + 16))
static const uint8_t fpetable[128] = {
FPE_xxx32(0), FPE_xxx32(32), FPE_xxx32(64), FPE_xxx32(96)
};
#undef FPE_xxx1
#undef FPE_xxx2
#undef FPE_xxx4
#undef FPE_xxx8
#undef FPE_xxx16
#undef FPE_xxx32
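/*
 * Illustrative lookups into the table above (assuming the conventional
 * x87 status-word bit assignments, i.e. that the EN_SW_* macros encode
 * IE=0x01, DE=0x02, ZE=0x04, OE=0x08, UE=0x10, PE=0x20, SF=0x40):
 *
 *	fpetable[0x04] == FPE_FLTDIV	(zero divide only)
 *	fpetable[0x05] == FPE_FLTINV	(invalid op wins over zero divide)
 *	fpetable[0x41] == FPE_FLTSUB	(invalid op with stack fault)
 */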
/*
* This is a synchronous trap on either an x87 instruction (due to an unmasked
* error on the previous x87 instruction) or on an SSE/SSE2/etc instruction due
* to an error on the instruction itself.
*
* If the trap actually generates a signal, then the fpu state is saved and
* copied onto the lwp's user-stack, and then recovered from there when the
* signal returns.
*
* All this code needs to do is save the reason for the trap. For x87 traps the
* status word bits need clearing to stop the trap re-occurring. For SSE traps
* the mxcsr bits are 'sticky' and need clearing to not confuse a later trap.
*
* We come here with interrupts disabled.
*/
void
fputrap(struct trapframe *frame)
{
uint32_t statbits;
ksiginfo_t ksi;
if (__predict_false(!USERMODE(frame->tf_cs))) {
register_t ip = X86_TF_RIP(frame);
char where[128];
#ifdef DDB
db_symstr(where, sizeof(where), (db_expr_t)ip, DB_STGY_PROC);
#else
snprintf(where, sizeof(where), "%p", (void *)ip);
#endif
panic("fpu trap from kernel at %s, trapframe %p\n", where,
frame);
}
KASSERT(curlwp->l_md.md_flags & MDL_FPU_IN_CPU);
if (frame->tf_trapno == T_XMM) {
uint32_t mxcsr;
x86_stmxcsr(&mxcsr);
statbits = mxcsr;
/* Clear the sticky status bits */
mxcsr &= ~0x3f;
x86_ldmxcsr(&mxcsr);
/* Remove masked interrupts and non-status bits */
statbits &= ~(statbits >> 7) & 0x3f;
/* Mark this as an XMM status */
statbits |= 0x10000;
} else {
uint16_t cw, sw;
/* Get current control and status words */
fnstcw(&cw);
fnstsw(&sw);
/* Clear any pending exceptions from status word */
fnclex();
/* Remove masked interrupts */
statbits = sw & ~(cw & 0x3f);
}
/* Doesn't matter now if we get pre-empted */
x86_enable_intr();
KSI_INIT_TRAP(&ksi);
ksi.ksi_signo = SIGFPE;
ksi.ksi_addr = (void *)X86_TF_RIP(frame);
ksi.ksi_code = fpetable[statbits & 0x7f];
ksi.ksi_trap = statbits;
(*curlwp->l_proc->p_emul->e_trapsignal)(curlwp, &ksi);
}
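/*
 * Worked example for the x87 branch above (a sketch; the concrete numbers
 * assume the usual x87 control/status word layout, with the low 6 bits
 * being the exception masks/flags): if the control word masks only the
 * precision exception (cw & 0x3f == 0x20) and the status word reports
 * both zero-divide and precision (sw == 0x24), then
 * statbits = 0x24 & ~0x20 = 0x04, and fpetable[0x04] yields FPE_FLTDIV
 * for ksi_code.
 */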
void
fpudna(struct trapframe *frame)
{
panic("fpudna from %s, ip %p, trapframe %p",
USERMODE(frame->tf_cs) ? "userland" : "kernel",
(void *)X86_TF_RIP(frame), frame);
}
/* -------------------------------------------------------------------------- */
static inline void
fpu_xstate_reload(union savefpu *fpu_save, uint64_t xstate)
{
/*
* Force a reload of the given xstate during the next XRSTOR.
*/
if (x86_fpu_save >= FPU_SAVE_XSAVE) {
fpu_save->sv_xsave_hdr.xsh_xstate_bv |= xstate;
}
}
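/*
 * Note: XRSTOR initializes any component whose bit is clear in the save
 * area's XSTATE_BV and loads it from memory when the bit is set, so
 * setting the bit here is what forces the (possibly modified) in-memory
 * copy to be reloaded on the next restore.
 */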
void
fpu_set_default_cw(struct lwp *l, unsigned int x87_cw)
{
union savefpu *fpu_save = fpu_lwp_area(l);
struct pcb *pcb = lwp_getpcb(l);
if (i386_use_fxsave) {
fpu_save->sv_xmm.fx_cw = x87_cw;
if (x87_cw != __INITIAL_NPXCW__) {
fpu_xstate_reload(fpu_save, XCR0_X87);
}
} else {
fpu_save->sv_87.s87_cw = x87_cw;
}
pcb->pcb_fpu_dflt_cw = x87_cw;
}
void
fpu_clear(struct lwp *l, unsigned int x87_cw)
{
union savefpu *fpu_save;
struct pcb *pcb;
KASSERT(l == curlwp);
fpu_save = fpu_lwp_area(l);
switch (x86_fpu_save) {
case FPU_SAVE_FSAVE:
memset(&fpu_save->sv_87, 0, x86_fpu_save_size);
fpu_save->sv_87.s87_tw = 0xffff;
fpu_save->sv_87.s87_cw = x87_cw;
break;
case FPU_SAVE_FXSAVE:
memset(&fpu_save->sv_xmm, 0, x86_fpu_save_size);
fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_cw = x87_cw;
break;
case FPU_SAVE_XSAVE:
case FPU_SAVE_XSAVEOPT:
memset(&fpu_save->sv_xmm, 0, x86_fpu_save_size);
fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_cw = x87_cw;
if (__predict_false(x87_cw != __INITIAL_NPXCW__)) {
fpu_xstate_reload(fpu_save, XCR0_X87);
}
break;
}
pcb = lwp_getpcb(l);
pcb->pcb_fpu_dflt_cw = x87_cw;
}
void
fpu_sigreset(struct lwp *l)
{
union savefpu *fpu_save = fpu_lwp_area(l);
struct pcb *pcb = lwp_getpcb(l);
/*
* For signal handlers the register values don't matter. Just reset
* a few fields.
*/
if (i386_use_fxsave) {
fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_tw = 0;
fpu_save->sv_xmm.fx_cw = pcb->pcb_fpu_dflt_cw;
} else {
fpu_save->sv_87.s87_tw = 0xffff;
fpu_save->sv_87.s87_cw = pcb->pcb_fpu_dflt_cw;
}
}
void
process_write_fpregs_xmm(struct lwp *l, const struct fxsave *fpregs)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (i386_use_fxsave) {
memcpy(&fpu_save->sv_xmm, fpregs, sizeof(fpu_save->sv_xmm));
/*
* Invalid bits in mxcsr or mxcsr_mask will cause faults.
*/
fpu_save->sv_xmm.fx_mxcsr_mask &= x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_mxcsr &= fpu_save->sv_xmm.fx_mxcsr_mask;
fpu_xstate_reload(fpu_save, XCR0_X87 | XCR0_SSE);
} else {
process_xmm_to_s87(fpregs, &fpu_save->sv_87);
}
}
void
process_write_fpregs_s87(struct lwp *l, const struct save87 *fpregs)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (i386_use_fxsave) {
process_s87_to_xmm(fpregs, &fpu_save->sv_xmm);
fpu_xstate_reload(fpu_save, XCR0_X87 | XCR0_SSE);
} else {
memcpy(&fpu_save->sv_87, fpregs, sizeof(fpu_save->sv_87));
}
}
void
process_read_fpregs_xmm(struct lwp *l, struct fxsave *fpregs)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (i386_use_fxsave) {
memcpy(fpregs, &fpu_save->sv_xmm, sizeof(fpu_save->sv_xmm));
} else {
memset(fpregs, 0, sizeof(*fpregs));
process_s87_to_xmm(&fpu_save->sv_87, fpregs);
}
}
void
process_read_fpregs_s87(struct lwp *l, struct save87 *fpregs)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (i386_use_fxsave) {
memset(fpregs, 0, sizeof(*fpregs));
process_xmm_to_s87(&fpu_save->sv_xmm, fpregs);
} else {
memcpy(fpregs, &fpu_save->sv_87, sizeof(fpu_save->sv_87));
}
}
int
process_read_xstate(struct lwp *l, struct xstate *xstate)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (x86_fpu_save == FPU_SAVE_FSAVE) {
/* Convert from legacy FSAVE format. */
memset(&xstate->xs_fxsave, 0, sizeof(xstate->xs_fxsave));
process_s87_to_xmm(&fpu_save->sv_87, &xstate->xs_fxsave);
/* We only got x87 data. */
xstate->xs_rfbm = XCR0_X87;
xstate->xs_xstate_bv = XCR0_X87;
return 0;
}
/* Copy the legacy area. */
memcpy(&xstate->xs_fxsave, fpu_save->sv_xsave_hdr.xsh_fxsave,
sizeof(xstate->xs_fxsave));
if (x86_fpu_save == FPU_SAVE_FXSAVE) {
/* FXSAVE means we've got x87 + SSE data. */
xstate->xs_rfbm = XCR0_X87 | XCR0_SSE;
xstate->xs_xstate_bv = XCR0_X87 | XCR0_SSE;
return 0;
}
/* Copy the bitmap indicating which states are available. */
xstate->xs_rfbm = x86_xsave_features & XCR0_FPU;
xstate->xs_xstate_bv = fpu_save->sv_xsave_hdr.xsh_xstate_bv;
KASSERT(!(xstate->xs_xstate_bv & ~xstate->xs_rfbm));
#define COPY_COMPONENT(xcr0_val, xsave_val, field) \
if (xstate->xs_xstate_bv & xcr0_val) { \
KASSERT(x86_xsave_offsets[xsave_val] \
>= sizeof(struct xsave_header)); \
KASSERT(x86_xsave_sizes[xsave_val] \
>= sizeof(xstate->field)); \
memcpy(&xstate->field, \
(char*)fpu_save + x86_xsave_offsets[xsave_val], \
sizeof(xstate->field)); \
}
COPY_COMPONENT(XCR0_YMM_Hi128, XSAVE_YMM_Hi128, xs_ymm_hi128);
COPY_COMPONENT(XCR0_Opmask, XSAVE_Opmask, xs_opmask);
COPY_COMPONENT(XCR0_ZMM_Hi256, XSAVE_ZMM_Hi256, xs_zmm_hi256);
COPY_COMPONENT(XCR0_Hi16_ZMM, XSAVE_Hi16_ZMM, xs_hi16_zmm);
#undef COPY_COMPONENT
return 0;
}
int
process_verify_xstate(const struct xstate *xstate)
{
/* xstate_bv must be a subset of RFBM */
if (xstate->xs_xstate_bv & ~xstate->xs_rfbm)
return EINVAL;
switch (x86_fpu_save) {
case FPU_SAVE_FSAVE:
if ((xstate->xs_rfbm & ~XCR0_X87))
return EINVAL;
break;
case FPU_SAVE_FXSAVE:
if ((xstate->xs_rfbm & ~(XCR0_X87 | XCR0_SSE)))
return EINVAL;
break;
default:
/* Verify that no unsupported features are enabled */
if ((xstate->xs_rfbm & ~(x86_xsave_features & XCR0_FPU)) != 0)
return EINVAL;
}
return 0;
}
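/*
 * A typical caller (illustrative only; the real callers live in the
 * ptrace/machdep code) is expected to validate before writing:
 *
 *	if ((error = process_verify_xstate(&xstate)) == 0)
 *		error = process_write_xstate(l, &xstate);
 */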
int
process_write_xstate(struct lwp *l, const struct xstate *xstate)
{
union savefpu *fpu_save = fpu_lwp_area(l);
/* Convert data into legacy FSAVE format. */
if (x86_fpu_save == FPU_SAVE_FSAVE) {
if (xstate->xs_xstate_bv & XCR0_X87)
process_xmm_to_s87(&xstate->xs_fxsave, &fpu_save->sv_87);
return 0;
}
/* If XSAVE is supported, make sure that xstate_bv is set correctly. */
if (x86_fpu_save >= FPU_SAVE_XSAVE) {
/*
* Bit-wise "xstate->xs_rfbm ? xstate->xs_xstate_bv :
* fpu_save->sv_xsave_hdr.xsh_xstate_bv"
*/
fpu_save->sv_xsave_hdr.xsh_xstate_bv =
(fpu_save->sv_xsave_hdr.xsh_xstate_bv & ~xstate->xs_rfbm) |
xstate->xs_xstate_bv;
}
if (xstate->xs_xstate_bv & XCR0_X87) {
/*
* X87 state is split into two areas, interspersed with SSE
* data.
*/
memcpy(&fpu_save->sv_xmm, &xstate->xs_fxsave, 24);
memcpy(fpu_save->sv_xmm.fx_87_ac, xstate->xs_fxsave.fx_87_ac,
sizeof(xstate->xs_fxsave.fx_87_ac));
}
/*
* Copy MXCSR if either SSE or AVX state is requested, to match the
* XSAVE behavior for those flags.
*/
if (xstate->xs_xstate_bv & (XCR0_SSE|XCR0_YMM_Hi128)) {
/*
* Invalid bits in mxcsr or mxcsr_mask will cause faults.
*/
fpu_save->sv_xmm.fx_mxcsr_mask = xstate->xs_fxsave.fx_mxcsr_mask
& x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_mxcsr = xstate->xs_fxsave.fx_mxcsr &
fpu_save->sv_xmm.fx_mxcsr_mask;
}
if (xstate->xs_xstate_bv & XCR0_SSE) {
memcpy(&fpu_save->sv_xsave_hdr.xsh_fxsave[160],
xstate->xs_fxsave.fx_xmm, sizeof(xstate->xs_fxsave.fx_xmm));
}
#define COPY_COMPONENT(xcr0_val, xsave_val, field) \
if (xstate->xs_xstate_bv & xcr0_val) { \
KASSERT(x86_xsave_offsets[xsave_val] \
>= sizeof(struct xsave_header)); \
KASSERT(x86_xsave_sizes[xsave_val] \
>= sizeof(xstate->field)); \
memcpy((char *)fpu_save + x86_xsave_offsets[xsave_val], \
&xstate->field, sizeof(xstate->field)); \
}
COPY_COMPONENT(XCR0_YMM_Hi128, XSAVE_YMM_Hi128, xs_ymm_hi128);
COPY_COMPONENT(XCR0_Opmask, XSAVE_Opmask, xs_opmask);
COPY_COMPONENT(XCR0_ZMM_Hi256, XSAVE_ZMM_Hi256, xs_zmm_hi256);
COPY_COMPONENT(XCR0_Hi16_ZMM, XSAVE_Hi16_ZMM, xs_hi16_zmm);
#undef COPY_COMPONENT
return 0;
}
/* $NetBSD: pmap.c,v 1.426 2023/10/04 20:28:06 ad Exp $ */
/*
* Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran, and by Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2007 Manuel Bouyer.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* Copyright 2001 (c) Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Frank van der Linden for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.426 2023/10/04 20:28:06 ad Exp $");
#include "opt_user_ldt.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_xen.h"
#include "opt_svs.h"
#include "opt_kaslr.h"
#include "opt_efi.h"
#define __MUTEX_PRIVATE /* for assertions */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/xcall.h>
#include <sys/kcore.h>
#include <sys/kmem.h>
#include <sys/asan.h>
#include <sys/msan.h>
#include <sys/entropy.h>
#include <uvm/uvm.h>
#include <uvm/pmap/pmap_pvt.h>
#include <dev/isa/isareg.h>
#include <machine/specialreg.h>
#include <machine/gdt.h>
#include <machine/isa_machdep.h>
#include <machine/cpuvar.h>
#include <machine/cputypes.h>
#include <machine/pmap_private.h>
#include <x86/bootspace.h>
#include <x86/pat.h>
#include <x86/pmap_pv.h>
#include <x86/i82489reg.h>
#include <x86/i82489var.h>
#ifdef XEN
#include <xen/include/public/xen.h>
#include <xen/hypervisor.h>
#include <xen/xenpmap.h>
#endif
#ifdef __HAVE_DIRECT_MAP
#include <crypto/nist_hash_drbg/nist_hash_drbg.h>
#endif
/*
* general info:
*
* - for an explanation of how the x86 MMU hardware works see
* the comments in <machine/pte.h>.
*
* - for an explanation of the general memory structure used by
* this pmap (including the recursive mapping), see the comments
* in <machine/pmap.h>.
*
* this file contains the code for the "pmap module." the module's
* job is to manage the hardware's virtual to physical address mappings.
* note that there are two levels of mapping in the VM system:
*
* [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
* to map ranges of virtual address space to objects/files. for
* example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
* to the file /bin/ls starting at offset zero." note that
* the upper layer mapping is not concerned with how individual
* vm_pages are mapped.
*
* [2] the lower layer of the VM system (the pmap) maintains the mappings
* from virtual addresses. it is concerned with which vm_page is
* mapped where. for example, when you run /bin/ls and start
* at page 0x1000 the fault routine may lookup the correct page
* of the /bin/ls file and then ask the pmap layer to establish
* a mapping for it.
*
* note that information in the lower layer of the VM system can be
* thrown away since it can easily be reconstructed from the info
* in the upper layer.
*
* data structures we use include:
*
* - struct pmap: describes the address space of one thread
* - struct pmap_page: describes one pv-tracked page, without
* necessarily a corresponding vm_page
* - struct pv_entry: describes one <PMAP,VA> mapping of a PA
* - pmap_page::pp_pvlist: there is one list per pv-tracked page of
* physical memory. the pp_pvlist points to a list of pv_entry
* structures which describe all the <PMAP,VA> pairs that this
* page is mapped in. this is critical for page based operations
* such as pmap_page_protect() [change protection on _all_ mappings
* of a page]
*/
/*
* Locking
*
* We have the following locks that we must deal with, listed in the order
* that they are acquired:
*
* pg->uobject->vmobjlock, pg->uanon->an_lock
*
* For managed pages, these per-object locks are taken by the VM system
* before calling into the pmap module - either a read or write hold.
* The lock hold prevents pages from changing identity while the pmap is
* operating on them. For example, the same lock is held across a call
* to pmap_remove() and the following call to pmap_update(), so that a
* page does not gain a new identity while its TLB visibility is stale.
*
* pmap->pm_lock
*
* This lock protects the fields in the pmap structure including the
* non-kernel PDEs in the PDP, the PTEs, the PTPs, and connected data
* structures. For modifying unmanaged kernel PTEs it is not needed as
* kernel PDEs are never freed, and the kernel is expected to be self
* consistent (and the lock can't be taken for unmanaged kernel PTEs,
* because they can be modified from interrupt context).
*
* pmaps_lock
*
* This lock protects the list of active pmaps (headed by "pmaps").
* It's acquired when adding or removing pmaps or adjusting kernel PDEs.
*
* pp_lock
*
* This per-page lock protects PV entry lists and the embedded PV entry
* in each vm_page, allowing for concurrent operation on pages by
* different pmaps. This is a spin mutex at IPL_VM, because at the
* points it is taken context switching is usually not tolerable, and
* spin mutexes must block out interrupts that could take kernel_lock.
*/
/* uvm_object is abused here to index pmap_pages; make assertions happy. */
#ifdef DIAGNOSTIC
#define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
#define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock)
#else
#define PMAP_DUMMY_LOCK(pm)
#define PMAP_DUMMY_UNLOCK(pm)
#endif
static const struct uvm_pagerops pmap_pager = {
/* nothing */
};
/*
* pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X)
*/
#define pl_i(va, lvl) \
(((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1])
#define pl_i_roundup(va, lvl) pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl))
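/*
 * Example (amd64 naming assumed): pl_i(va, 1) is the L1 (PTE) index of
 * va, pl_i(va, 4) == pl4_i(va) is its L4 slot, and pl_i_roundup(va, 2)
 * indexes the L2 entry for va rounded up to the next NBPD_L2 boundary.
 */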
/*
* PTP macros:
* a PTP's index is the PD index of the PDE that points to it
* a PTP's offset is the byte-offset in the PTE space that this PTP is at
* a PTP's VA is the first VA mapped by that PTP
*/
#define ptp_va2o(va, lvl) (pl_i(va, (lvl)+1) * PAGE_SIZE)
const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
const long nkptpmax[] = NKPTPMAX_INITIALIZER;
const long nbpd[] = NBPD_INITIALIZER;
#ifdef i386
pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
#else
pd_entry_t *normal_pdes[3];
#endif
long nkptp[] = NKPTP_INITIALIZER;
struct pmap_head pmaps;
kmutex_t pmaps_lock __cacheline_aligned;
struct pcpu_area *pcpuarea __read_mostly;
static vaddr_t pmap_maxkvaddr;
/*
* Misc. event counters.
*/
struct evcnt pmap_iobmp_evcnt;
struct evcnt pmap_ldt_evcnt;
/*
* PAT
*/
static bool cpu_pat_enabled __read_mostly = false;
/*
* Global data structures
*/
static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
static rb_tree_t pmap_kernel_rb __cacheline_aligned;
struct bootspace bootspace __read_mostly;
struct slotspace slotspace __read_mostly;
/* Set to PTE_NX if supported. */
pd_entry_t pmap_pg_nx __read_mostly = 0;
/* Set to PTE_G if supported. */
pd_entry_t pmap_pg_g __read_mostly = 0;
/* Set to true if large pages are supported. */
int pmap_largepages __read_mostly = 0;
paddr_t lowmem_rsvd __read_mostly;
paddr_t avail_start __read_mostly; /* PA of first available physical page */
paddr_t avail_end __read_mostly; /* PA of last available physical page */
#ifdef XENPV
paddr_t pmap_pa_start; /* PA of first physical page for this domain */
paddr_t pmap_pa_end; /* PA of last physical page for this domain */
#endif
#define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp)
#define PMAP_CHECK_PP(pp) \
KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
#define PAGE_ALIGNED(pp) \
__builtin_assume_aligned((void *)(pp), PAGE_SIZE)
/*
* Other data structures
*/
static pt_entry_t protection_codes[8] __read_mostly;
static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
/*
* The following two vaddr_t's are used during system startup to keep track of
* how much of the kernel's VM space we have used. Once the system is started,
* the management of the remaining kernel VM space is turned over to the
* kernel_map vm_map.
*/
static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */
static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */
#ifndef XENPV
/*
* LAPIC virtual address, and fake physical address.
*/
volatile vaddr_t local_apic_va __read_mostly;
paddr_t local_apic_pa __read_mostly;
#endif
/*
* pool that pmap structures are allocated from
*/
struct pool_cache pmap_cache;
static int pmap_ctor(void *, void *, int);
static void pmap_dtor(void *, void *);
/*
* pv_page cache
*/
static struct pool_cache pmap_pvp_cache;
#ifdef __HAVE_DIRECT_MAP
vaddr_t pmap_direct_base __read_mostly;
vaddr_t pmap_direct_end __read_mostly;
#endif
#ifndef __HAVE_DIRECT_MAP
/*
* Special VAs and the PTEs that map them
*/
static pt_entry_t *early_zero_pte;
static void pmap_vpage_cpualloc(struct cpu_info *);
#ifdef XENPV
char *early_zerop; /* also referenced from xen_locore() */
#else
static char *early_zerop;
#endif
#endif
int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
/* PDP pool and its callbacks */
static struct pool pmap_pdp_pool;
static void pmap_pdp_init(pd_entry_t *);
static void pmap_pdp_fini(pd_entry_t *);
#ifdef PAE
/* need to allocate items of 4 pages */
static void *pmap_pdp_alloc(struct pool *, int);
static void pmap_pdp_free(struct pool *, void *);
static struct pool_allocator pmap_pdp_allocator = {
.pa_alloc = pmap_pdp_alloc,
.pa_free = pmap_pdp_free,
.pa_pagesz = PAGE_SIZE * PDP_SIZE,
};
#endif
extern vaddr_t idt_vaddr;
extern paddr_t idt_paddr;
extern vaddr_t gdt_vaddr;
extern paddr_t gdt_paddr;
extern vaddr_t ldt_vaddr;
extern paddr_t ldt_paddr;
#ifdef i386
/* stuff to fix the pentium f00f bug */
extern vaddr_t pentium_idt_vaddr;
#endif
/* Array of freshly allocated PTPs, for pmap_get_ptp(). */
struct pmap_ptparray {
struct vm_page *pg[PTP_LEVELS + 1];
bool alloced[PTP_LEVELS + 1];
};
/*
* PV entries are allocated in page-sized chunks and cached per-pmap to
* avoid intense pressure on memory allocators.
*/
struct pv_page {
LIST_HEAD(, pv_entry) pvp_pves;
LIST_ENTRY(pv_page) pvp_list;
long pvp_nfree;
struct pmap *pvp_pmap;
};
#define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
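/*
 * Sizing example (hypothetical numbers): with 4 KiB pages and a 64-byte
 * struct pv_entry, PAGE_SIZE / sizeof(struct pv_entry) == 64, so
 * PVE_PER_PVP == 63; the "- 1" leaves room in the page for the
 * struct pv_page header itself.
 */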
/*
* PV tree prototypes
*/
static int pmap_compare_key(void *, const void *, const void *);
static int pmap_compare_nodes(void *, const void *, const void *);
/* Red-black tree */
static const rb_tree_ops_t pmap_rbtree_ops = {
.rbto_compare_nodes = pmap_compare_nodes,
.rbto_compare_key = pmap_compare_key,
.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
.rbto_context = NULL
};
/*
* Local prototypes
*/
#ifdef __HAVE_PCPU_AREA
static void pmap_init_pcpu(void);
#endif
#ifdef __HAVE_DIRECT_MAP
static void pmap_init_directmap(struct pmap *);
#endif
#if !defined(XENPV)
static void pmap_remap_global(void);
#endif
#ifndef XENPV
static void pmap_init_lapic(void);
static void pmap_remap_largepages(void);
#endif
static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
struct vm_page **);
static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
pd_entry_t * const *);
static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
static void pmap_freepage(struct pmap *, struct vm_page *, int);
static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
pt_entry_t *, pd_entry_t * const *);
static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
vaddr_t);
static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
vaddr_t);
static int pmap_pvp_ctor(void *, void *, int);
static void pmap_pvp_dtor(void *, void *);
static struct pv_entry *pmap_alloc_pv(struct pmap *);
static void pmap_free_pv(struct pmap *, struct pv_entry *);
static void pmap_drain_pv(struct pmap *);
static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
static void pmap_reactivate(struct pmap *);
long
pmap_resident_count(struct pmap *pmap)
{
return pmap->pm_stats.resident_count;
}
long
pmap_wired_count(struct pmap *pmap)
{
return pmap->pm_stats.wired_count;
}
/*
* p m a p h e l p e r f u n c t i o n s
*/
static inline void
pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
{
KASSERT(cold || mutex_owned(&pmap->pm_lock));
pmap->pm_stats.resident_count += resid_diff;
pmap->pm_stats.wired_count += wired_diff;
}
static inline void
pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
{
int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
pmap_stats_update(pmap, resid_diff, wired_diff);
}
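/*
 * Example: installing a present, wired PTE where the old entry was not
 * present gives resid_diff == +1 and wired_diff == +1; downgrading a
 * present, wired PTE to not-present gives -1/-1. The KASSERTs enforce
 * that a wired entry is always also present.
 */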
/*
* ptp_to_pmap: lookup pmap by ptp
*/
static inline struct pmap *
ptp_to_pmap(struct vm_page *ptp)
{
struct pmap *pmap;
if (ptp == NULL) {
return pmap_kernel();
}
pmap = (struct pmap *)ptp->uobject;
KASSERT(pmap != NULL);
KASSERT(&pmap->pm_obj[0] == ptp->uobject);
return pmap;
}
static inline struct pv_pte *
pve_to_pvpte(struct pv_entry *pve)
{
if (pve == NULL)
return NULL;
KASSERT((void *)&pve->pve_pte == (void *)pve);
return &pve->pve_pte;
}
static inline struct pv_entry *
pvpte_to_pve(struct pv_pte *pvpte)
{
struct pv_entry *pve = (void *)pvpte;
KASSERT(pve_to_pvpte(pve) == pvpte);
return pve;
}
/*
* Return true if the pmap page has an embedded PV entry.
*/
static inline bool
pv_pte_embedded(struct pmap_page *pp)
{
KASSERT(mutex_owned(&pp->pp_lock));
return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
}
/*
* pv_pte_first, pv_pte_next: PV list iterator.
*/
static inline struct pv_pte *
pv_pte_first(struct pmap_page *pp)
{
KASSERT(mutex_owned(&pp->pp_lock));
if (pv_pte_embedded(pp)) {
return &pp->pp_pte;
}
return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
}
static inline struct pv_pte *
pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
{
KASSERT(mutex_owned(&pp->pp_lock));
KASSERT(pvpte != NULL);
if (pvpte == &pp->pp_pte) {
return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
}
return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
}
static inline uint8_t
pmap_pte_to_pp_attrs(pt_entry_t pte)
{
uint8_t ret = 0;
if (pte & PTE_D)
ret |= PP_ATTRS_D;
if (pte & PTE_A)
ret |= PP_ATTRS_A;
if (pte & PTE_W)
ret |= PP_ATTRS_W;
return ret;
}
static inline pt_entry_t
pmap_pp_attrs_to_pte(uint8_t attrs)
{
pt_entry_t pte = 0;
if (attrs & PP_ATTRS_D)
pte |= PTE_D;
if (attrs & PP_ATTRS_A)
pte |= PTE_A;
if (attrs & PP_ATTRS_W)
pte |= PTE_W;
return pte;
}
/*
* pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
* of course the kernel is always loaded
*/
bool
pmap_is_curpmap(struct pmap *pmap)
{
return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
}
inline void
pmap_reference(struct pmap *pmap)
{
atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
}
/*
* rbtree: compare two nodes.
*/
static int
pmap_compare_nodes(void *context, const void *n1, const void *n2)
{
const struct pv_entry *pve1 = n1;
const struct pv_entry *pve2 = n2;
KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
return -1;
}
if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
return 1;
}
return 0;
}
/*
* rbtree: compare a node and a key.
*/
static int
pmap_compare_key(void *context, const void *n, const void *k)
{
const struct pv_entry *pve = n;
const vaddr_t key = (vaddr_t)k;
if (pve->pve_pte.pte_va < key) {
return -1;
}
if (pve->pve_pte.pte_va > key) {
return 1;
}
return 0;
}
/*
* pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
*/
static inline void
pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
{
vaddr_t *min = (vaddr_t *)&ptp->uanon;
if (va < *min) {
*min = va;
}
}
/*
* pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
*/
static inline void
pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
{
vaddr_t sclip;
if (ptp == NULL) {
return;
}
sclip = (vaddr_t)ptp->uanon;
sclip = (*startva < sclip ? sclip : *startva);
*pte += (sclip - *startva) / PAGE_SIZE;
*startva = sclip;
}
/*
* pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
*
* there are several pmaps involved. some or all of them might be the same.
*
* - the pmap given by the first argument
* our caller wants to access this pmap's PTEs.
*
* - pmap_kernel()
* the kernel pmap. note that it only contains the kernel part
* of the address space which is shared by all pmaps, i.e. any
* pmap can be used instead of pmap_kernel() for our purpose.
*
* - ci->ci_pmap
* pmap currently loaded on the cpu.
*
* - vm_map_pmap(&curproc->p_vmspace->vm_map)
* current process' pmap.
*
* => caller must lock pmap first (if not the kernel pmap)
* => must be undone with pmap_unmap_ptes before returning
* => disables kernel preemption
*/
void
pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
pd_entry_t * const **pdeppp)
{
struct pmap *curpmap;
struct cpu_info *ci;
lwp_t *l;
kpreempt_disable();
/* The kernel's pmap is always accessible. */
if (pmap == pmap_kernel()) {
*pmap2 = NULL;
*ptepp = PTE_BASE;
*pdeppp = normal_pdes;
return;
}
KASSERT(mutex_owned(&pmap->pm_lock));
l = curlwp;
ci = l->l_cpu;
curpmap = ci->ci_pmap;
if (pmap == curpmap) {
/*
* Already on the CPU: make it valid. This is very
* often the case during exit(), when we have switched
* to the kernel pmap in order to destroy a user pmap.
*/
if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
pmap_reactivate(pmap);
}
*pmap2 = NULL;
} else {
/*
* Toss current pmap from CPU and install new pmap, but keep
* a reference to the old one. Dropping the reference can
* block as it needs to take locks, so defer that to
* pmap_unmap_ptes().
*/
pmap_reference(pmap);
pmap_load1(l, pmap, curpmap);
*pmap2 = curpmap;
}
KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
#ifdef DIAGNOSTIC
pmap->pm_pctr = lwp_pctr();
#endif
*ptepp = PTE_BASE;
#if defined(XENPV) && defined(__x86_64__)
KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
*pdeppp = ci->ci_normal_pdes;
#else
*pdeppp = normal_pdes;
#endif
}
/*
* pmap_unmap_ptes: unlock the PTE mapping of "pmap"
*
* => we cannot tolerate context switches while mapped in: assert this.
* => reenables kernel preemption.
* => does not unlock pmap.
*/
void
pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
{
struct cpu_info *ci;
struct pmap *mypmap;
struct lwp *l;
KASSERT(kpreempt_disabled());
/* The kernel's pmap is always accessible. */
if (pmap == pmap_kernel()) {
kpreempt_enable();
return;
}
l = curlwp;
ci = l->l_cpu;
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(pmap->pm_pctr == lwp_pctr());
#if defined(XENPV) && defined(__x86_64__)
KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
#endif
/* If not our own pmap, mark whatever's on the CPU now as lazy. */
KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
if (ci->ci_pmap == mypmap) {
ci->ci_want_pmapload = 0;
} else {
ci->ci_want_pmapload = (mypmap != pmap_kernel());
ci->ci_tlbstate = TLBSTATE_LAZY;
}
/* Now safe to re-enable preemption. */
kpreempt_enable();
/* Toss reference to other pmap taken earlier. */
if (pmap2 != NULL) {
pmap_destroy(pmap2);
}
}
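/*
 * Illustrative usage of the pair above for a user pmap "pm" (a sketch;
 * the actual callers are elsewhere in this file):
 *
 *	struct pmap *pmap2;
 *	pd_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	mutex_enter(&pm->pm_lock);
 *	pmap_map_ptes(pm, &pmap2, &ptes, &pdes);
 *	... examine or update ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pm, pmap2);
 *	mutex_exit(&pm->pm_lock);
 */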
inline static void
pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
{
#if !defined(__x86_64__)
if (curproc == NULL || curproc->p_vmspace == NULL ||
pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
return;
if ((opte ^ npte) & PTE_X)
pmap_update_pg(va);
/*
* Executability was removed on the last executable change.
* Reset the code segment to something conservative and
* let the trap handler deal with setting the right limit.
* We can't do that because of locking constraints on the vm map.
*/
if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
struct trapframe *tf = curlwp->l_md.md_regs;
tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
pm->pm_hiexec = I386_MAX_EXE_ADDR;
}
#endif /* !defined(__x86_64__) */
}
#if !defined(__x86_64__)
/*
* Fixup the code segment to cover all potential executable mappings.
* returns 0 if no changes to the code segment were made.
*/
int
pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
{
struct vm_map_entry *ent;
struct pmap *pm = vm_map_pmap(map);
vaddr_t va = 0;
vm_map_lock_read(map);
for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
/*
* This entry has greater va than the entries before.
* We need to make it point to the last page, not past it.
*/
if (ent->protection & VM_PROT_EXECUTE)
va = trunc_page(ent->end) - PAGE_SIZE;
}
vm_map_unlock_read(map);
if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
return 0;
pm->pm_hiexec = va;
if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
} else {
tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
return 0;
}
return 1;
}
#endif /* !defined(__x86_64__) */
void
pat_init(struct cpu_info *ci)
{
#ifndef XENPV
uint64_t pat;
if (!(ci->ci_feat_val[0] & CPUID_PAT))
return;
/* We change WT to WC. Leave all other entries the default values. */
pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
wrmsr(MSR_CR_PAT, pat);
cpu_pat_enabled = true;
#endif
}
static pt_entry_t
pmap_pat_flags(u_int flags)
{
u_int cacheflags = (flags & PMAP_CACHE_MASK);
if (!cpu_pat_enabled) {
switch (cacheflags) {
case PMAP_NOCACHE:
case PMAP_NOCACHE_OVR:
/*
* Results in PGC_UCMINUS on CPUs which report
* PAT in cpuid but have PAT "disabled".
*/
return PTE_PCD;
default:
return 0;
}
}
switch (cacheflags) {
case PMAP_NOCACHE:
return PGC_UC;
case PMAP_WRITE_COMBINE:
return PGC_WC;
case PMAP_WRITE_BACK:
return PGC_WB;
case PMAP_NOCACHE_OVR:
return PGC_UCMINUS;
}
return 0;
}
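/*
 * Example (illustrative): a caller mapping a frame buffer passes
 * PMAP_WRITE_COMBINE, which on a CPU where pat_init() ran selects PGC_WC
 * and hence the WC entries programmed into the PAT above; without PAT
 * support the request falls through to 0, i.e. default write-back
 * caching.
 */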
/*
* p m a p k e n t e r f u n c t i o n s
*
* functions to quickly enter/remove pages from the kernel address
* space. pmap_kremove is exported to MI kernel. we make use of
* the recursive PTE mappings.
*/
/*
* pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
*
* => no need to lock anything, assume va is already allocated
* => should be faster than normal pmap enter function
*/
void
pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
{
pt_entry_t *pte, opte, npte;
KASSERT(!(prot & ~VM_PROT_ALL));
if (va < VM_MIN_KERNEL_ADDRESS)
pte = vtopte(va);
else
pte = kvtopte(va);
#if defined(XENPV) && defined(DOM0OPS)
if (pa < pmap_pa_start || pa >= pmap_pa_end) {
#ifdef DEBUG
printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
" outside range\n", __func__, pa, va);
#endif /* DEBUG */
npte = pa;
} else
#endif /* XENPV && DOM0OPS */
npte = pmap_pa2pte(pa);
npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
npte |= pmap_pat_flags(flags);
opte = pmap_pte_testset(pte, npte); /* zap! */
/*
* XXX: make sure we are not dealing with a large page, since the only
* large pages created are for the kernel image, and they should never
* be kentered.
*/
KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
/* This should not happen. */
printf_nolog("%s: mapping already present\n", __func__);
kpreempt_disable();
pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
kpreempt_enable();
}
}
__strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
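/*
 * Illustrative use of pmap_kenter_pa() (a sketch, not code from this
 * file): mapping one physical page uncached at a kernel va the caller
 * already owns, then flushing the deferred state:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, PMAP_NOCACHE);
 *	pmap_update(pmap_kernel());
 */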
#if defined(__x86_64__)
/*
* Change protection for a virtual address. Local for a CPU only, don't
* care about TLB shootdowns.
*
* => must be called with preemption disabled
*/
void
pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
{
pt_entry_t *pte, opte, npte;
KASSERT(kpreempt_disabled());
if (va < VM_MIN_KERNEL_ADDRESS)
pte = vtopte(va);
else
pte = kvtopte(va);
npte = opte = *pte;
if ((prot & VM_PROT_WRITE) != 0)
npte |= PTE_W;
else
npte &= ~(PTE_W|PTE_D);
if (opte != npte) {
pmap_pte_set(pte, npte);
pmap_pte_flush();
invlpg(va);
}
}
#endif /* defined(__x86_64__) */
/*
* pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
*
* => no need to lock anything
* => caller must dispose of any vm_page mapped in the va range
* => note: not an inline function
* => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
* => we assume kernel only unmaps valid addresses and thus don't bother
* checking the valid bit before doing TLB flushing
* => must be followed by call to pmap_update() before reuse of page
*/
static void
pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
{
pt_entry_t *pte, opte;
vaddr_t va, eva;
eva = sva + len;
kpreempt_disable();
for (va = sva; va < eva; va += PAGE_SIZE) {
pte = kvtopte(va);
opte = pmap_pte_testset(pte, 0); /* zap! */
if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
pmap_tlb_shootdown(pmap_kernel(), va, opte,
TLBSHOOT_KREMOVE);
}
KASSERTMSG((opte & PTE_PS) == 0,
"va %#" PRIxVADDR " is a large page", va);
KASSERTMSG((opte & PTE_PVLIST) == 0,
"va %#" PRIxVADDR " is a pv tracked page", va);
}
if (localonly) {
tlbflushg();
}
kpreempt_enable();
}
void
pmap_kremove(vaddr_t sva, vsize_t len)
{
pmap_kremove1(sva, len, false);
}
/*
* pmap_kremove_local: like pmap_kremove(), but only worry about
* TLB invalidations on the current CPU. this is only intended
* for use while writing kernel crash dumps, either after panic
* or via reboot -d.
*/
void
pmap_kremove_local(vaddr_t sva, vsize_t len)
{
pmap_kremove1(sva, len, true);
}
/*
* p m a p i n i t f u n c t i o n s
*
* pmap_bootstrap and pmap_init are called during system startup
* to init the pmap module. pmap_bootstrap() does a low level
* init just to get things rolling. pmap_init() finishes the job.
*/
/*
* pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
* This function is to be used before any VM system has been set up.
*
* The va is taken from virtual_avail.
*/
static vaddr_t
pmap_bootstrap_valloc(size_t npages)
{
vaddr_t va = virtual_avail;
virtual_avail += npages * PAGE_SIZE;
return va;
}
/*
* pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
* This function is to be used before any VM system has been set up.
*
* The pa is taken from avail_start.
*/
static paddr_t
pmap_bootstrap_palloc(size_t npages)
{
paddr_t pa = avail_start;
avail_start += npages * PAGE_SIZE;
return pa;
}
/*
* pmap_bootstrap: get the system in a state where it can run with VM properly
* enabled (called before main()). The VM system is fully init'd later.
*
* => on i386, locore.S has already enabled the MMU by allocating a PDP for the
* kernel, and nkpde PTP's for the kernel.
* => kva_start is the first free virtual address in kernel space.
*/
void
pmap_bootstrap(vaddr_t kva_start)
{
struct pmap *kpm;
int i;
vaddr_t kva;
pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
/*
* Set up our local static global vars that keep track of the usage of
* KVM before kernel_map is set up.
*/
virtual_avail = kva_start; /* first free KVA */
virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */
/*
* Set up protection_codes: we need to be able to convert from a MI
* protection code (some combo of VM_PROT...) to something we can jam
* into a x86 PTE.
*/
protection_codes[VM_PROT_NONE] = pmap_pg_nx;
protection_codes[VM_PROT_EXECUTE] = PTE_X;
protection_codes[VM_PROT_READ] = pmap_pg_nx;
protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
/*
* Now we init the kernel's pmap.
*
* The kernel pmap's pm_obj is not used for much. However, in user pmaps
* the pm_obj contains the list of active PTPs.
*/
kpm = pmap_kernel();
mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
rw_init(&kpm->pm_dummy_lock);
for (i = 0; i < PTP_LEVELS - 1; i++) {
uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
kpm->pm_ptphint[i] = NULL;
}
memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */
kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
for (i = 0; i < PDP_SIZE; i++)
kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
kcpuset_create(&kpm->pm_cpus, true);
kcpuset_create(&kpm->pm_kernel_cpus, true);
kpm->pm_ldt = NULL;
kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
/*
* the above is just a rough estimate and not critical to the proper
* operation of the system.
*/
#if !defined(XENPV)
/*
* Begin to enable global TLB entries if they are supported: add PTE_G
* attribute to already mapped kernel pages. Do that only if SVS is
* disabled.
*
* The G bit has no effect until the CR4_PGE bit is set in CR4, which
* happens later in cpu_init().
*/
#ifdef SVS
if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
#else
if (cpu_feature[0] & CPUID_PGE) {
#endif
pmap_pg_g = PTE_G;
pmap_remap_global();
}
#endif
#ifndef XENPV
/*
* Enable large pages if they are supported.
*/
if (cpu_feature[0] & CPUID_PSE) {
lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */
pmap_largepages = 1; /* enable software */
/*
* The TLB must be flushed after enabling large pages on Pentium
* CPUs, according to section 3.6.2.2 of "Intel Architecture
* Software Developer's Manual, Volume 3: System Programming".
*/
tlbflushg();
/* Remap the kernel. */
pmap_remap_largepages();
}
pmap_init_lapic();
#endif /* !XENPV */
#ifdef __HAVE_PCPU_AREA
pmap_init_pcpu();
#endif
#ifdef __HAVE_DIRECT_MAP
pmap_init_directmap(kpm);
#else
pmap_vpage_cpualloc(&cpu_info_primary);
if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
} else { /* amd64 */
/*
* zero_pte is stuck at the end of mapped space for the kernel
* image (disjunct from kva space). This is done so that it
* can safely be used in pmap_growkernel (pmap_get_physpage),
* when it's called for the first time.
* XXXfvdl fix this for MULTIPROCESSOR later.
*/
#ifdef XENPV
/* early_zerop initialized in xen_locore() */
#else
early_zerop = (void *)bootspace.spareva;
#endif
early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
}
#endif
#if defined(XENPV) && defined(__x86_64__)
extern vaddr_t xen_dummy_page;
paddr_t xen_dummy_user_pgd;
/*
* We want a dummy page directory for Xen: when deactivating a pmap,
* Xen will still consider it active. So we set user PGD to this one
* to lift all protection on the now inactive page tables set.
*/
xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
/* Zero fill it, the less checks in Xen it requires the better */
memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
/* Mark read-only */
HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
UVMF_INVLPG);
/* Pin as L4 */
xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
#endif
/*
* Allocate space for the Interrupt Descriptor Table (IDT),
* Global Descriptor Table (GDT), and Local Descriptor Table
* (LDT).
*
* Currently there is an initial temporary GDT allocated on the
* stack by the caller of init386/init_x86_64, which is (among
* other things) needed on i386 for %fs-relative addressing for
* CPU-local data (CPUVAR(...), curcpu(), curlwp). This
* initial temporary GDT will be popped off the stack before we
* can enter main, so we need to make sure there is space for a
* second temporary GDT to continue existing when we enter main
* before we allocate space for the permanent GDT with
* uvm_km(9) in gdt_init via cpu_startup and switch to that.
*/
idt_vaddr = pmap_bootstrap_valloc(1);
idt_paddr = pmap_bootstrap_palloc(1);
gdt_vaddr = pmap_bootstrap_valloc(1);
gdt_paddr = pmap_bootstrap_palloc(1);
#ifdef __HAVE_PCPU_AREA
ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
#else
ldt_vaddr = pmap_bootstrap_valloc(1);
#endif
ldt_paddr = pmap_bootstrap_palloc(1);
#if !defined(__x86_64__)
/* pentium f00f bug stuff */
pentium_idt_vaddr = pmap_bootstrap_valloc(1);
#endif
#if defined(XENPVHVM)
/* XXX: move to hypervisor.c with appropriate API adjustments */
extern paddr_t HYPERVISOR_shared_info_pa;
extern volatile struct xencons_interface *xencons_interface; /* XXX */
extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
if (vm_guest != VM_GUEST_XENPVH) {
HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
}
xencons_interface = (void *) pmap_bootstrap_valloc(1);
xenstore_interface = (void *) pmap_bootstrap_valloc(1);
#endif
/*
* Now we reserve some VM for mapping pages when doing a crash dump.
*/
virtual_avail = reserve_dumppages(virtual_avail);
/*
* Init the global lock and global list.
*/
mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&pmaps);
/*
* Ensure the TLB is sync'd with reality by flushing it...
*/
tlbflushg();
/*
* Calculate pmap_maxkvaddr from nkptp[].
*/
kva = VM_MIN_KERNEL_ADDRESS;
for (i = PTP_LEVELS - 1; i >= 1; i--) {
kva += nkptp[i] * nbpd[i];
}
pmap_maxkvaddr = kva;
}
#ifndef XENPV
static void
pmap_init_lapic(void)
{
/*
* On CPUs that have no LAPIC, local_apic_va is never kentered. But our
* x86 implementation relies a lot on this address to be valid; so just
* allocate a fake physical page that will be kentered into
* local_apic_va by machdep.
*
* If the LAPIC is present, the va will be remapped somewhere else
* later in lapic_map.
*/
local_apic_va = pmap_bootstrap_valloc(1);
local_apic_pa = pmap_bootstrap_palloc(1);
}
#endif
#ifdef __x86_64__
static size_t
pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
{
size_t npages;
npages = (roundup(endva, pgsz) / pgsz) -
(rounddown(startva, pgsz) / pgsz);
return npages;
}
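/*
 * Example: a range that starts just below and ends just above a single
 * pgsz boundary is rounded into adjacent slots and the function returns
 * 2; a range contained entirely within one slot returns 1.
 */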
#endif
#if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
static inline void
slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
{
size_t sslot = slotspace.area[type].sslot;
size_t nslot = slotspace.area[type].nslot;
memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
}
#endif
#ifdef __x86_64__
/*
* Randomize the location of an area. We count the holes in the VM space. We
* randomly select one hole, and then randomly select an area within that hole.
* Finally we update the associated entry in the slotspace structure.
*/
vaddr_t
slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
vaddr_t randva)
{
struct {
int start;
int end;
} holes[SLSPACE_NAREAS+1];
size_t i, nholes, hole;
size_t startsl, endsl, nslots, winsize;
vaddr_t startva, va;
sz = roundup(sz, align);
/*
* Take one more slot with +NBPD_L4, because we may end up choosing
* an area that crosses slots:
* +------+------+------+
* | Slot | Slot | Slot |
* +------+------+------+
* [Chosen Area]
* And in that case we must take into account the additional slot
* consumed.
*/
nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
/* Get the holes. */
nholes = 0;
size_t curslot = 0 + 256; /* end of SLAREA_USER */
while (1) {
/*
* Find the first occupied slot after the current one.
* The area between the two is a hole.
*/
size_t minsslot = 512;
size_t minnslot = 0;
for (i = 0; i < SLSPACE_NAREAS; i++) {
if (!slotspace.area[i].active)
continue;
if (slotspace.area[i].sslot >= curslot &&
slotspace.area[i].sslot < minsslot) {
minsslot = slotspace.area[i].sslot;
minnslot = slotspace.area[i].nslot;
}
}
/* No hole anymore, stop here. */
if (minsslot == 512) {
break;
}
/* Register the hole. */
if (minsslot - curslot >= nslots) {
holes[nholes].start = curslot;
holes[nholes].end = minsslot;
nholes++;
}
/* Skip that hole, and iterate again. */
curslot = minsslot + minnslot;
}
if (nholes == 0) {
panic("%s: impossible", __func__);
}
/* Select a hole. */
hole = randhole;
#ifdef NO_X86_ASLR
hole = 0;
#endif
hole %= nholes;
startsl = holes[hole].start;
endsl = holes[hole].end;
startva = VA_SIGN_NEG(startsl * NBPD_L4);
/* Select an area within the hole. */
va = randva;
#ifdef NO_X86_ASLR
va = 0;
#endif
winsize = ((endsl - startsl) * NBPD_L4) - sz;
va %= winsize;
va = rounddown(va, align);
va += startva;
/* Update the entry. */
slotspace.area[type].sslot = pl4_i(va);
slotspace.area[type].nslot =
pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
slotspace.area[type].active = true;
return va;
}
#endif
#ifdef __HAVE_PCPU_AREA
static void
pmap_init_pcpu(void)
{
const vaddr_t startva = PMAP_PCPU_BASE;
size_t nL4e, nL3e, nL2e, nL1e;
size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
paddr_t pa;
vaddr_t endva;
vaddr_t tmpva;
pt_entry_t *pte;
size_t size;
int i;
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
size = sizeof(struct pcpu_area);
endva = startva + size;
/* We will use this temporary va. */
tmpva = bootspace.spareva;
pte = PTE_BASE + pl1_i(tmpva);
/* Build L4 */
L4e_idx = pl4_i(startva);
nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
KASSERT(nL4e == 1);
for (i = 0; i < nL4e; i++) {
KASSERT(L4_BASE[L4e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L3 */
L3e_idx = pl3_i(startva);
nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
for (i = 0; i < nL3e; i++) {
KASSERT(L3_BASE[L3e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L2 */
L2e_idx = pl2_i(startva);
nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
for (i = 0; i < nL2e; i++) {
KASSERT(L2_BASE[L2e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L1 */
L1e_idx = pl1_i(startva);
nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
for (i = 0; i < nL1e; i++) {
/*
* Nothing to do, the PTEs will be entered via
* pmap_kenter_pa.
*/
KASSERT(L1_BASE[L1e_idx+i] == 0);
}
*pte = 0;
pmap_update_pg(tmpva);
pcpuarea = (struct pcpu_area *)startva;
tlbflush();
}
#endif
#ifdef __HAVE_DIRECT_MAP
static void
randomize_hole(size_t *randholep, vaddr_t *randvap)
{
struct nist_hash_drbg drbg;
uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES];
const char p[] = "x86/directmap";
int error;
entropy_extract(seed, sizeof(seed), 0);
error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed),
/*nonce*/NULL, 0,
/*personalization*/p, strlen(p));
KASSERTMSG(error == 0, "error=%d", error);
error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep),
/*additional*/NULL, 0);
KASSERTMSG(error == 0, "error=%d", error);
error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap),
/*additional*/NULL, 0);
KASSERTMSG(error == 0, "error=%d", error);
explicit_memset(seed, 0, sizeof(seed));
explicit_memset(&drbg, 0, sizeof(drbg));
}
/*
* Create the amd64 direct map. Called only once at boot time. We map all of
* the physical memory contiguously using 2MB large pages, with RW permissions.
* However there is a hole: the kernel is mapped with RO permissions.
*/
static void
pmap_init_directmap(struct pmap *kpm)
{
extern phys_ram_seg_t mem_clusters[];
extern int mem_cluster_cnt;
vaddr_t startva;
size_t nL4e, nL3e, nL2e;
size_t L4e_idx, L3e_idx, L2e_idx;
size_t spahole, epahole;
paddr_t lastpa, pa;
vaddr_t endva;
vaddr_t tmpva;
pt_entry_t *pte;
phys_ram_seg_t *mc;
int i;
size_t randhole;
vaddr_t randva;
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
spahole = roundup(bootspace.head.pa, NBPD_L2);
epahole = rounddown(bootspace.boot.pa, NBPD_L2);
/* Get the last physical address available */
lastpa = 0;
for (i = 0; i < mem_cluster_cnt; i++) {
mc = &mem_clusters[i];
lastpa = MAX(lastpa, mc->start + mc->size);
}
/*
* x86_add_cluster should have truncated the memory to MAXPHYSMEM.
*/
if (lastpa > MAXPHYSMEM) {
panic("pmap_init_directmap: lastpa incorrect");
}
randomize_hole(&randhole, &randva);
startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
randhole, randva);
endva = startva + lastpa;
/* We will use this temporary va. */
tmpva = bootspace.spareva;
pte = PTE_BASE + pl1_i(tmpva);
/* Build L4 */
L4e_idx = pl4_i(startva);
nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
KASSERT(nL4e <= NL4_SLOT_DIRECT);
for (i = 0; i < nL4e; i++) {
KASSERT(L4_BASE[L4e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L3 */
L3e_idx = pl3_i(startva);
nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
for (i = 0; i < nL3e; i++) {
KASSERT(L3_BASE[L3e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L2 */
L2e_idx = pl2_i(startva);
nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
for (i = 0; i < nL2e; i++) {
KASSERT(L2_BASE[L2e_idx+i] == 0);
pa = (paddr_t)(i * NBPD_L2);
if (spahole <= pa && pa < epahole) {
L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
PTE_PS | pmap_pg_g;
} else {
L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
PTE_PS | pmap_pg_g;
}
}
*pte = 0;
pmap_update_pg(tmpva);
pmap_direct_base = startva;
pmap_direct_end = endva;
tlbflush();
}
#endif /* __HAVE_DIRECT_MAP */
#if !defined(XENPV)
/*
* Remap all of the virtual pages created so far with the PTE_G bit.
*/
static void
pmap_remap_global(void)
{
vaddr_t kva, kva_end;
unsigned long p1i;
size_t i;
/* head */
kva = bootspace.head.va;
kva_end = kva + bootspace.head.sz;
for ( ; kva < kva_end; kva += PAGE_SIZE) {
p1i = pl1_i(kva);
if (pmap_valid_entry(PTE_BASE[p1i]))
PTE_BASE[p1i] |= pmap_pg_g;
}
/* kernel segments */
for (i = 0; i < BTSPACE_NSEGS; i++) {
if (bootspace.segs[i].type == BTSEG_NONE) {
continue;
}
kva = bootspace.segs[i].va;
kva_end = kva + bootspace.segs[i].sz;
for ( ; kva < kva_end; kva += PAGE_SIZE) {
p1i = pl1_i(kva);
if (pmap_valid_entry(PTE_BASE[p1i]))
PTE_BASE[p1i] |= pmap_pg_g;
}
}
/* boot space */
kva = bootspace.boot.va;
kva_end = kva + bootspace.boot.sz;
for ( ; kva < kva_end; kva += PAGE_SIZE) {
p1i = pl1_i(kva);
if (pmap_valid_entry(PTE_BASE[p1i]))
PTE_BASE[p1i] |= pmap_pg_g;
}
}
#endif
#ifndef XENPV
/*
* Remap several kernel segments with large pages. We cover as many pages as we
* can. Called only once at boot time, if the CPU supports large pages.
*/
static void
pmap_remap_largepages(void)
{
pd_entry_t *pde;
vaddr_t kva, kva_end;
paddr_t pa;
size_t i;
/* Remap the kernel text using large pages. */
for (i = 0; i < BTSPACE_NSEGS; i++) {
if (bootspace.segs[i].type != BTSEG_TEXT) {
continue;
}
kva = roundup(bootspace.segs[i].va, NBPD_L2);
if (kva < bootspace.segs[i].va) {
continue;
}
kva_end = rounddown(bootspace.segs[i].va +
bootspace.segs[i].sz, NBPD_L2);
pa = roundup(bootspace.segs[i].pa, NBPD_L2);
for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
pde = &L2_BASE[pl2_i(kva)];
*pde = pa | pmap_pg_g | PTE_PS | PTE_P;
tlbflushg();
}
}
/* Remap the kernel rodata using large pages. */
for (i = 0; i < BTSPACE_NSEGS; i++) {
if (bootspace.segs[i].type != BTSEG_RODATA) {
continue;
}
kva = roundup(bootspace.segs[i].va, NBPD_L2);
if (kva < bootspace.segs[i].va) {
continue;
}
kva_end = rounddown(bootspace.segs[i].va +
bootspace.segs[i].sz, NBPD_L2);
pa = roundup(bootspace.segs[i].pa, NBPD_L2);
for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
pde = &L2_BASE[pl2_i(kva)];
*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
tlbflushg();
}
}
/* Remap the kernel data+bss using large pages. */
for (i = 0; i < BTSPACE_NSEGS; i++) {
if (bootspace.segs[i].type != BTSEG_DATA) {
continue;
}
kva = roundup(bootspace.segs[i].va, NBPD_L2);
if (kva < bootspace.segs[i].va) {
continue;
}
kva_end = rounddown(bootspace.segs[i].va +
bootspace.segs[i].sz, NBPD_L2);
pa = roundup(bootspace.segs[i].pa, NBPD_L2);
for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
pde = &L2_BASE[pl2_i(kva)];
*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
tlbflushg();
}
}
}
#endif /* !XENPV */
/*
* pmap_init: called from uvm_init, our job is to get the pmap system ready
* to manage mappings.
*/
void
pmap_init(void)
{
int flags;
/*
* initialize caches.
*/
pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
#ifdef XENPV
/*
* pool_cache(9) should not touch cached objects, since they
* are pinned on xen and R/O for the domU
*/
flags = PR_NOTOUCH;
#else
flags = 0;
#endif
#ifdef PAE
pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
"pdppl", &pmap_pdp_allocator, IPL_NONE);
#else
pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
"pdppl", NULL, IPL_NONE);
#endif
pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
0, 0, "pvpage", &pool_allocator_kmem,
IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
pmap_tlb_init();
/* XXX: Since cpu_hatch() is only for secondary CPUs. */
pmap_tlb_cpu_init(curcpu());
evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
NULL, "x86", "io bitmap copy");
evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
NULL, "x86", "ldt sync");
/*
* The kernel doesn't keep track of PTPs, so there's nowhere handy
* to hang a tree of pv_entry records. Dynamically allocated
* pv_entry lists are not heavily used in the kernel's pmap (the
* usual case is embedded), so cop out and use a single RB tree
* to cover them.
*/
rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
/*
* done: pmap module is up (and ready for business)
*/
pmap_initialized = true;
}
#ifndef XENPV
/*
* pmap_cpu_init_late: perform late per-CPU initialization.
*/
void
pmap_cpu_init_late(struct cpu_info *ci)
{
/*
* The BP has already its own PD page allocated during early
* MD startup.
*/
if (ci == &cpu_info_primary)
return;
#ifdef PAE
cpu_alloc_l3_page(ci);
#endif
}
#endif
#ifndef __HAVE_DIRECT_MAP
CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
static void
pmap_vpage_cpualloc(struct cpu_info *ci)
{
bool primary = (ci == &cpu_info_primary);
size_t i, npages;
vaddr_t vabase;
vsize_t vrange;
npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
KASSERT(npages >= VPAGE_MAX);
vrange = npages * PAGE_SIZE;
if (primary) {
while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
/* Waste some pages to align properly */
}
/* The base is aligned, allocate the rest (contiguous) */
pmap_bootstrap_valloc(npages - 1);
} else {
vabase = uvm_km_alloc(kernel_map, vrange, vrange,
UVM_KMF_VAONLY);
if (vabase == 0) {
panic("%s: failed to allocate tmp VA for CPU %d\n",
__func__, cpu_index(ci));
}
}
KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
for (i = 0; i < VPAGE_MAX; i++) {
ci->vpage[i] = vabase + i * PAGE_SIZE;
ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
}
}
void
pmap_vpage_cpu_init(struct cpu_info *ci)
{
if (ci == &cpu_info_primary) {
/* cpu0 already taken care of in pmap_bootstrap */
return;
}
pmap_vpage_cpualloc(ci);
}
#endif
/*
* p v _ e n t r y f u n c t i o n s
*/
/*
* pmap_pvp_ctor: pool_cache constructor for PV pages.
*/
static int
pmap_pvp_ctor(void *arg, void *obj, int flags)
{
struct pv_page *pvp = (struct pv_page *)obj;
struct pv_entry *pve = (struct pv_entry *)obj + 1;
struct pv_entry *maxpve = pve + PVE_PER_PVP;
KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
LIST_INIT(&pvp->pvp_pves);
pvp->pvp_nfree = PVE_PER_PVP;
pvp->pvp_pmap = NULL;
for (; pve < maxpve; pve++) {
LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
}
return 0;
}
/*
* pmap_pvp_dtor: pool_cache destructor for PV pages.
*/
static void
pmap_pvp_dtor(void *arg, void *obj)
{
struct pv_page *pvp __diagused = obj;
KASSERT(pvp->pvp_pmap == NULL);
KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
}
/*
* pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
*/
static struct pv_entry *
pmap_alloc_pv(struct pmap *pmap)
{
struct pv_entry *pve;
struct pv_page *pvp;
KASSERT(mutex_owned(&pmap->pm_lock));
if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
LIST_REMOVE(pvp, pvp_list);
} else {
pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
}
if (__predict_false(pvp == NULL)) {
return NULL;
}
/* full -> part */
LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
pvp->pvp_pmap = pmap;
}
	KASSERT(pvp->pvp_pmap == pmap);
	KASSERT(pvp->pvp_nfree > 0);
pve = LIST_FIRST(&pvp->pvp_pves);
LIST_REMOVE(pve, pve_list);
pvp->pvp_nfree--;
if (__predict_false(pvp->pvp_nfree == 0)) {
/* part -> empty */
		KASSERT(LIST_EMPTY(&pvp->pvp_pves));
		LIST_REMOVE(pvp, pvp_list);
		LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
} else {
KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
}
return pve;
}
/*
* pmap_free_pv: delayed free of a PV entry.
*/
static void
pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
{
struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(pvp->pvp_pmap == pmap);
KASSERT(pvp->pvp_nfree >= 0);
LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
pvp->pvp_nfree++;
if (__predict_false(pvp->pvp_nfree == 1)) {
/* empty -> part */
LIST_REMOVE(pvp, pvp_list);
LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
} else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
/* part -> full */
LIST_REMOVE(pvp, pvp_list);
LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
}
}
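/*
 * Taken together, pmap_alloc_pv() and pmap_free_pv() move PV pages
 * between the per-pmap lists according to how many entries are still
 * free on the page (a restatement of the transitions coded above):
 *
 *	pm_pvp_full	all PVE_PER_PVP entries free (drained to the cache)
 *	pm_pvp_part	some entries free (allocations come from here first)
 *	pm_pvp_empty	no entries free
 *
 *	alloc: full/new -> part, then part -> empty when the last entry goes
 *	free:  empty -> part, then part -> full when all entries are back
 */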
/*
* pmap_drain_pv: free full PV pages.
*/
static void
pmap_drain_pv(struct pmap *pmap)
{
struct pv_page *pvp;
	KASSERT(mutex_owned(&pmap->pm_lock));
	while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
		LIST_REMOVE(pvp, pvp_list);
		KASSERT(pvp->pvp_pmap == pmap);
		KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
pvp->pvp_pmap = NULL;
pool_cache_put(&pmap_pvp_cache, pvp);
}
}
/*
* pmap_check_pv: verify that the {VA, PTP} pair is, or is not, tracked by
* the page, as the caller expects
*/
static void
pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
vaddr_t va, bool tracked)
{
#ifdef DEBUG
struct pv_pte *pvpte;
PMAP_CHECK_PP(pp);
mutex_spin_enter(&pp->pp_lock);
	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
		if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
break;
}
}
mutex_spin_exit(&pp->pp_lock);
	if (pvpte && !tracked) {
		panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
	} else if (!pvpte && tracked) {
		panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
}
#endif
}
/*
* pmap_treelookup_pv: search the PV tree for a dynamic entry
*
* => pmap must be locked
*/
static struct pv_entry *
pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
const rb_tree_t *tree, const vaddr_t va)
{
struct pv_entry *pve;
rb_node_t *node;
/*
* Inlined lookup tailored for exactly what's needed here that is
* quite a bit faster than using rb_tree_find_node().
*/
for (node = tree->rbt_root;;) {
if (__predict_false(RB_SENTINEL_P(node))) {
return NULL;
}
pve = (struct pv_entry *)
((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
if (pve->pve_pte.pte_va == va) {
KASSERT(pve->pve_pte.pte_ptp == ptp);
return pve;
}
node = node->rb_nodes[pve->pve_pte.pte_va < va];
}
}
/*
* pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
*
* => a PV entry must be known present (doesn't check for existence)
* => pmap must be locked
*/
static struct pv_entry *
pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
const struct pmap_page * const old_pp, const vaddr_t va)
{
struct pv_entry *pve;
const rb_tree_t *tree;
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(ptp != NULL || pmap == pmap_kernel());
/*
* [This mostly deals with the case of process-private pages, i.e.
* anonymous memory allocations or COW.]
*
* If the page is tracked with an embedded entry then the tree
* lookup can be avoided. It's safe to check for this specific
* set of values without pp_lock because both will only ever be
* set together for this pmap.
*
*/
	if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
	    atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
return NULL;
}
/*
* [This mostly deals with shared mappings, for example shared libs
* and executables.]
*
* Optimise for pmap_remove_ptes() which works by ascending scan:
* look at the lowest numbered node in the tree first. The tree is
* known non-empty because of the check above. For short lived
* processes where pmap_remove() isn't used much this gets close to
* a 100% hit rate.
*/
tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
KASSERT(!RB_SENTINEL_P(tree->rbt_root));
pve = (struct pv_entry *)
((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
offsetof(struct pv_entry, pve_rb));
if (__predict_true(pve->pve_pte.pte_va == va)) {
KASSERT(pve->pve_pte.pte_ptp == ptp);
return pve;
}
/* Search the RB tree for the key (uncommon). */
return pmap_treelookup_pv(pmap, ptp, tree, va);
}
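/*
 * To summarize the two tracking schemes used here: the first mapping of a
 * page can be recorded directly in the page's pmap_page ("embedded",
 * pp_pte), which costs nothing to allocate; any further mappings of the
 * same page need a dynamically allocated pv_entry, linked on the page's
 * pp_pvlist and keyed by {PTP, VA} in the owning PTP's rb tree (or in
 * pmap_kernel_rb for kernel mappings, which have no PTP).
 */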
/*
* pmap_enter_pv: enter a mapping onto a pmap_page list
*
* => pmap must be locked
* => does NOT insert dynamic entries to tree (pmap_enter() does later)
*/
static int
pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
bool *samepage, bool *new_embedded, rb_tree_t *tree)
{
struct pv_entry *pve;
int error;
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(ptp_to_pmap(ptp) == pmap);
	KASSERT(ptp == NULL || ptp->uobject != NULL);
	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
	PMAP_CHECK_PP(pp);
/*
* If entering the same page and it's already tracked with an
* embedded entry, we can avoid the expense below. It's safe
* to check for this very specific set of values without a lock
* because both will only ever be set together for this pmap.
*/
	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
*samepage = true;
pmap_check_pv(pmap, ptp, pp, va, true);
return 0;
}
/*
* Check for an existing dynamic mapping at this address. If it's
* for the same page, then it will be reused and nothing needs to be
* changed.
*/
	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
*samepage = true;
pmap_check_pv(pmap, ptp, pp, va, true);
return 0;
}
/*
* Need to put a new mapping in place. Grab a spare pv_entry in
* case it's needed; won't know for sure until the lock is taken.
*/
	if (pmap->pm_pve == NULL) {
		pmap->pm_pve = pmap_alloc_pv(pmap);
}
error = 0;
pmap_check_pv(pmap, ptp, pp, va, false);
mutex_spin_enter(&pp->pp_lock);
if (!pv_pte_embedded(pp)) {
/*
* Embedded PV tracking available - easy.
*/
pp->pp_pte.pte_ptp = ptp;
pp->pp_pte.pte_va = va;
*new_embedded = true;
} else if (__predict_false(pmap->pm_pve == NULL)) {
/*
* No memory.
*/
error = ENOMEM;
} else {
/*
* Install new pv_entry on the page.
*/
pve = pmap->pm_pve;
pmap->pm_pve = NULL;
*new_pve = pve;
pve->pve_pte.pte_ptp = ptp;
pve->pve_pte.pte_va = va;
pve->pve_pp = pp;
LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
}
mutex_spin_exit(&pp->pp_lock);
if (error == 0) {
pmap_check_pv(pmap, ptp, pp, va, true);
}
return error;
}
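/*
 * A typical caller uses the above roughly as sketched here; the precise
 * sequence lives in pmap_enter(), which inserts any new dynamic entry
 * into the rb tree only once the PTE has actually been written:
 *
 *	error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, &old_pve,
 *	    &samepage, &new_embedded, tree);
 *	if (error != 0)
 *		back out and fail (or wait and retry);
 *	install the PTE;
 *	if (new_pve != NULL)
 *		rb_tree_insert_node(tree, new_pve);
 */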
/*
* pmap_remove_pv: try to remove a mapping from a pv_list
*
* => pmap must be locked
* => removes dynamic entries from tree and frees them
* => caller should adjust ptp's wire_count and free PTP if needed
*/
static void
pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
{
rb_tree_t *tree = (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(ptp_to_pmap(ptp) == pmap);
	KASSERT(ptp == NULL || ptp->uobject != NULL);
	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
	KASSERT(ptp != NULL || pmap == pmap_kernel());
pmap_check_pv(pmap, ptp, pp, va, true);
if (pve == NULL) {
mutex_spin_enter(&pp->pp_lock);
		KASSERT(pp->pp_pte.pte_ptp == ptp);
		KASSERT(pp->pp_pte.pte_va == va);
pp->pp_attrs |= oattrs;
pp->pp_pte.pte_ptp = NULL;
pp->pp_pte.pte_va = 0;
mutex_spin_exit(&pp->pp_lock);
} else {
mutex_spin_enter(&pp->pp_lock);
KASSERT(pp->pp_pte.pte_ptp != ptp ||
pp->pp_pte.pte_va != va);
		KASSERT(pve->pve_pte.pte_ptp == ptp);
		KASSERT(pve->pve_pte.pte_va == va);
		KASSERT(pve->pve_pp == pp);
pp->pp_attrs |= oattrs;
LIST_REMOVE(pve, pve_list);
mutex_spin_exit(&pp->pp_lock);
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
rb_tree_remove_node(tree, pve);
#ifdef DIAGNOSTIC
memset(pve, 0, sizeof(*pve));
#endif
pmap_free_pv(pmap, pve);
}
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
pmap_check_pv(pmap, ptp, pp, va, false);
}
/*
* p t p f u n c t i o n s
*/
static struct vm_page *
pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
{
int lidx = level - 1;
off_t off = ptp_va2o(va, level);
struct vm_page *pg;
	KASSERT(mutex_owned(&pmap->pm_lock));
	if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
		KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
pg = pmap->pm_ptphint[lidx];
PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
return pg;
}
PMAP_DUMMY_LOCK(pmap);
pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
PMAP_DUMMY_UNLOCK(pmap);
if (pg != NULL && __predict_false(pg->wire_count == 0)) {
/* This page is queued to be freed - ignore. */
pg = NULL;
}
if (pg != NULL) {
PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
}
pmap->pm_ptphint[lidx] = pg;
return pg;
}
static inline void
pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
{
int lidx;
	KASSERT(ptp->wire_count <= 1);
	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
lidx = level - 1;
	pmap_stats_update(pmap, -ptp->wire_count, 0);
	if (pmap->pm_ptphint[lidx] == ptp)
		pmap->pm_ptphint[lidx] = NULL;
ptp->wire_count = 0;
ptp->uanon = NULL;
KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
/*
* Enqueue the PTP to be freed by pmap_update(). We can't remove
* the page from the uvm_object, as that can take further locks
* (intolerable right now because the PTEs are likely mapped in).
* Instead mark the PTP as free and if we bump into it again, we'll
* either ignore or reuse (depending on what's useful at the time).
*/
LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
}
static void
pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
pt_entry_t *ptes, pd_entry_t * const *pdes)
{
unsigned long index;
int level;
vaddr_t invaladdr;
pd_entry_t opde;
	KASSERT(pmap != pmap_kernel());
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());
level = 1;
do {
index = pl_i(va, level + 1);
opde = pmap_pte_testset(&pdes[level - 1][index], 0);
/*
* On Xen-amd64 or SVS, we need to sync the top level page
* directory on each CPU.
*/
#if defined(XENPV) && defined(__x86_64__)
if (level == PTP_LEVELS - 1) {
xen_kpm_sync(pmap, index);
}
#elif defined(SVS)
if (svs_enabled && level == PTP_LEVELS - 1 &&
pmap_is_user(pmap)) {
svs_pmap_sync(pmap, index);
}
#endif
invaladdr = level == 1 ? (vaddr_t)ptes :
(vaddr_t)pdes[level - 2];
pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
opde, TLBSHOOT_FREE_PTP);
#if defined(XENPV)
pmap_tlb_shootnow();
#endif
pmap_freepage(pmap, ptp, level);
if (level < PTP_LEVELS - 1) {
ptp = pmap_find_ptp(pmap, va, level + 1);
ptp->wire_count--;
if (ptp->wire_count > 1)
break;
}
} while (++level < PTP_LEVELS);
pmap_pte_flush();
}
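/*
 * Note the upward walk above: each PTP freed has its PDE cleared in the
 * parent and the parent's wire count dropped; the loop keeps climbing and
 * freeing parents for as long as a parent ends up mapping nothing but the
 * child just freed (its wire count falls back to 1).
 */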
/*
* pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
*
* => pmap should NOT be pmap_kernel()
* => pmap should be locked
* => we are not touching any PTEs yet, so they need not be mapped in
*/
static int
pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
int flags, struct vm_page **resultp)
{
struct vm_page *ptp;
int i, aflags;
struct uvm_object *obj;
voff_t off;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
/*
* Loop through all page table levels allocating a page
* for any level where we don't already have one.
*/
memset(pt, 0, sizeof(*pt));
aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
UVM_PGA_ZERO;
for (i = PTP_LEVELS; i > 1; i--) {
obj = &pmap->pm_obj[i - 2];
off = ptp_va2o(va, i - 1);
PMAP_DUMMY_LOCK(pmap);
pt->pg[i] = uvm_pagelookup(obj, off);
if (pt->pg[i] == NULL) {
pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
pt->alloced[i] = (pt->pg[i] != NULL);
} else if (pt->pg[i]->wire_count == 0) {
/* This page was queued to be freed; dequeue it. */
LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
pt->alloced[i] = true;
}
PMAP_DUMMY_UNLOCK(pmap);
if (pt->pg[i] == NULL) {
pmap_unget_ptp(pmap, pt);
return ENOMEM;
} else if (pt->alloced[i]) {
pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
&pmap_rbtree_ops);
PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
}
}
ptp = pt->pg[2];
KASSERT(ptp != NULL);
*resultp = ptp;
pmap->pm_ptphint[0] = ptp;
return 0;
}
/*
* pmap_install_ptp: install any freshly allocated PTPs
*
* => pmap should NOT be pmap_kernel()
* => pmap should be locked
* => PTEs must be mapped
* => preemption must be disabled
*/
static void
pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
pd_entry_t * const *pdes)
{
struct vm_page *ptp;
unsigned long index;
pd_entry_t *pva;
paddr_t pa;
int i;
	KASSERT(pmap != pmap_kernel());
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());
/*
* Now that we have all the pages looked up or allocated,
* loop through again installing any new ones into the tree.
*/
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_i(va, i);
pva = pdes[i - 2];
if (pmap_valid_entry(pva[index])) {
KASSERT(!pt->alloced[i]);
continue;
}
ptp = pt->pg[i];
ptp->flags &= ~PG_BUSY; /* never busy */
ptp->wire_count = 1;
pmap->pm_ptphint[i - 2] = ptp;
pa = VM_PAGE_TO_PHYS(ptp);
pmap_pte_set(&pva[index], (pd_entry_t)
(pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
/*
* On Xen-amd64 or SVS, we need to sync the top level page
* directory on each CPU.
*/
#if defined(XENPV) && defined(__x86_64__)
if (i == PTP_LEVELS) {
xen_kpm_sync(pmap, index);
}
#elif defined(SVS)
if (svs_enabled && i == PTP_LEVELS &&
pmap_is_user(pmap)) {
svs_pmap_sync(pmap, index);
}
#endif
pmap_pte_flush();
pmap_stats_update(pmap, 1, 0);
/*
* If we're not in the top level, increase the
* wire count of the parent page.
*/
		if (i < PTP_LEVELS) {
			pt->pg[i + 1]->wire_count++;
}
}
}
/*
* pmap_unget_ptp: free unused PTPs
*
* => pmap should NOT be pmap_kernel()
* => pmap should be locked
*/
static void
pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
{
int i;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
for (i = PTP_LEVELS; i > 1; i--) {
if (!pt->alloced[i]) {
continue;
}
KASSERT(pt->pg[i]->wire_count == 0);
PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
pmap_freepage(pmap, pt->pg[i], i - 1);
}
}
/*
* p m a p l i f e c y c l e f u n c t i o n s
*/
/*
* pmap_pdp_init: construct a new PDP.
*/
static void
pmap_pdp_init(pd_entry_t *pdir)
{
paddr_t pdirpa = 0;
vaddr_t object;
int i;
#if !defined(XENPV) || !defined(__x86_64__)
int npde;
#endif
#ifdef XENPV
int s;
#endif
memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE);
/*
* NOTE: This is all done unlocked, but we will check afterwards
* if we have raced with pmap_growkernel().
*/
#if defined(XENPV) && defined(__x86_64__)
/* Fetch the physical address of the page directory */
(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
/*
* This pdir will NEVER be active in kernel mode, so mark
* recursive entry invalid.
*/
pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
/*
* PDP constructed this way won't be for the kernel, hence we
* don't put kernel mappings on Xen.
*
* But we need to make pmap_create() happy, so put a dummy
* (without PTE_P) value at the right place.
*/
pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
(pd_entry_t)-1 & PTE_FRAME;
#else /* XENPV && __x86_64__*/
object = (vaddr_t)pdir;
for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
/* Fetch the physical address of the page directory */
(void)pmap_extract(pmap_kernel(), object, &pdirpa);
/* Put in recursive PDE to map the PTEs */
pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
pmap_pg_nx;
#ifndef XENPV
pdir[PDIR_SLOT_PTE + i] |= PTE_W;
#endif
}
/* Copy the kernel's top level PDE */
npde = nkptp[PTP_LEVELS - 1];
memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
npde * sizeof(pd_entry_t));
if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
int idx = pl_i(KERNBASE, PTP_LEVELS);
pdir[idx] = PDP_BASE[idx];
}
#ifdef __HAVE_PCPU_AREA
pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
#endif
#ifdef __HAVE_DIRECT_MAP
slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
#endif
#ifdef KASAN
slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
#endif
#ifdef KMSAN
slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
#endif
#endif /* XENPV && __x86_64__*/
#ifdef XENPV
s = splvm();
object = (vaddr_t)pdir;
pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
VM_PROT_READ);
pmap_update(pmap_kernel());
for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
/*
* pin as L2/L4 page, we have to do the page with the
* PDIR_SLOT_PTE entries last
*/
#ifdef PAE
if (i == l2tol3(PDIR_SLOT_PTE))
continue;
#endif
(void) pmap_extract(pmap_kernel(), object, &pdirpa);
#ifdef __x86_64__
xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
#else
xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
#endif
}
#ifdef PAE
object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE);
(void)pmap_extract(pmap_kernel(), object, &pdirpa);
xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
#endif
splx(s);
#endif /* XENPV */
}
/*
* pmap_pdp_fini: destructor for the PDPs.
*/
static void
pmap_pdp_fini(pd_entry_t *pdir)
{
#ifdef XENPV
paddr_t pdirpa = 0; /* XXX: GCC */
vaddr_t object = (vaddr_t)pdir;
int i;
int s = splvm();
pt_entry_t *pte;
for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
/* fetch the physical address of the page directory. */
(void) pmap_extract(pmap_kernel(), object, &pdirpa);
/* unpin page table */
xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
}
object = (vaddr_t)pdir;
for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
/* Set page RW again */
pte = kvtopte(object);
pmap_pte_set(pte, *pte | PTE_W);
xen_bcast_invlpg((vaddr_t)object);
}
splx(s);
#endif /* XENPV */
}
#ifdef PAE
static void *
pmap_pdp_alloc(struct pool *pp, int flags)
{
return (void *)uvm_km_alloc(kernel_map,
PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
UVM_KMF_WIRED);
}
static void
pmap_pdp_free(struct pool *pp, void *v)
{
uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
UVM_KMF_WIRED);
}
#endif /* PAE */
/*
* pmap_ctor: constructor for the pmap cache.
*/
static int
pmap_ctor(void *arg, void *obj, int flags)
{
struct pmap *pmap = obj;
pt_entry_t p;
int i;
KASSERT((flags & PR_WAITOK) != 0);
mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
rw_init(&pmap->pm_dummy_lock);
kcpuset_create(&pmap->pm_cpus, true);
kcpuset_create(&pmap->pm_kernel_cpus, true);
#ifdef XENPV
kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
#endif
LIST_INIT(&pmap->pm_gc_ptp);
pmap->pm_pve = NULL;
LIST_INIT(&pmap->pm_pvp_full);
LIST_INIT(&pmap->pm_pvp_part);
LIST_INIT(&pmap->pm_pvp_empty);
/* allocate and init PDP */
pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
for (;;) {
pmap_pdp_init(pmap->pm_pdir);
mutex_enter(&pmaps_lock);
p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
if (__predict_true(p != 0)) {
break;
}
mutex_exit(&pmaps_lock);
}
for (i = 0; i < PDP_SIZE; i++)
pmap->pm_pdirpa[i] =
pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
mutex_exit(&pmaps_lock);
return 0;
}
/*
* pmap_dtor: destructor for the pmap cache.
*/
static void
pmap_dtor(void *arg, void *obj)
{
struct pmap *pmap = obj;
mutex_enter(&pmaps_lock);
LIST_REMOVE(pmap, pm_list);
mutex_exit(&pmaps_lock);
pmap_pdp_fini(pmap->pm_pdir);
pool_put(&pmap_pdp_pool, pmap->pm_pdir);
mutex_destroy(&pmap->pm_lock);
rw_destroy(&pmap->pm_dummy_lock);
kcpuset_destroy(pmap->pm_cpus);
kcpuset_destroy(pmap->pm_kernel_cpus);
#ifdef XENPV
kcpuset_destroy(pmap->pm_xen_ptp_cpus);
#endif
}
/*
* pmap_create: create a pmap object.
*/
struct pmap *
pmap_create(void)
{
struct pmap *pmap;
int i;
pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
/* init uvm_object */
for (i = 0; i < PTP_LEVELS - 1; i++) {
uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
pmap->pm_ptphint[i] = NULL;
}
pmap->pm_stats.wired_count = 0;
/* count the PDP alloc'd below */
pmap->pm_stats.resident_count = PDP_SIZE;
#if !defined(__x86_64__)
pmap->pm_hiexec = 0;
#endif
/* Used by NVMM and Xen */
pmap->pm_enter = NULL;
pmap->pm_extract = NULL;
pmap->pm_remove = NULL;
pmap->pm_sync_pv = NULL;
pmap->pm_pp_remove_ent = NULL;
pmap->pm_write_protect = NULL;
pmap->pm_unwire = NULL;
pmap->pm_tlb_flush = NULL;
pmap->pm_data = NULL;
/* init the LDT */
pmap->pm_ldt = NULL;
pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
return pmap;
}
/*
* pmap_check_ptps: verify that none of the pmap's page table objects
* have any pages allocated to them.
*/
static void
pmap_check_ptps(struct pmap *pmap)
{
int i;
for (i = 0; i < PTP_LEVELS - 1; i++) {
KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
"pmap %p level %d still has %d pages",
pmap, i, (int)pmap->pm_obj[i].uo_npages);
}
}
static void
pmap_check_inuse(struct pmap *pmap)
{
#ifdef DEBUG
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_pmap == pmap)
panic("destroying pmap being used");
#if defined(XENPV) && defined(__x86_64__)
for (int i = 0; i < PDIR_SLOT_USERLIM; i++) {
if (pmap->pm_pdir[i] != 0 &&
ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
printf("pmap_destroy(%p) pmap_kernel %p "
"curcpu %d cpu %d ci_pmap %p "
"ci->ci_kpm_pdir[%d]=%" PRIx64
" pmap->pm_pdir[%d]=%" PRIx64 "\n",
pmap, pmap_kernel(), curcpu()->ci_index,
ci->ci_index, ci->ci_pmap,
i, ci->ci_kpm_pdir[i],
i, pmap->pm_pdir[i]);
panic("%s: used pmap", __func__);
}
}
#endif
}
#endif /* DEBUG */
}
/*
* pmap_destroy: drop reference count on pmap. free pmap if reference
* count goes to zero.
*
* => we can be called from pmap_unmap_ptes() with a different, unrelated
* pmap's lock held. be careful!
*/
void
pmap_destroy(struct pmap *pmap)
{
int i;
/*
* drop reference count and verify not in use.
*/
if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
return;
}
pmap_check_inuse(pmap);
/*
* handle any deferred frees.
*/
mutex_enter(&pmap->pm_lock);
	if (pmap->pm_pve != NULL) {
		pmap_free_pv(pmap, pmap->pm_pve);
pmap->pm_pve = NULL;
}
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
pmap_update(pmap);
/*
* Reference count is zero, free pmap resources and then free pmap.
*/
	pmap_check_ptps(pmap);
	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
#ifdef USER_LDT
if (pmap->pm_ldt != NULL) {
/*
* No need to switch the LDT; this address space is gone,
* nothing is using it.
*
* No need to lock the pmap for ldt_free (or anything else),
* we're the last one to use it.
*/
/* XXXAD can't take cpu_lock here - fix soon. */
mutex_enter(&cpu_lock);
ldt_free(pmap->pm_ldt_sel);
mutex_exit(&cpu_lock);
uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
MAX_USERLDT_SIZE, UVM_KMF_WIRED);
}
#endif
for (i = 0; i < PTP_LEVELS - 1; i++) {
uvm_obj_destroy(&pmap->pm_obj[i], false);
}
kcpuset_zero(pmap->pm_cpus);
kcpuset_zero(pmap->pm_kernel_cpus);
#ifdef XENPV
kcpuset_zero(pmap->pm_xen_ptp_cpus);
#endif
	KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
	KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
	KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));
	pmap_check_ptps(pmap);
if (__predict_false(pmap->pm_enter != NULL)) {
/* XXX make this a different cache */
pool_cache_destruct_object(&pmap_cache, pmap);
} else {
pool_cache_put(&pmap_cache, pmap);
}
}
/*
* pmap_zap_ptp: clear out an entire PTP without modifying PTEs
*
* => caller must hold pmap's lock
* => PTP must be mapped into KVA
* => must be called with kernel preemption disabled
* => does as little work as possible
*/
static void
pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
vaddr_t startva, vaddr_t blkendva)
{
#ifndef XENPV
struct pv_entry *pve;
struct vm_page *pg;
struct pmap_page *pp;
pt_entry_t opte;
rb_tree_t *tree;
vaddr_t va;
int wired;
uint8_t oattrs;
u_int cnt;
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
KASSERT(pmap != pmap_kernel());
KASSERT(ptp->wire_count > 1);
KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
/*
* Start at the lowest entered VA, and scan until there are no more
* PTEs in the PTPs.
*/
tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
pve = RB_TREE_MIN(tree);
wired = 0;
va = (vaddr_t)ptp->uanon;
pte += ((va - startva) >> PAGE_SHIFT);
for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
/*
* No need for an atomic to clear the PTE. Nothing else can
* see the address space any more and speculative access (if
* possible) won't modify. Therefore there's no need to
* track the accessed/dirty bits.
*/
opte = *pte;
if (!pmap_valid_entry(opte)) {
continue;
}
/*
* Count the PTE. If it's not for a managed mapping
		 * there's nothing more to do.
*/
cnt--;
wired -= (opte & PTE_WIRED);
if ((opte & PTE_PVLIST) == 0) {
#ifndef DOM0OPS
KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
"managed page without PTE_PVLIST for %#"
PRIxVADDR, va);
KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
"pv-tracked page without PTE_PVLIST for %#"
PRIxVADDR, va);
#endif
KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
va) == NULL);
continue;
}
/*
* "pve" now points to the lowest (by VA) dynamic PV entry
* in the PTP. If it's for this VA, take advantage of it to
* avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB
* tree by skipping to the next VA in the tree whenever
* there is a match here. The tree will be cleared out in
* one pass before return to pmap_remove_all().
*/
oattrs = pmap_pte_to_pp_attrs(opte);
if (pve != NULL && pve->pve_pte.pte_va == va) {
pp = pve->pve_pp;
KASSERT(pve->pve_pte.pte_ptp == ptp);
KASSERT(pp->pp_pte.pte_ptp != ptp ||
pp->pp_pte.pte_va != va);
mutex_spin_enter(&pp->pp_lock);
pp->pp_attrs |= oattrs;
LIST_REMOVE(pve, pve_list);
mutex_spin_exit(&pp->pp_lock);
/*
* pve won't be touched again until pmap_drain_pv(),
* so it's still safe to traverse the tree.
*/
pmap_free_pv(pmap, pve);
pve = RB_TREE_NEXT(tree, pve);
continue;
}
/*
* No entry in the tree so it must be embedded. Look up the
* page and cancel the embedded entry.
*/
if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
pp = VM_PAGE_TO_PP(pg);
} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
paddr_t pa = pmap_pte2pa(opte);
panic("%s: PTE_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR"pa = %#"PRIxPADDR
"(%#"PRIxPADDR")", __func__, va, pa, atop(pa));
}
mutex_spin_enter(&pp->pp_lock);
KASSERT(pp->pp_pte.pte_ptp == ptp);
KASSERT(pp->pp_pte.pte_va == va);
pp->pp_attrs |= oattrs;
pp->pp_pte.pte_ptp = NULL;
pp->pp_pte.pte_va = 0;
mutex_spin_exit(&pp->pp_lock);
}
/* PTP now empty - adjust the tree & stats to match. */
pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
ptp->wire_count = 1;
#ifdef DIAGNOSTIC
rb_tree_init(tree, &pmap_rbtree_ops);
#endif
#else /* !XENPV */
/*
* XXXAD For XEN, it's not clear to me that we can do this, because
* I guess the hypervisor keeps track of PTEs too.
*/
pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
#endif /* !XENPV */
}
/*
* pmap_remove_all: remove all mappings from pmap in bulk.
*
* Ordinarily when removing mappings it's important to hold the UVM object's
* lock, so that pages do not gain a new identity while retaining stale TLB
* entries (the same lock hold covers both pmap_remove() and pmap_update()).
* Here it's known that the address space is no longer visible to any user
* process, so we don't need to worry about that.
*/
bool
pmap_remove_all(struct pmap *pmap)
{
struct vm_page *ptps[32];
vaddr_t va, blkendva;
struct pmap *pmap2;
pt_entry_t *ptes;
pd_entry_t pde __diagused;
pd_entry_t * const *pdes;
int lvl __diagused, i, n;
/* XXX Can't handle EPT just yet. */
if (pmap->pm_remove != NULL) {
return false;
}
for (;;) {
/* Fetch a block of PTPs from tree. */
mutex_enter(&pmap->pm_lock);
n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
(void **)ptps, __arraycount(ptps), false);
if (n == 0) {
mutex_exit(&pmap->pm_lock);
break;
}
/* Remove all mappings in the set of PTPs. */
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
for (i = 0; i < n; i++) {
if (ptps[i]->wire_count == 0) {
/* It's dead: pmap_update() will expunge. */
continue;
}
/* Determine range of block. */
va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
blkendva = x86_round_pdr(va + 1);
/* Make sure everything squares up... */
KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
KASSERT(lvl == 1);
KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
/* Zap! */
pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
blkendva);
/* PTP should now be unused - free it. */
KASSERT(ptps[i]->wire_count == 1);
pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
}
pmap_unmap_ptes(pmap, pmap2);
pmap_drain_pv(pmap);
pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
mutex_exit(&pmap->pm_lock);
/* Process deferred frees. */
pmap_update(pmap);
/* A breathing point. */
preempt_point();
}
/* Verify that the pmap is now completely empty. */
pmap_check_ptps(pmap);
KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
"pmap %p not empty", pmap);
return true;
}
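/*
 * The return value indicates whether the bulk removal actually happened:
 * when a pm_remove override is installed (e.g. for EPT-style pmaps) this
 * function bails out early and returns false, and the caller presumably
 * falls back to ordinary per-range pmap_remove() calls.
 */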
#if defined(PMAP_FORK)
/*
* pmap_fork: perform any necessary data structure manipulation when
* a VM space is forked.
*/
void
pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
{
#ifdef USER_LDT
union descriptor *new_ldt;
int sel;
if (__predict_true(pmap1->pm_ldt == NULL)) {
return;
}
/*
* Copy the LDT into the new process.
*
* Read pmap1's ldt pointer unlocked; if it changes behind our back
* we'll retry. This will starve if there's a stream of LDT changes
* in another thread but that should not happen.
*/
retry:
if (pmap1->pm_ldt != NULL) {
/* Allocate space for the new process's LDT */
new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
if (new_ldt == NULL) {
printf("WARNING: %s: unable to allocate LDT space\n",
__func__);
return;
}
mutex_enter(&cpu_lock);
/* Get a GDT slot for it */
sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
if (sel == -1) {
mutex_exit(&cpu_lock);
uvm_km_free(kernel_map, (vaddr_t)new_ldt,
MAX_USERLDT_SIZE, UVM_KMF_WIRED);
printf("WARNING: %s: unable to allocate LDT selector\n",
__func__);
return;
}
} else {
/* Wasn't anything there after all. */
new_ldt = NULL;
sel = -1;
mutex_enter(&cpu_lock);
}
/*
* Now that we have cpu_lock, ensure the LDT status is the same.
*/
if (pmap1->pm_ldt != NULL) {
if (new_ldt == NULL) {
/* A wild LDT just appeared. */
mutex_exit(&cpu_lock);
goto retry;
}
/* Copy the LDT data and install it in pmap2 */
memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
pmap2->pm_ldt = new_ldt;
pmap2->pm_ldt_sel = sel;
mutex_exit(&cpu_lock);
} else {
if (new_ldt != NULL) {
/* The LDT disappeared, drop what we did. */
ldt_free(sel);
mutex_exit(&cpu_lock);
uvm_km_free(kernel_map, (vaddr_t)new_ldt,
MAX_USERLDT_SIZE, UVM_KMF_WIRED);
return;
}
/* We're good, just leave. */
mutex_exit(&cpu_lock);
}
#endif /* USER_LDT */
}
#endif /* PMAP_FORK */
#ifdef USER_LDT
/*
* pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap
* is active, reload LDTR.
*/
static void
pmap_ldt_xcall(void *arg1, void *arg2)
{
struct pmap *pm;
kpreempt_disable();
pm = arg1;
if (curcpu()->ci_pmap == pm) {
#if defined(SVS)
if (svs_enabled) {
svs_ldt_sync(pm);
} else
#endif
lldt(pm->pm_ldt_sel);
}
kpreempt_enable();
}
/*
* pmap_ldt_sync: LDT selector for the named pmap is changing. swap
* in the new selector on all CPUs.
*/
void
pmap_ldt_sync(struct pmap *pm)
{
uint64_t where;
KASSERT(mutex_owned(&cpu_lock));
pmap_ldt_evcnt.ev_count++;
where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
xc_wait(where);
}
/*
* pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
* restore the default.
*/
void
pmap_ldt_cleanup(struct lwp *l)
{
pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
union descriptor *ldt;
int sel;
if (__predict_true(pmap->pm_ldt == NULL)) {
return;
}
mutex_enter(&cpu_lock);
if (pmap->pm_ldt != NULL) {
sel = pmap->pm_ldt_sel;
ldt = pmap->pm_ldt;
pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
pmap->pm_ldt = NULL;
pmap_ldt_sync(pmap);
ldt_free(sel);
uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
UVM_KMF_WIRED);
}
mutex_exit(&cpu_lock);
}
#endif /* USER_LDT */
/*
* pmap_activate: activate a process' pmap
*
* => must be called with kernel preemption disabled
* => if lwp is the curlwp, then set ci_want_pmapload so that
* actual MMU context switch will be done by pmap_load() later
*/
void
pmap_activate(struct lwp *l)
{
struct cpu_info *ci;
struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
KASSERT(kpreempt_disabled());
ci = curcpu();
if (l != ci->ci_curlwp)
return;
	KASSERT(ci->ci_want_pmapload == 0);
	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
/*
* no need to switch to kernel vmspace because
* it's a subset of any vmspace.
*/
if (pmap == pmap_kernel()) {
ci->ci_want_pmapload = 0;
return;
}
ci->ci_want_pmapload = 1;
}
#if defined(XENPV) && defined(__x86_64__)
#define KASSERT_PDIRPA(pmap) \
KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
pmap == pmap_kernel())
#elif defined(PAE)
#define KASSERT_PDIRPA(pmap) \
KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
#elif !defined(XENPV)
#define KASSERT_PDIRPA(pmap) \
KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
#else
#define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */
#endif
/*
* pmap_reactivate: try to regain reference to the pmap.
*
* => Must be called with kernel preemption disabled.
*/
static void
pmap_reactivate(struct pmap *pmap)
{
struct cpu_info * const ci = curcpu();
const cpuid_t cid = cpu_index(ci);
	KASSERT(kpreempt_disabled());
	KASSERT_PDIRPA(pmap);
/*
* If we still have a lazy reference to this pmap, we can assume
* that there was no TLB shootdown for this pmap in the meantime.
*
* The order of events here is important as we must synchronize
* with TLB shootdown interrupts. Declare interest in invalidations
* (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
* change only when the state is TLBSTATE_LAZY.
*/
ci->ci_tlbstate = TLBSTATE_VALID;
	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
	if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
/* We have the reference, state is valid. */
} else {
/*
		 * Must reload the TLB: the pmap was changed while we
		 * were deactivated.
*/
kcpuset_atomic_set(pmap->pm_cpus, cid);
tlbflush();
}
}
/*
* pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
* and relevant LDT info.
*
* Ensures that the current process' pmap is loaded on the current CPU's
* MMU and that there are no stale TLB entries.
*
* => The caller should disable kernel preemption or do check-and-retry
* to prevent a preemption from undoing our efforts.
* => This function may block.
*/
void
pmap_load(void)
{
struct cpu_info *ci;
struct pmap *pmap, *oldpmap;
struct lwp *l;
uint64_t pctr;
int ilevel __diagused;
u_long psl __diagused;
kpreempt_disable();
retry:
ci = curcpu();
if (!ci->ci_want_pmapload) {
kpreempt_enable();
return;
}
l = ci->ci_curlwp;
pctr = lwp_pctr();
__insn_barrier();
/* should be able to take ipis. */
KASSERTMSG((ilevel = ci->ci_ilevel) < IPL_HIGH, "ilevel=%d", ilevel);
#ifdef XENPV
	/* Check to see if interrupts are enabled (i.e., no events are masked) */
KASSERTMSG((psl = x86_read_psl()) == 0, "psl=0x%lx", psl);
#else
KASSERTMSG(((psl = x86_read_psl()) & PSL_I) != 0, "psl=0x%lx", psl);
#endif
KASSERT(l != NULL);
pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
KASSERT(pmap != pmap_kernel());
oldpmap = ci->ci_pmap;
if (pmap == oldpmap) {
pmap_reactivate(pmap);
ci->ci_want_pmapload = 0;
kpreempt_enable();
return;
}
/*
* Acquire a reference to the new pmap and perform the switch.
*/
pmap_reference(pmap);
pmap_load1(l, pmap, oldpmap);
ci->ci_want_pmapload = 0;
/*
* we're now running with the new pmap. drop the reference
* to the old pmap. if we block, we need to go around again.
*/
pmap_destroy(oldpmap);
__insn_barrier();
if (lwp_pctr() != pctr) {
goto retry;
}
kpreempt_enable();
}
/*
* pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
* pmap_load(). It's critically important that this function does not
* block.
*/
static void
pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
{
struct cpu_info *ci;
struct pcb *pcb;
cpuid_t cid;
KASSERT(kpreempt_disabled());
pcb = lwp_getpcb(l);
ci = l->l_cpu;
cid = cpu_index(ci);
kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
	KASSERT_PDIRPA(oldpmap);
	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
/*
* Mark the pmap in use by this CPU. Again, we must synchronize
* with TLB shootdown interrupts, so set the state VALID first,
* then register us for shootdown events on this pmap.
*/
ci->ci_tlbstate = TLBSTATE_VALID;
kcpuset_atomic_set(pmap->pm_cpus, cid);
kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
ci->ci_pmap = pmap;
/*
* update tss. now that we have registered for invalidations
* from other CPUs, we're good to load the page tables.
*/
#ifdef PAE
pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
#else
pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
#endif
#ifdef i386
#ifndef XENPV
ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
#endif
#endif
#if defined(SVS) && defined(USER_LDT)
if (svs_enabled) {
svs_ldt_sync(pmap);
} else
#endif
lldt(pmap->pm_ldt_sel);
cpu_load_pmap(pmap, oldpmap);
}
/*
* pmap_deactivate: deactivate a process' pmap.
*
* => Must be called with kernel preemption disabled (high IPL is enough).
*/
void
pmap_deactivate(struct lwp *l)
{
struct pmap *pmap;
struct cpu_info *ci;
	KASSERT(kpreempt_disabled());
	if (l != curlwp) {
return;
}
/*
* Wait for pending TLB shootdowns to complete. Necessary because
* TLB shootdown state is per-CPU, and the LWP may be coming off
* the CPU before it has a chance to call pmap_update(), e.g. due
* to kernel preemption or blocking routine in between.
*/
pmap_tlb_shootnow();
ci = curcpu();
if (ci->ci_want_pmapload) {
/*
* ci_want_pmapload means that our pmap is not loaded on
* the CPU or TLB might be stale. note that pmap_kernel()
* is always considered loaded.
*/
KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
!= pmap_kernel());
KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
!= ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
/*
* userspace has not been touched.
* nothing to do here.
*/
ci->ci_want_pmapload = 0;
return;
}
pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
if (pmap == pmap_kernel()) {
return;
}
	KASSERT_PDIRPA(pmap);
	KASSERT(ci->ci_pmap == pmap);
/*
* we aren't interested in TLB invalidations for this pmap,
* at least for the time being.
*/
KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
ci->ci_tlbstate = TLBSTATE_LAZY;
}
#ifdef EFI_RUNTIME
extern struct pmap *efi_runtime_pmap;
/*
* pmap_is_user: true if pmap, which must not be the kernel pmap, is
* for an unprivileged user process
*/
bool
pmap_is_user(struct pmap *pmap)
{
KASSERT(pmap != pmap_kernel());
return (pmap != efi_runtime_pmap);
}
/*
* pmap_activate_sync: synchronously activate specified pmap.
*
* => Must be called with kernel preemption disabled (high IPL is enough).
* => Must not sleep before pmap_deactivate_sync.
*/
void *
pmap_activate_sync(struct pmap *pmap)
{
struct cpu_info *ci = curcpu();
struct pmap *oldpmap = ci->ci_pmap;
unsigned cid = cpu_index(ci);
KASSERT(kpreempt_disabled());
KASSERT(pmap != pmap_kernel());
KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
if (oldpmap) {
KASSERT_PDIRPA(oldpmap);
kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
}
ci->ci_tlbstate = TLBSTATE_VALID;
kcpuset_atomic_set(pmap->pm_cpus, cid);
kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
ci->ci_pmap = pmap;
#if defined(SVS) && defined(USER_LDT)
if (svs_enabled) {
svs_ldt_sync(pmap);
} else
#endif
lldt(pmap->pm_ldt_sel);
cpu_load_pmap(pmap, oldpmap);
return oldpmap;
}
/*
* pmap_deactivate_sync: synchronously deactivate specified pmap and
* restore whatever was active before pmap_activate_sync.
*
* => Must be called with kernel preemption disabled (high IPL is enough).
* => Must not have slept since pmap_activate_sync.
*/
void
pmap_deactivate_sync(struct pmap *pmap, void *cookie)
{
struct cpu_info *ci = curcpu();
struct pmap *oldpmap = cookie;
unsigned cid = cpu_index(ci);
KASSERT(kpreempt_disabled());
KASSERT(pmap != pmap_kernel());
KASSERT(ci->ci_pmap == pmap);
KASSERT_PDIRPA(pmap);
KASSERT(kcpuset_isset(pmap->pm_cpus, cid));
KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
pmap_tlb_shootnow();
kcpuset_atomic_clear(pmap->pm_cpus, cid);
kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid);
ci->ci_tlbstate = TLBSTATE_VALID;
ci->ci_pmap = oldpmap;
if (oldpmap) {
kcpuset_atomic_set(oldpmap->pm_cpus, cid);
kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid);
#if defined(SVS) && defined(USER_LDT)
if (svs_enabled) {
svs_ldt_sync(oldpmap);
} else
#endif
lldt(oldpmap->pm_ldt_sel);
cpu_load_pmap(oldpmap, pmap);
} else {
lcr3(pmap_pdirpa(pmap_kernel(), 0));
}
}
#endif /* EFI_RUNTIME */
/*
* some misc. functions
*/
bool
pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
int *lastlvl)
{
unsigned long index;
pd_entry_t pde;
int i;
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_i(va, i);
pde = pdes[i - 2][index];
if ((pde & PTE_P) == 0) {
*lastlvl = i;
return false;
}
if (pde & PTE_PS)
break;
}
if (lastpde != NULL)
*lastpde = pde;
*lastlvl = i;
return true;
}
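/*
 * For reference, on amd64 the loop above walks the per-level indices
 * that pl_i() extracts from the VA; a sketch, assuming the usual 4-level,
 * 9-bits-per-level layout:
 *
 *	pl_i(va, 4) == (va >> 39) & 511		L4 (PML4) index
 *	pl_i(va, 3) == (va >> 30) & 511		L3 (PDPT) index
 *	pl_i(va, 2) == (va >> 21) & 511		L2 (PD)   index
 *
 * stopping early when a large-page (PTE_PS) entry is found.
 */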
/*
* pmap_extract: extract a PA for the given VA
*/
bool
pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
pt_entry_t *ptes, pte;
pd_entry_t pde;
pd_entry_t * const *pdes;
struct pmap *pmap2;
paddr_t pa;
bool rv;
int lvl;
if (__predict_false(pmap->pm_extract != NULL)) {
return (*pmap->pm_extract)(pmap, va, pap);
}
#ifdef __HAVE_DIRECT_MAP
if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
if (pap != NULL) {
*pap = PMAP_DIRECT_UNMAP(va);
}
return true;
}
#endif
rv = false;
pa = 0;
	if (pmap != pmap_kernel()) {
		mutex_enter(&pmap->pm_lock);
}
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
if (lvl == 2) {
pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
rv = true;
} else {
KASSERT(lvl == 1);
pte = ptes[pl1_i(va)];
			if (__predict_true((pte & PTE_P) != 0)) {
				pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
rv = true;
}
}
}
pmap_unmap_ptes(pmap, pmap2);
	if (pmap != pmap_kernel()) {
		mutex_exit(&pmap->pm_lock);
}
	if (pap != NULL) {
		*pap = pa;
}
return rv;
}
/*
* vtophys: virtual address to physical address. For use by
* machine-dependent code only.
*/
paddr_t
vtophys(vaddr_t va)
{
paddr_t pa;
if (pmap_extract(pmap_kernel(), va, &pa) == true)
return pa;
return 0;
}
__strict_weak_alias(pmap_extract_ma, pmap_extract);
#ifdef XENPV
/*
* vtomach: virtual address to machine address. For use by
* machine-dependent code only.
*/
paddr_t
vtomach(vaddr_t va)
{
paddr_t pa;
if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
return pa;
return 0;
}
#endif
/*
* pmap_virtual_space: used during bootup [pmap_steal_memory] to
* determine the bounds of the kernel virtual address space.
*/
void
pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
{
*startp = virtual_avail;
*endp = virtual_end;
}
void
pmap_zero_page(paddr_t pa)
{
#if defined(__HAVE_DIRECT_MAP)
memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
#else
#if defined(XENPV)
if (XEN_VERSION_SUPPORTED(3, 4)) {
xen_pagezero(pa);
return;
}
#endif
struct cpu_info *ci;
pt_entry_t *zpte;
vaddr_t zerova;
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
kpreempt_disable();
ci = curcpu();
zerova = ci->vpage[VPAGE_ZER];
zpte = ci->vpage_pte[VPAGE_ZER];
KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
pmap_pte_flush();
pmap_update_pg(zerova); /* flush TLB */
memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE);
#if defined(DIAGNOSTIC) || defined(XENPV)
pmap_pte_set(zpte, 0); /* zap ! */
pmap_pte_flush();
#endif
kpreempt_enable();
#endif /* defined(__HAVE_DIRECT_MAP) */
}
void
pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
{
#if defined(__HAVE_DIRECT_MAP)
vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
#else
#if defined(XENPV)
if (XEN_VERSION_SUPPORTED(3, 4)) {
xen_copy_page(srcpa, dstpa);
return;
}
#endif
struct cpu_info *ci;
pt_entry_t *srcpte, *dstpte;
vaddr_t srcva, dstva;
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
kpreempt_disable();
ci = curcpu();
srcva = ci->vpage[VPAGE_SRC];
dstva = ci->vpage[VPAGE_DST];
srcpte = ci->vpage_pte[VPAGE_SRC];
dstpte = ci->vpage_pte[VPAGE_DST];
KASSERT(*srcpte == 0 && *dstpte == 0);
pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
pmap_pte_flush();
pmap_update_pg(srcva);
pmap_update_pg(dstva);
memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
#if defined(DIAGNOSTIC) || defined(XENPV)
pmap_pte_set(srcpte, 0);
pmap_pte_set(dstpte, 0);
pmap_pte_flush();
#endif
kpreempt_enable();
#endif /* defined(__HAVE_DIRECT_MAP) */
}
static pt_entry_t *
pmap_map_ptp(struct vm_page *ptp)
{
#ifdef __HAVE_DIRECT_MAP
return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
#else
struct cpu_info *ci;
pt_entry_t *ptppte;
vaddr_t ptpva;
KASSERT(kpreempt_disabled());
#ifndef XENPV
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
#else
const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
#endif
ci = curcpu();
ptpva = ci->vpage[VPAGE_PTP];
ptppte = ci->vpage_pte[VPAGE_PTP];
pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
pmap_pte_flush();
pmap_update_pg(ptpva);
return (pt_entry_t *)ptpva;
#endif
}
static void
pmap_unmap_ptp(void)
{
#ifndef __HAVE_DIRECT_MAP
#if defined(DIAGNOSTIC) || defined(XENPV)
struct cpu_info *ci;
pt_entry_t *pte;
KASSERT(kpreempt_disabled());
ci = curcpu();
pte = ci->vpage_pte[VPAGE_PTP];
	if (*pte != 0) {
		pmap_pte_set(pte, 0);
pmap_pte_flush();
}
#endif
#endif
}
static pt_entry_t *
pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
{
	KASSERT(kpreempt_disabled());
	if (pmap_is_curpmap(pmap)) {
return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
}
	KASSERT(ptp != NULL);
	return pmap_map_ptp(ptp) + pl1_pi(va);
}
static void
pmap_unmap_pte(void)
{
	KASSERT(kpreempt_disabled());
	pmap_unmap_ptp();
}
/*
* p m a p r e m o v e f u n c t i o n s
*
* functions that remove mappings
*/
/*
* pmap_remove_ptes: remove PTEs from a PTP
*
* => caller must hold pmap's lock
* => PTP must be mapped into KVA
* => PTP should be null if pmap == pmap_kernel()
* => must be called with kernel preemption disabled
* => issues TLB shootdowns for the removed PTEs as needed
*/
static void
pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
vaddr_t startva, vaddr_t endva)
{
pt_entry_t *pte = (pt_entry_t *)ptpva;
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());
/*
* mappings are very often sparse, so clip the given range to the
* range of PTEs that are known present in the PTP.
*/
pmap_ptp_range_clip(ptp, &startva, &pte);
/*
* note that ptpva points to the PTE that maps startva. this may
* or may not be the first PTE in the PTP.
*
* we loop through the PTP while there are still PTEs to look at
* and the wire_count is greater than 1 (because we use the wire_count
* to keep track of the number of real PTEs in the PTP).
*/
while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
(void)pmap_remove_pte(pmap, ptp, pte, startva);
startva += PAGE_SIZE;
pte++;
}
}
/*
* pmap_remove_pte: remove a single PTE from a PTP.
*
* => caller must hold pmap's lock
* => PTP must be mapped into KVA
* => PTP should be null if pmap == pmap_kernel()
* => returns true if we removed a mapping
* => must be called with kernel preemption disabled
*/
static bool
pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
vaddr_t va)
{
struct pv_entry *pve;
struct vm_page *pg;
struct pmap_page *pp;
pt_entry_t opte;
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());
	if (!pmap_valid_entry(*pte)) {
/* VA not mapped. */
return false;
}
/* Atomically save the old PTE and zap it. */
opte = pmap_pte_testset(pte, 0);
if (!pmap_valid_entry(opte)) {
return false;
}
pmap_exec_account(pmap, va, opte, 0);
	pmap_stats_update_bypte(pmap, 0, opte);
	if (ptp) {
/*
* Dropping a PTE. Make sure that the PDE is flushed.
*/
ptp->wire_count--;
if (ptp->wire_count <= 1) {
opte |= PTE_A;
}
}
	if ((opte & PTE_A) != 0) {
		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
}
/*
* If we are not on a pv list - we are done.
*/
if ((opte & PTE_PVLIST) == 0) {
#ifndef DOM0OPS
KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
"managed page without PTE_PVLIST for %#"PRIxVADDR, va);
KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
"pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
#endif
KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
return true;
}
if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
		pp = VM_PAGE_TO_PP(pg);
	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
paddr_t pa = pmap_pte2pa(opte);
panic("%s: PTE_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
__func__, va, pa, atop(pa));
}
/* Sync R/M bits. */
pve = pmap_lookup_pv(pmap, ptp, pp, va);
pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
return true;
}
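/*
 * pmap_remove_locked: remove a range of mappings, with the pmap's lock
 * already held by the caller; does the real work for pmap_remove()
 */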
static void
pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
pt_entry_t *ptes;
pd_entry_t pde;
pd_entry_t * const *pdes;
bool result;
vaddr_t blkendva, va = sva;
struct vm_page *ptp;
struct pmap *pmap2;
int lvl;
KASSERT(mutex_owned(&pmap->pm_lock));
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
/*
* removing one page? take shortcut function.
*/
if (va + PAGE_SIZE == eva) {
if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
KASSERT(lvl == 1);
/* Get PTP if non-kernel mapping. */
if (pmap != pmap_kernel()) {
ptp = pmap_find_ptp(pmap, va, 1);
KASSERTMSG(ptp != NULL,
"%s: unmanaged PTP detected", __func__);
} else {
/* Never free kernel PTPs. */
ptp = NULL;
}
result = pmap_remove_pte(pmap, ptp,
&ptes[pl1_i(va)], va);
/*
* if mapping removed and the PTP is no longer
* being used, free it!
*/
if (result && ptp && ptp->wire_count <= 1)
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
}
} else for (/* null */ ; va < eva ; va = blkendva) {
/* determine range of block */
blkendva = x86_round_pdr(va+1);
if (blkendva > eva)
blkendva = eva;
if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
/* Skip a range corresponding to an invalid pde. */
blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
continue;
}
KASSERT(lvl == 1);
/* Get PTP if non-kernel mapping. */
if (pmap != pmap_kernel()) {
ptp = pmap_find_ptp(pmap, va, 1);
KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
__func__);
} else {
/* Never free kernel PTPs. */
ptp = NULL;
}
pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
blkendva);
/* If PTP is no longer being used, free it. */
if (ptp && ptp->wire_count <= 1) {
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
}
}
pmap_unmap_ptes(pmap, pmap2);
pmap_drain_pv(pmap);
}
/*
* pmap_remove: mapping removal function.
*
* => caller should not be holding any pmap locks
*/
void
pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
if (__predict_false(pmap->pm_remove != NULL)) {
(*pmap->pm_remove)(pmap, sva, eva);
return;
}
mutex_enter(&pmap->pm_lock);
pmap_remove_locked(pmap, sva, eva);
mutex_exit(&pmap->pm_lock);
}
/*
* pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
*
* => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
* => Caller should disable kernel preemption.
* => issues tlb shootdowns if necessary.
*/
static int
pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
pt_entry_t *optep)
{
struct pmap *pmap;
struct vm_page *ptp;
vaddr_t va;
pt_entry_t *ptep;
pt_entry_t opte;
pt_entry_t npte;
pt_entry_t expect;
bool need_shootdown;
ptp = pvpte->pte_ptp;
va = pvpte->pte_va;
KASSERT(ptp == NULL || ptp->uobject != NULL);
KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
pmap = ptp_to_pmap(ptp);
KASSERT(kpreempt_disabled());
if (__predict_false(pmap->pm_sync_pv != NULL)) {
return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
optep);
}
expect = pmap_pa2pte(pa) | PTE_P;
if (clearbits != ~0) {
KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
clearbits = pmap_pp_attrs_to_pte(clearbits);
}
ptep = pmap_map_pte(pmap, ptp, va);
do {
opte = *ptep;
KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
KASSERT(opte == 0 || (opte & PTE_P) != 0);
if ((opte & (PTE_FRAME | PTE_P)) != expect) {
/*
* We lost a race with a V->P operation like
* pmap_remove(). Wait for the competitor
* reflecting pte bits into mp_attrs.
*/
pmap_unmap_pte();
return EAGAIN;
}
/*
* Check if there's anything to do on this PTE.
*/
if ((opte & clearbits) == 0) {
need_shootdown = false;
break;
}
/*
* We need a shootdown if the PTE is cached (PTE_A) ...
* ... Unless we are clearing only the PTE_W bit and
* it isn't cached as RW (PTE_D).
*/
need_shootdown = (opte & PTE_A) != 0 && !(clearbits == PTE_W && (opte & PTE_D) == 0);
npte = opte & ~clearbits;
/*
* If we need a shootdown anyway, clear PTE_A and PTE_D.
*/
if (need_shootdown) {
npte &= ~(PTE_A | PTE_D);
}
KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
KASSERT(npte == 0 || (opte & PTE_P) != 0);
} while (pmap_pte_cas(ptep, opte, npte) != opte);
if (need_shootdown) {
pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
}
pmap_unmap_pte();
*oattrs = pmap_pte_to_pp_attrs(opte);
if (optep != NULL) *optep = opte;
return 0;
}
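/*
 * pmap_pp_remove_ent: update stats and the PTP's wire count after a
 * mapping has been zapped by pmap_pp_remove(); frees the PTP once it is
 * no longer in use
 *
 * => caller must hold pmap's lock
 */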
static void
pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
vaddr_t va)
{
struct pmap *pmap2;
pt_entry_t *ptes;
pd_entry_t * const *pdes;
KASSERT(mutex_owned(&pmap->pm_lock));
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
pmap_stats_update_bypte(pmap, 0, opte);
ptp->wire_count--;
if (ptp->wire_count <= 1) {
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
}
pmap_unmap_ptes(pmap, pmap2);
}
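/*
 * pmap_pp_remove: remove all mappings of the page described by 'pp',
 * which may be a managed page or an unmanaged pv-tracked page; common
 * code for pmap_page_remove() and pmap_pv_remove()
 *
 * => caller should not be holding any pmap locks
 */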
static void
pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
{
struct pv_pte *pvpte;
struct vm_page *ptp;
uintptr_t sum;
uint8_t oattrs;
bool locked;
/*
* Do an unlocked check to see if the page has no mappings, eg when
* pmap_remove_all() was called before amap_wipeout() for a process
* private amap - common. The page being removed must be on the way
* out, so we don't have to worry about concurrent attempts to enter
* it (otherwise the caller either doesn't care or has screwed up).
*/
sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
if (sum == 0) {
return;
}
kpreempt_disable();
for (;;) {
struct pmap *pmap;
struct pv_entry *pve;
pt_entry_t opte;
vaddr_t va;
mutex_spin_enter(&pp->pp_lock);
if ((pvpte = pv_pte_first(pp)) == NULL) {
mutex_spin_exit(&pp->pp_lock);
break;
}
/*
* Add a reference to the pmap before clearing the pte.
* Otherwise the pmap can disappear behind us.
*/
ptp = pvpte->pte_ptp;
pmap = ptp_to_pmap(ptp);
KASSERT(pmap->pm_obj[0].uo_refs > 0);
if (ptp != NULL) {
pmap_reference(pmap);
}
/*
* Now try to lock it. We need a direct handoff between
* pp_lock and pm_lock to know the pv_entry is kept intact
* and kept associated with this pmap. If that can't be
* had, wait for the pmap's lock to become free and then
* retry.
*/
locked = mutex_tryenter(&pmap->pm_lock);
mutex_spin_exit(&pp->pp_lock);
if (!locked) {
mutex_enter(&pmap->pm_lock);
/* nothing, just wait for it */
mutex_exit(&pmap->pm_lock);
if (ptp != NULL) {
pmap_destroy(pmap);
}
continue;
}
va = pvpte->pte_va;
KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
"va %lx pmap %p ptp %p is empty", va, pmap, ptp);
KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
"va %lx pmap %p ptp %p is free", va, pmap, ptp);
KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
"va %lx pmap %p ptp %p is empty", va, pmap, ptp);
#ifdef DEBUG
pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
rb_tree_t *tree = (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
pve = pmap_treelookup_pv(pmap, ptp, tree, va);
if (pve == NULL) {
KASSERTMSG(&pp->pp_pte == pvpte,
"va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
va, pmap, ptp, pvpte, pve);
} else {
KASSERTMSG(&pve->pve_pte == pvpte,
"va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
va, pmap, ptp, pvpte, pve);
}
#endif
if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
panic("pmap_pp_remove: mapping not present");
}
pve = pmap_lookup_pv(pmap, ptp, pp, va);
pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
/* Update the PTP reference count. Free if last reference. */
if (ptp != NULL) {
KASSERT(pmap != pmap_kernel());
pmap_tlb_shootnow();
if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
(*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
} else {
pmap_pp_remove_ent(pmap, ptp, opte, va);
}
} else {
KASSERT(pmap == pmap_kernel());
pmap_stats_update_bypte(pmap, 0, opte);
}
pmap_tlb_shootnow();
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
if (ptp != NULL) {
pmap_destroy(pmap);
}
}
kpreempt_enable();
}
/*
* pmap_page_remove: remove a managed vm_page from all pmaps that map it
*
* => R/M bits are sync'd back to attrs
*/
void
pmap_page_remove(struct vm_page *pg)
{
struct pmap_page *pp;
paddr_t pa;
pp = VM_PAGE_TO_PP(pg);
pa = VM_PAGE_TO_PHYS(pg);
pmap_pp_remove(pp, pa);
}
/*
* pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
* that map it
*/
void
pmap_pv_remove(paddr_t pa)
{
struct pmap_page *pp;
pp = pmap_pv_tracked(pa);
if (pp == NULL)
panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
pmap_pp_remove(pp, pa);
}
/*
* p m a p a t t r i b u t e f u n c t i o n s
* functions that test/change managed page's attributes
* since a page can be mapped multiple times we must check each PTE that
* maps it by going down the pv lists.
*/
/*
* pmap_test_attrs: test a page's attributes
*/
bool
pmap_test_attrs(struct vm_page *pg, unsigned testbits)
{
struct pmap_page *pp;
struct pv_pte *pvpte;
struct pmap *pmap;
uint8_t oattrs;
u_int result;
paddr_t pa;
pp = VM_PAGE_TO_PP(pg);
if ((pp->pp_attrs & testbits) != 0) {
return true;
}
pa = VM_PAGE_TO_PHYS(pg);
startover:
mutex_spin_enter(&pp->pp_lock);
for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
if ((pp->pp_attrs & testbits) != 0) {
break;
}
if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
/*
* raced with a V->P operation. wait for the other
* side to finish by acquiring pmap's lock. if no
* wait, updates to pp_attrs by the other side may
* go unseen.
*/
pmap = ptp_to_pmap(pvpte->pte_ptp);
pmap_reference(pmap);
mutex_spin_exit(&pp->pp_lock);
mutex_enter(&pmap->pm_lock);
/* nothing. */
mutex_exit(&pmap->pm_lock);
pmap_destroy(pmap);
goto startover;
}
pp->pp_attrs |= oattrs;
}
result = pp->pp_attrs & testbits;
mutex_spin_exit(&pp->pp_lock);
/*
* note that we will exit the for loop with a non-NULL pvpte if
* we have found the bits we are testing for.
*/
return result != 0;
}
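/*
 * pmap_pp_clear_attrs: clear the specified attributes from all mappings
 * of a page; common code for pmap_clear_attrs() and pmap_pv_clear_attrs()
 *
 * => returns true if any of the requested bits were found set
 */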
static bool
pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
{
struct pv_pte *pvpte;
struct pmap *pmap;
uint8_t oattrs;
u_int result;
startover:
mutex_spin_enter(&pp->pp_lock);
for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
/*
* raced with a V->P operation. wait for the other
* side to finish by acquiring pmap's lock. it is
* probably unmapping the page, and it will be gone
* when the loop is restarted.
*/
pmap = ptp_to_pmap(pvpte->pte_ptp);
pmap_reference(pmap);
mutex_spin_exit(&pp->pp_lock);
mutex_enter(&pmap->pm_lock);
/* nothing. */
mutex_exit(&pmap->pm_lock);
pmap_destroy(pmap);
goto startover;
}
pp->pp_attrs |= oattrs;
}
result = pp->pp_attrs & clearbits;
pp->pp_attrs &= ~clearbits;
pmap_tlb_shootnow();
mutex_spin_exit(&pp->pp_lock);
return result != 0;
}
/*
* pmap_clear_attrs: clear the specified attribute for a page.
*
* => we return true if we cleared one of the bits we were asked to
*/
bool
pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
{
struct pmap_page *pp;
paddr_t pa;
pp = VM_PAGE_TO_PP(pg);
pa = VM_PAGE_TO_PHYS(pg);
/*
* If this is a new page, assert it has no mappings and simply zap
* the stored attributes without taking any locks.
*/
if ((pg->flags & PG_FAKE) != 0) {
KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
atomic_store_relaxed(&pp->pp_attrs, 0);
return false;
} else {
return pmap_pp_clear_attrs(pp, pa, clearbits);
}
}
/*
* pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
* pv-tracked page.
*/
bool
pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
{
struct pmap_page *pp;
pp = pmap_pv_tracked(pa);
if (pp == NULL)
panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
return pmap_pp_clear_attrs(pp, pa, clearbits);
}
/*
* p m a p p r o t e c t i o n f u n c t i o n s
*/
/*
* pmap_page_protect: change the protection of all recorded mappings
* of a managed page
*
* => NOTE: this is an inline function in pmap.h
*/
/* see pmap.h */
/*
* pmap_pv_protect: change the protection of all recorded mappings
* of an unmanaged pv-tracked page
*
* => NOTE: this is an inline function in pmap.h
*/
/* see pmap.h */
/*
* pmap_protect: set the protection of the pages in a pmap
*
* => NOTE: this is an inline function in pmap.h
*/
/* see pmap.h */
/*
* pmap_write_protect: write-protect pages in a pmap.
*
* Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
* don't need to remove this bit when re-entering the PTEs here: Xen tracks the
* kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
* present the page will still be considered as a kernel page, and the privilege
* separation will be enforced correctly.
*/
void
pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
pt_entry_t bit_rem, bit_put;
pt_entry_t *ptes;
pd_entry_t * const *pdes;
struct pmap *pmap2;
vaddr_t blockend, va;
int lvl, i;
if (__predict_false(pmap->pm_write_protect != NULL)) {
(*pmap->pm_write_protect)(pmap, sva, eva, prot);
return;
}
bit_rem = 0;
if (!(prot & VM_PROT_WRITE))
bit_rem = PTE_W;
bit_put = 0;
if (!(prot & VM_PROT_EXECUTE))
bit_put = pmap_pg_nx;
sva &= ~PAGE_MASK;
eva &= ~PAGE_MASK;
/*
* Acquire pmap. No need to lock the kernel pmap as we won't
* be touching PV entries nor stats and kernel PDEs aren't
* freed.
*/
if (pmap != pmap_kernel()) {
mutex_enter(&pmap->pm_lock);
}
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
for (va = sva ; va < eva; va = blockend) {
pt_entry_t *spte, *epte;
blockend = x86_round_pdr(va + 1);
if (blockend > eva)
blockend = eva;
/* Is it a valid block? */
if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
continue;
}
KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
KASSERT(lvl == 1);
spte = &ptes[pl1_i(va)];
epte = &ptes[pl1_i(blockend)];
for (i = 0; spte < epte; spte++, i++) {
pt_entry_t opte, npte;
do {
opte = *spte;
if (!pmap_valid_entry(opte)) {
goto next;
}
npte = (opte & ~bit_rem) | bit_put;
} while (pmap_pte_cas(spte, opte, npte) != opte);
if ((opte & PTE_D) != 0) {
vaddr_t tva = va + x86_ptob(i);
pmap_tlb_shootdown(pmap, tva, opte,
TLBSHOOT_WRITE_PROTECT);
}
next:;
}
}
/* Release pmap. */
pmap_unmap_ptes(pmap, pmap2);
if (pmap != pmap_kernel()) {
mutex_exit(&pmap->pm_lock);
}
}
/*
* pmap_unwire: clear the wired bit in the PTE.
*
* => Mapping should already be present.
*/
void
pmap_unwire(struct pmap *pmap, vaddr_t va)
{
pt_entry_t *ptes, *ptep, opte;
pd_entry_t * const *pdes;
struct pmap *pmap2;
int lvl;
if (__predict_false(pmap->pm_unwire != NULL)) {
(*pmap->pm_unwire)(pmap, va);
return;
}
/*
* Acquire pmap. Need to lock the kernel pmap only to protect the
* statistics.
*/
mutex_enter(&pmap->pm_lock);
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
}
KASSERT(lvl == 1);
ptep = &ptes[pl1_i(va)];
opte = *ptep;
KASSERT(pmap_valid_entry(opte));
if (opte & PTE_WIRED) {
pt_entry_t npte = opte & ~PTE_WIRED;
opte = pmap_pte_testset(ptep, npte);
pmap_stats_update_bypte(pmap, npte, opte);
} else {
printf("%s: wiring for pmap %p va %#" PRIxVADDR
" did not change!\n", __func__, pmap, va);
}
/* Release pmap. */
pmap_unmap_ptes(pmap, pmap2);
mutex_exit(&pmap->pm_lock);
}
/*
* pmap_copy: copy mappings from one pmap to another
*
* => optional function
* void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
*/
/*
* defined as macro in pmap.h
*/
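/*
 * pmap_enter_default: default pmap_enter() implementation; dispatches to
 * the pmap-specific pm_enter callback if one is installed, otherwise
 * calls pmap_enter_ma() with ma == pa
 */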
__strict_weak_alias(pmap_enter, pmap_enter_default);
int
pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
u_int flags)
{
if (__predict_false(pmap->pm_enter != NULL)) {
return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
}
return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
}
/*
* pmap_enter: enter a mapping into a pmap
*
* => must be done "now" ... no lazy-evaluation
*/
int
pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
vm_prot_t prot, u_int flags, int domid)
{
pt_entry_t *ptes, opte, npte;
pt_entry_t *ptep;
pd_entry_t * const *pdes;
struct vm_page *ptp;
struct vm_page *new_pg, *old_pg;
struct pmap_page *new_pp, *old_pp;
struct pv_entry *old_pve, *new_pve;
bool wired = (flags & PMAP_WIRED) != 0;
struct pmap *pmap2;
struct pmap_ptparray pt;
int error;
bool getptp, samepage, new_embedded;
rb_tree_t *tree;
KASSERT(pmap_initialized);
KASSERT(va < VM_MAX_KERNEL_ADDRESS);
KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
PRIxVADDR " over PDP!", __func__, va);
KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
"%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
#ifdef XENPV
KASSERT(domid == DOMID_SELF || pa == 0);
#endif
npte = ma | protection_codes[prot] | PTE_P;
npte |= pmap_pat_flags(flags);
if (wired)
npte |= PTE_WIRED;
if (va < VM_MAXUSER_ADDRESS) {
KASSERTMSG(pmap != pmap_kernel(),
"entering user va %#"PRIxVADDR" into kernel pmap",
va);
if (pmap_is_user(pmap))
npte |= PTE_U;
}
if (pmap == pmap_kernel())
npte |= pmap_pg_g;
if (flags & VM_PROT_ALL) {
npte |= PTE_A;
if (flags & VM_PROT_WRITE) {
KASSERT((npte & PTE_W) != 0);
npte |= PTE_D;
}
}
#ifdef XENPV
if (domid != DOMID_SELF)
new_pg = NULL;
else
#endif
new_pg = PHYS_TO_VM_PAGE(pa);
if (new_pg != NULL) {
/* This is a managed page */
npte |= PTE_PVLIST;
new_pp = VM_PAGE_TO_PP(new_pg);
PMAP_CHECK_PP(new_pp);
} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
/* This is an unmanaged pv-tracked page */
npte |= PTE_PVLIST;
PMAP_CHECK_PP(new_pp);
} else {
new_pp = NULL;
}
/* Begin by locking the pmap. */
mutex_enter(&pmap->pm_lock);
/* Look up the PTP. Allocate if none present. */
ptp = NULL;
getptp = false;
if (pmap != pmap_kernel()) {
ptp = pmap_find_ptp(pmap, va, 1);
if (ptp == NULL) {
getptp = true;
error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
if (error != 0) {
if (flags & PMAP_CANFAIL) {
mutex_exit(&pmap->pm_lock);
return error;
}
panic("%s: get ptp failed, error=%d", __func__,
error);
}
}
tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
} else {
/* Embedded PV entries rely on this. */
KASSERT(va != 0);
tree = &pmap_kernel_rb;
}
/*
* Look up the old PV entry at this VA (if any), and insert a new PV
* entry if required for the new mapping. Temporarily track the old
* and new mappings concurrently. Only after the old mapping is
* evicted from the pmap will we remove its PV entry. Otherwise,
* our picture of modified/accessed state for either page could get
* out of sync (we need any P->V operation for either page to stall
* on pmap->pm_lock until done here).
*/
new_pve = NULL;
old_pve = NULL;
samepage = false;
new_embedded = false;
if (new_pp != NULL) {
error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
&old_pve, &samepage, &new_embedded, tree);
/*
* If a new pv_entry was needed and none was available, we
* can go no further.
*/
if (error != 0) {
if (flags & PMAP_CANFAIL) {
if (getptp) {
pmap_unget_ptp(pmap, &pt);
}
mutex_exit(&pmap->pm_lock);
return error;
}
panic("%s: alloc pve failed", __func__);
}
} else {
old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
}
/* Map PTEs into address space. */
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
/* Install any newly allocated PTPs. */
if (getptp) {
pmap_install_ptp(pmap, &pt, va, pdes);
}
/* Check if there is an existing mapping. */
ptep = &ptes[pl1_i(va)];
opte = *ptep;
bool have_oldpa = pmap_valid_entry(opte);
paddr_t oldpa = pmap_pte2pa(opte);
/*
* Update the pte.
*/
do {
opte = *ptep;
/*
* if the same page, inherit PTE_A and PTE_D.
*/
if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
npte |= opte & (PTE_A | PTE_D);
}
#if defined(XENPV)
if (domid != DOMID_SELF) {
/* pmap_pte_cas with error handling */
int s = splvm();
if (opte != *ptep) {
splx(s);
continue;
}
error = xpq_update_foreign(
vtomach((vaddr_t)ptep), npte, domid, flags);
splx(s);
if (error) {
/* Undo pv_entry tracking - oof. */
if (new_pp != NULL) {
mutex_spin_enter(&new_pp->pp_lock);
if (new_pve != NULL) {
LIST_REMOVE(new_pve, pve_list);
KASSERT(pmap->pm_pve == NULL);
pmap->pm_pve = new_pve;
} else if (new_embedded) {
new_pp->pp_pte.pte_ptp = NULL;
new_pp->pp_pte.pte_va = 0;
}
mutex_spin_exit(&new_pp->pp_lock);
}
pmap_unmap_ptes(pmap, pmap2);
/* Free new PTP. */
if (ptp != NULL && ptp->wire_count <= 1) {
pmap_free_ptp(pmap, ptp, va, ptes,
pdes);
}
mutex_exit(&pmap->pm_lock);
return error;
}
break;
}
#endif /* defined(XENPV) */
} while (pmap_pte_cas(ptep, opte, npte) != opte);
/*
* Done with the PTEs: they can now be unmapped.
*/
pmap_unmap_ptes(pmap, pmap2);
/*
* Update statistics and PTP's reference count.
*/
pmap_stats_update_bypte(pmap, npte, opte);
if (ptp != NULL) {
if (!have_oldpa) {
ptp->wire_count++;
}
/* Remember minimum VA in PTP. */
pmap_ptp_range_set(ptp, va);
}
KASSERT(ptp == NULL || ptp->wire_count > 1);
/*
* If the same page, we can skip pv_entry handling.
*/
if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
if ((npte & PTE_PVLIST) != 0) {
KASSERT(samepage);
pmap_check_pv(pmap, ptp, new_pp, va, true);
}
goto same_pa;
} else if ((npte & PTE_PVLIST) != 0) {
KASSERT(!samepage);
}
/*
* If old page is pv-tracked, remove pv_entry from its list.
*/
if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
old_pp = VM_PAGE_TO_PP(old_pg);
} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
panic("%s: PTE_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR
" pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
__func__, va, oldpa, atop(pa));
}
pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
pmap_pte_to_pp_attrs(opte));
} else {
KASSERT(old_pve == NULL);
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
}
/*
* If new page is dynamically PV tracked, insert to tree.
*/
if (new_pve != NULL) {
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
old_pve = rb_tree_insert_node(tree, new_pve);
KASSERT(old_pve == new_pve);
pmap_check_pv(pmap, ptp, new_pp, va, true);
}
same_pa:
/*
* shootdown tlb if necessary.
*/
if ((~opte & (PTE_P | PTE_A)) == 0 &&
((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
}
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
return 0;
}
#if defined(XEN) && defined(DOM0OPS)
struct pmap_data_gnt {
SLIST_ENTRY(pmap_data_gnt) pd_gnt_list;
vaddr_t pd_gnt_sva;
vaddr_t pd_gnt_eva; /* range covered by this gnt */
int pd_gnt_refs; /* ref counter */
struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
};
SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
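/*
 * pmap_find_gnt: look up the grant descriptor covering [sva, eva), or
 * return NULL if there is none
 *
 * => caller must hold pmap's lock
 */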
static struct pmap_data_gnt *
pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
struct pmap_data_gnt_head *headp;
struct pmap_data_gnt *pgnt;
KASSERT(mutex_owned(&pmap->pm_lock));
headp = pmap->pm_data;
KASSERT(headp != NULL);
SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
return pgnt;
/* check that we're not overlapping part of a region */
KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
}
return NULL;
}
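/*
 * pmap_alloc_gnt: allocate and record a grant descriptor for 'nentries'
 * pages starting at sva, copying the supplied grant table ops; on first
 * use, hooks pmap_remove_gnt() in as the pmap's remove callback
 *
 * => caller must hold pmap's lock
 */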
static void
pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
const struct gnttab_map_grant_ref *ops)
{
struct pmap_data_gnt_head *headp;
struct pmap_data_gnt *pgnt;
vaddr_t eva = sva + nentries * PAGE_SIZE;
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(nentries >= 1);
if (pmap->pm_remove == NULL) {
pmap->pm_remove = pmap_remove_gnt;
KASSERT(pmap->pm_data == NULL);
headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
SLIST_INIT(headp);
pmap->pm_data = headp;
} else {
KASSERT(pmap->pm_remove == pmap_remove_gnt);
KASSERT(pmap->pm_data != NULL);
headp = pmap->pm_data;
}
pgnt = pmap_find_gnt(pmap, sva, eva);
if (pgnt != NULL) {
KASSERT(pgnt->pd_gnt_sva == sva);
KASSERT(pgnt->pd_gnt_eva == eva);
return;
}
/* new entry */
pgnt = kmem_alloc(sizeof(*pgnt) +
(nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
pgnt->pd_gnt_sva = sva;
pgnt->pd_gnt_eva = eva;
pgnt->pd_gnt_refs = 0;
memcpy(pgnt->pd_gnt_ops, ops,
sizeof(struct gnttab_map_grant_ref) * nentries);
SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
}
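/*
 * pmap_free_gnt: release a grant descriptor whose reference count has
 * dropped to zero; tears down the per-pmap grant state when the list
 * becomes empty
 *
 * => caller must hold pmap's lock
 */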
static void
pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
{
struct pmap_data_gnt_head *headp = pmap->pm_data;
int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
KASSERT(nentries >= 1);
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(pgnt->pd_gnt_refs == 0);
SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
kmem_free(pgnt, sizeof(*pgnt) +
(nentries - 1) * sizeof(struct gnttab_map_grant_ref));
if (SLIST_EMPTY(headp)) {
kmem_free(headp, sizeof(*headp));
pmap->pm_data = NULL;
pmap->pm_remove = NULL;
}
}
/*
* pmap_enter_gnt: enter a grant entry into a pmap
*
* => must be done "now" ... no lazy-evaluation
*/
int
pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
const struct gnttab_map_grant_ref *oops)
{
struct pmap_data_gnt *pgnt;
pt_entry_t *ptes, opte;
#ifndef XENPV
pt_entry_t npte;
#endif
pt_entry_t *ptep;
pd_entry_t * const *pdes;
struct vm_page *ptp;
struct vm_page *old_pg;
struct pmap_page *old_pp;
struct pv_entry *old_pve;
struct pmap *pmap2;
struct pmap_ptparray pt;
int error;
bool getptp;
rb_tree_t *tree;
struct gnttab_map_grant_ref *op;
int ret;
int idx;
KASSERT(pmap_initialized);
KASSERT(va < VM_MAX_KERNEL_ADDRESS);
KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
PRIxVADDR " over PDP!", __func__, va);
KASSERT(pmap != pmap_kernel());
/* Begin by locking the pmap. */
mutex_enter(&pmap->pm_lock);
pmap_alloc_gnt(pmap, sva, nentries, oops);
pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
KASSERT(pgnt != NULL);
/* Look up the PTP. Allocate if none present. */
ptp = NULL;
getptp = false;
ptp = pmap_find_ptp(pmap, va, 1);
if (ptp == NULL) {
getptp = true;
error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
if (error != 0) {
mutex_exit(&pmap->pm_lock);
return error;
}
}
tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
/*
* Look up the old PV entry at this VA (if any), and insert a new PV
* entry if required for the new mapping. Temporarily track the old
* and new mappings concurrently. Only after the old mapping is
* evicted from the pmap will we remove its PV entry. Otherwise,
* our picture of modified/accessed state for either page could get
* out of sync (we need any P->V operation for either page to stall
* on pmap->pm_lock until done here).
*/
old_pve = NULL;
old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
/* Map PTEs into address space. */
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
/* Install any newly allocated PTPs. */
if (getptp) {
pmap_install_ptp(pmap, &pt, va, pdes);
}
/* Check if there is an existing mapping. */
ptep = &ptes[pl1_i(va)];
opte = *ptep;
bool have_oldpa = pmap_valid_entry(opte);
paddr_t oldpa = pmap_pte2pa(opte);
/*
* Update the pte.
*/
idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
op = &pgnt->pd_gnt_ops[idx];
#ifdef XENPV
KASSERT(op->flags & GNTMAP_contains_pte);
op->host_addr = xpmap_ptetomach(ptep);
#else
KASSERT((op->flags & GNTMAP_contains_pte) == 0);
KASSERT(op->flags != 0);
KASSERT(op->host_addr != 0);
#endif
op->dev_bus_addr = 0;
op->status = GNTST_general_error;
ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
if (__predict_false(ret)) {
printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
__func__, ret);
op->status = GNTST_general_error;
}
for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
kpause("gntmap", false, mstohz(1), NULL);
ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
if (__predict_false(ret)) {
printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
__func__, ret);
op->status = GNTST_general_error;
}
}
if (__predict_false(op->status != GNTST_okay)) {
printf("%s: GNTTABOP_map_grant_ref status: %d\n",
__func__, op->status);
if (have_oldpa) { /* XXX did the pte really change if XENPV ?*/
ptp->wire_count--;
}
} else {
#ifndef XENPV
npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P;
if ((op->flags & GNTMAP_readonly) == 0)
npte |= PTE_W;
do {
opte = *ptep;
} while (pmap_pte_cas(ptep, opte, npte) != opte);
#endif
pgnt->pd_gnt_refs++;
if (!have_oldpa) {
ptp->wire_count++;
}
KASSERT(ptp->wire_count > 1);
/* Remember minimum VA in PTP. */
pmap_ptp_range_set(ptp, va);
}
if (ptp->wire_count <= 1)
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
/*
* Done with the PTEs: they can now be unmapped.
*/
pmap_unmap_ptes(pmap, pmap2);
/*
* Update statistics and PTP's reference count.
*/
pmap_stats_update_bypte(pmap, 0, opte);
/*
* If old page is pv-tracked, remove pv_entry from its list.
*/
if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
old_pp = VM_PAGE_TO_PP(old_pg);
} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
panic("%s: PTE_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
__func__, va, oldpa);
}
pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
pmap_pte_to_pp_attrs(opte));
} else {
KASSERT(old_pve == NULL);
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
}
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
return op->status;
}
/*
* pmap_remove_gnt: grant mapping removal function.
*
* => caller should not be holding any pmap locks
*/
static void
pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
struct pmap_data_gnt *pgnt;
pt_entry_t *ptes;
pd_entry_t pde;
pd_entry_t * const *pdes;
struct vm_page *ptp;
struct pmap *pmap2;
vaddr_t va;
int lvl;
int idx;
struct gnttab_map_grant_ref *op;
struct gnttab_unmap_grant_ref unmap_op;
int ret;
KASSERT(pmap != pmap_kernel());
KASSERT(pmap->pm_remove == pmap_remove_gnt);
mutex_enter(&pmap->pm_lock);
for (va = sva; va < eva; va += PAGE_SIZE) {
pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
if (pgnt == NULL) {
pmap_remove_locked(pmap, sva, eva);
continue;
}
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
panic("pmap_remove_gnt pdes not valid");
}
idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
op = &pgnt->pd_gnt_ops[idx];
KASSERT(lvl == 1);
/* Get PTP if non-kernel mapping. */
ptp = pmap_find_ptp(pmap, va, 1);
KASSERTMSG(ptp != NULL,
"%s: unmanaged PTP detected", __func__);
if (op->status == GNTST_okay) {
KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
#ifdef XENPV
unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
#else
unmap_op.host_addr = op->host_addr;
pmap_pte_testset(&ptes[pl1_i(va)], 0);
#endif
unmap_op.handle = op->handle;
unmap_op.dev_bus_addr = 0;
ret = HYPERVISOR_grant_table_op(
GNTTABOP_unmap_grant_ref, &unmap_op, 1);
if (ret) {
printf("%s: GNTTABOP_unmap_grant_ref "
"failed: %d\n", __func__, ret);
}
ptp->wire_count--;
pgnt->pd_gnt_refs--;
}
if (pgnt->pd_gnt_refs == 0) {
pmap_free_gnt(pmap, pgnt);
}
/*
* if mapping removed and the PTP is no longer
* being used, free it!
*/
if (ptp->wire_count <= 1)
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
pmap_unmap_ptes(pmap, pmap2);
}
mutex_exit(&pmap->pm_lock);
}
#endif /* XEN && DOM0OPS */
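/*
 * pmap_get_physpage: allocate a zeroed physical page for use as a PTP
 * when growing the kernel page tree
 *
 * => before uvm is fully initialized, the page is taken directly from
 *    the physical memory freelist and zeroed by hand
 */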
paddr_t
pmap_get_physpage(void)
{
struct vm_page *ptp;
struct pmap *kpm = pmap_kernel();
paddr_t pa;
if (!uvm.page_init_done) {
/*
* We're growing the kernel pmap early (from
* uvm_pageboot_alloc()). This case must be
* handled a little differently.
*/
if (!uvm_page_physget(&pa))
panic("%s: out of memory", __func__);
#if defined(__HAVE_DIRECT_MAP)
memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
#else
#if defined(XENPV)
if (XEN_VERSION_SUPPORTED(3, 4)) {
xen_pagezero(pa);
return pa;
}
#endif
kpreempt_disable();
pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
PTE_W | pmap_pg_nx);
pmap_pte_flush();
pmap_update_pg((vaddr_t)early_zerop);
memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE);
#if defined(DIAGNOSTIC) || defined(XENPV)
pmap_pte_set(early_zero_pte, 0);
pmap_pte_flush();
#endif /* defined(DIAGNOSTIC) || defined(XENPV) */
kpreempt_enable();
#endif /* defined(__HAVE_DIRECT_MAP) */
} else {
/* XXX */
ptp = uvm_pagealloc(NULL, 0, NULL,
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
if (ptp == NULL)
panic("%s: out of memory", __func__);
ptp->flags &= ~PG_BUSY;
ptp->wire_count = 1;
pa = VM_PAGE_TO_PHYS(ptp);
}
pmap_stats_update(kpm, 1, 0);
return pa;
}
/*
* Expand the page tree with the specified amount of PTPs, mapping virtual
* addresses starting at kva. We populate all the levels but the last one
* (L1). The nodes of the tree are created as RW, but the pages covered
* will be kentered in L1, with proper permissions.
*
* Used only by pmap_growkernel.
*/
static void
pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
{
unsigned long i;
paddr_t pa;
unsigned long index, endindex;
int level;
pd_entry_t *pdep;
#ifdef XENPV
int s = splvm(); /* protect xpq_* */
#endif
for (level = PTP_LEVELS; level > 1; level--) {
if (level == PTP_LEVELS)
pdep = cpm->pm_pdir;
else
pdep = normal_pdes[level - 2];
index = pl_i_roundup(kva, level);
endindex = index + needed_ptps[level - 1] - 1;
for (i = index; i <= endindex; i++) {
pt_entry_t pte;
KASSERT(!pmap_valid_entry(pdep[i]));
pa = pmap_get_physpage();
pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
#ifdef __x86_64__
pte |= pmap_pg_nx;
#endif
pmap_pte_set(&pdep[i], pte);
#ifdef XENPV
if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
if (__predict_true(
cpu_info_primary.ci_flags & CPUF_PRESENT)) {
/* update per-cpu PMDs on all cpus */
xen_kpm_sync(pmap_kernel(), i);
} else {
/*
* too early; update primary CPU
* PMD only (without locks)
*/
#ifdef __x86_64__
pd_entry_t *cpu_pdep =
&cpu_info_primary.ci_kpm_pdir[i];
#else
pd_entry_t *cpu_pdep =
&cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
#endif
pmap_pte_set(cpu_pdep, pte);
}
}
#endif
KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
nkptp[level - 1]++;
}
pmap_pte_flush();
}
#ifdef XENPV
splx(s);
#endif
}
/*
* pmap_growkernel: increase usage of KVM space.
*
* => we allocate new PTPs for the kernel and install them in all
* the pmaps on the system.
*/
vaddr_t
pmap_growkernel(vaddr_t maxkvaddr)
{
struct pmap *kpm = pmap_kernel();
struct pmap *cpm;
#if !defined(XENPV) || !defined(__x86_64__)
struct pmap *pm;
long old;
#endif
int s, i;
long needed_kptp[PTP_LEVELS], target_nptp;
bool invalidate = false;
s = splvm(); /* to be safe */
mutex_enter(&kpm->pm_lock);
if (maxkvaddr <= pmap_maxkvaddr) {
mutex_exit(&kpm->pm_lock);
splx(s);
return pmap_maxkvaddr;
}
maxkvaddr = x86_round_pdr(maxkvaddr);
#if !defined(XENPV) || !defined(__x86_64__)
old = nkptp[PTP_LEVELS - 1];
#endif
/* Initialize needed_kptp. */
for (i = PTP_LEVELS - 1; i >= 1; i--) {
target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
if (target_nptp > nkptpmax[i])
panic("out of KVA space");
KASSERT(target_nptp >= nkptp[i]);
needed_kptp[i] = target_nptp - nkptp[i];
}
#ifdef XENPV
/* only pmap_kernel(), or the per-cpu map, has kernel entries */
cpm = kpm;
#else
/* Get the current pmap */
if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
cpm = curcpu()->ci_pmap;
} else {
cpm = kpm;
}
#endif
kasan_shadow_map((void *)pmap_maxkvaddr,
(size_t)(maxkvaddr - pmap_maxkvaddr));
kmsan_shadow_map((void *)pmap_maxkvaddr,
(size_t)(maxkvaddr - pmap_maxkvaddr));
pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
/*
* If the number of top level entries changed, update all pmaps.
*/
if (needed_kptp[PTP_LEVELS - 1] != 0) {
#ifdef XENPV
#ifdef __x86_64__
/* nothing, kernel entries are never entered in user pmap */
#else
int pdkidx;
mutex_enter(&pmaps_lock);
LIST_FOREACH(pm, &pmaps, pm_list) {
for (pdkidx = PDIR_SLOT_KERN + old;
pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
pdkidx++) {
pmap_pte_set(&pm->pm_pdir[pdkidx],
kpm->pm_pdir[pdkidx]);
}
pmap_pte_flush();
}
mutex_exit(&pmaps_lock);
#endif /* __x86_64__ */
#else /* XENPV */
size_t newpdes;
newpdes = nkptp[PTP_LEVELS - 1] - old;
if (cpm != kpm) {
memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
&cpm->pm_pdir[PDIR_SLOT_KERN + old],
newpdes * sizeof(pd_entry_t));
}
mutex_enter(&pmaps_lock);
LIST_FOREACH(pm, &pmaps, pm_list) {
if (__predict_false(pm->pm_enter != NULL)) {
/*
* Not a native pmap, the kernel is not mapped,
* so nothing to synchronize.
*/
continue;
}
memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
&kpm->pm_pdir[PDIR_SLOT_KERN + old],
newpdes * sizeof(pd_entry_t));
}
mutex_exit(&pmaps_lock);
#endif
invalidate = true;
}
pmap_maxkvaddr = maxkvaddr;
mutex_exit(&kpm->pm_lock);
splx(s);
if (invalidate && pmap_initialized) {
/* Invalidate the pmap cache. */
pool_cache_invalidate(&pmap_cache);
}
return maxkvaddr;
}
#ifdef DEBUG
void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
/*
* pmap_dump: dump all the mappings from a pmap
*
* => caller should not be holding any pmap locks
*/
void
pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
pt_entry_t *ptes, *pte;
pd_entry_t * const *pdes;
struct pmap *pmap2;
vaddr_t blkendva;
int lvl;
/*
* if end is out of range, truncate it.
* if end <= start, update end to the max.
*/
if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
eva = VM_MAXUSER_ADDRESS;
mutex_enter(&pmap->pm_lock);
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
/*
* dumping a range of pages: we dump in PTP sized blocks (4MB)
*/
for (/* null */ ; sva < eva ; sva = blkendva) {
/* determine range of block */
blkendva = x86_round_pdr(sva+1);
if (blkendva > eva)
blkendva = eva;
/* valid block? */
if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
continue;
KASSERT(lvl == 1);
pte = &ptes[pl1_i(sva)];
for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
if (!pmap_valid_entry(*pte))
continue;
printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
" (pte=%#" PRIxPADDR ")\n",
sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
}
}
pmap_unmap_ptes(pmap, pmap2);
mutex_exit(&pmap->pm_lock);
}
#endif
/*
* pmap_update: process deferred invalidations and frees.
*/
void
pmap_update(struct pmap *pmap)
{
struct pmap_page *pp;
struct vm_page *ptp;
/*
* Initiate any pending TLB shootdowns. Wait for them to
* complete before returning control to the caller.
*/
kpreempt_disable();
pmap_tlb_shootnow();
kpreempt_enable();
/*
* Now that shootdowns are complete, process deferred frees. This
* is an unlocked check, but is safe as we're only interested in
* work done in this LWP - we won't get a false negative.
*/
if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
return;
}
mutex_enter(&pmap->pm_lock);
while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
KASSERT(ptp->wire_count == 0);
KASSERT(ptp->uanon == NULL);
LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
pp = VM_PAGE_TO_PP(ptp);
LIST_INIT(&pp->pp_pvlist);
pp->pp_attrs = 0;
pp->pp_pte.pte_ptp = NULL;
pp->pp_pte.pte_va = 0;
PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
/*
* XXX Hack to avoid extra locking, and lock
* assertions in uvm_pagefree(). Despite uobject
* being set, this isn't a managed page.
*/
PMAP_DUMMY_LOCK(pmap);
uvm_pagerealloc(ptp, NULL, 0);
PMAP_DUMMY_UNLOCK(pmap);
uvm_pagefree(ptp);
}
mutex_exit(&pmap->pm_lock);
}
#if PTP_LEVELS > 4
#error "Unsupported number of page table mappings"
#endif
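/*
 * pmap_init_tmp_pgtbl: build a small temporary page table hierarchy at
 * fixed low physical addresses (the "real mode PML"), mapping the single
 * page 'pg' and copying the kernel's top-level entries; returns the
 * physical address of the resulting top-level table (on PAE, of the L3
 * entries embedded at the end of the L2 page)
 */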
paddr_t
pmap_init_tmp_pgtbl(paddr_t pg)
{
static bool maps_loaded;
static const paddr_t x86_tmp_pml_paddr[] = {
4 * PAGE_SIZE, /* L1 */
5 * PAGE_SIZE, /* L2 */
6 * PAGE_SIZE, /* L3 */
7 * PAGE_SIZE /* L4 */
};
static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
pd_entry_t *tmp_pml, *kernel_pml;
int level;
if (!maps_loaded) {
for (level = 0; level < PTP_LEVELS; ++level) {
x86_tmp_pml_vaddr[level] =
uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
UVM_KMF_VAONLY);
if (x86_tmp_pml_vaddr[level] == 0)
panic("mapping of real mode PML failed\n");
pmap_kenter_pa(x86_tmp_pml_vaddr[level],
x86_tmp_pml_paddr[level],
VM_PROT_READ | VM_PROT_WRITE, 0);
}
pmap_update(pmap_kernel());
maps_loaded = true;
}
/* Zero levels 1-3 */
for (level = 0; level < PTP_LEVELS - 1; ++level) {
tmp_pml = (void *)x86_tmp_pml_vaddr[level];
memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE);
}
/* Copy PML4 */
kernel_pml = pmap_kernel()->pm_pdir;
tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE);
#ifdef PAE
/*
* Use the last 4 entries of the L2 page as L3 PD entries. These
* last entries are unlikely to be used for temporary mappings.
* 508: maps 0->1GB (userland)
* 509: unused
* 510: unused
* 511: maps 3->4GB (kernel)
*/
tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
tmp_pml[509] = 0;
tmp_pml[510] = 0;
tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
#endif
for (level = PTP_LEVELS - 1; level > 0; --level) {
tmp_pml = (void *)x86_tmp_pml_vaddr[level];
tmp_pml[pl_i(pg, level + 1)] =
(x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
}
tmp_pml = (void *)x86_tmp_pml_vaddr[0];
tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
#ifdef PAE
/* Return the PA of the L3 page (entry 508 of the L2 page) */
return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
#endif
return x86_tmp_pml_paddr[PTP_LEVELS - 1];
}
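/*
 * x86_mmap_flags: translate device mmap flags encoded in the page number
 * into pmap flags; currently only X86_MMAP_FLAG_PREFETCH, which maps to
 * PMAP_WRITE_COMBINE
 */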
u_int
x86_mmap_flags(paddr_t mdpgno)
{
u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
u_int pflag = 0;
if (nflag & X86_MMAP_FLAG_PREFETCH)
pflag |= PMAP_WRITE_COMBINE;
return pflag;
}
#if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
/*
* -----------------------------------------------------------------------------
* *****************************************************************************
* *****************************************************************************
* *****************************************************************************
* *****************************************************************************
* **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
* *****************************************************************************
* *****************************************************************************
* *****************************************************************************
* *****************************************************************************
* -----------------------------------------------------------------------------
*
* These functions are invoked as callbacks from the code above. Contrary to
* native, EPT does not have a recursive slot; therefore, it is not possible
* to call pmap_map_ptes(). Instead, we use the direct map and walk down the
* tree manually.
*
* Apart from that, the logic is mostly the same as native. Once a pmap has
* been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
* After that we're good, and the callbacks will handle the translations
* for us.
*
* -----------------------------------------------------------------------------
*/
/* Hardware bits. */
#define EPT_R __BIT(0) /* read */
#define EPT_W __BIT(1) /* write */
#define EPT_X __BIT(2) /* execute */
#define EPT_T __BITS(5,3) /* type */
#define TYPE_UC 0
#define TYPE_WC 1
#define TYPE_WT 4
#define TYPE_WP 5
#define TYPE_WB 6
#define EPT_NOPAT __BIT(6)
#define EPT_L __BIT(7) /* large */
#define EPT_A __BIT(8) /* accessed */
#define EPT_D __BIT(9) /* dirty */
/* Software bits. */
#define EPT_PVLIST __BIT(60)
#define EPT_WIRED __BIT(61)
#define pmap_ept_valid_entry(pte) (pte & EPT_R)
bool pmap_ept_has_ad __read_mostly;
static inline void
pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
{
int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
pmap_stats_update(pmap, resid_diff, wired_diff);
}
static pt_entry_t
pmap_ept_type(u_int flags)
{
u_int cacheflags = (flags & PMAP_CACHE_MASK);
pt_entry_t ret;
switch (cacheflags) {
case PMAP_NOCACHE:
case PMAP_NOCACHE_OVR:
ret = __SHIFTIN(TYPE_UC, EPT_T);
break;
case PMAP_WRITE_COMBINE:
ret = __SHIFTIN(TYPE_WC, EPT_T);
break;
case PMAP_WRITE_BACK:
default:
ret = __SHIFTIN(TYPE_WB, EPT_T);
break;
}
ret |= EPT_NOPAT;
return ret;
}
static inline pt_entry_t
pmap_ept_prot(vm_prot_t prot)
{
pt_entry_t res = 0;
if (prot & VM_PROT_READ)
res |= EPT_R;
if (prot & VM_PROT_WRITE)
res |= EPT_W;
if (prot & VM_PROT_EXECUTE)
res |= EPT_X;
return res;
}
static inline uint8_t
pmap_ept_to_pp_attrs(pt_entry_t ept)
{
uint8_t ret = 0;
if (pmap_ept_has_ad) {
if (ept & EPT_D)
ret |= PP_ATTRS_D;
if (ept & EPT_A)
ret |= PP_ATTRS_A;
} else {
ret |= (PP_ATTRS_D|PP_ATTRS_A);
}
if (ept & EPT_W)
ret |= PP_ATTRS_W;
return ret;
}
static inline pt_entry_t
pmap_pp_attrs_to_ept(uint8_t attrs)
{
pt_entry_t ept = 0;
if (attrs & PP_ATTRS_D)
ept |= EPT_D;
if (attrs & PP_ATTRS_A)
ept |= EPT_A;
if (attrs & PP_ATTRS_W)
ept |= EPT_W;
return ept;
}
/*
* Helper for pmap_ept_free_ptp.
* tree[0] = &L2[L2idx]
* tree[1] = &L3[L3idx]
* tree[2] = &L4[L4idx]
*/
static void
pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
{
pt_entry_t *pteva;
paddr_t ptepa;
int i, index;
ptepa = pmap->pm_pdirpa[0];
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_pi(va, i);
pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
KASSERT(pmap_ept_valid_entry(pteva[index]));
tree[i - 2] = &pteva[index];
ptepa = pmap_pte2pa(pteva[index]);
}
}
static void
pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
{
pd_entry_t *tree[3];
int level;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
pmap_ept_get_tree(pmap, va, tree);
level = 1;
do {
(void)pmap_pte_testset(tree[level - 1], 0);
pmap_freepage(pmap, ptp, level);
if (level < PTP_LEVELS - 1) {
ptp = pmap_find_ptp(pmap, va, level + 1);
ptp->wire_count--;
if (ptp->wire_count > 1)
break;
}
} while (++level < PTP_LEVELS);
pmap_pte_flush();
}
/* Allocate L4->L3->L2. Return L2. */
static void
pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
{
struct vm_page *ptp;
unsigned long index;
pd_entry_t *pteva;
paddr_t ptepa;
int i;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
/*
* Now that we have all the pages looked up or allocated,
* loop through again installing any new ones into the tree.
*/
ptepa = pmap->pm_pdirpa[0];
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_pi(va, i);
pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
if (pmap_ept_valid_entry(pteva[index])) {
KASSERT(!pt->alloced[i]);
ptepa = pmap_pte2pa(pteva[index]);
continue;
}
ptp = pt->pg[i];
ptp->flags &= ~PG_BUSY; /* never busy */
ptp->wire_count = 1;
pmap->pm_ptphint[i - 2] = ptp;
ptepa = VM_PAGE_TO_PHYS(ptp);
pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
pmap_pte_flush();
pmap_stats_update(pmap, 1, 0);
/*
* If we're not in the top level, increase the
* wire count of the parent page.
*/
if (i < PTP_LEVELS) {
pt->pg[i + 1]->wire_count++;
}
}
}
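/*
 * pmap_ept_enter: EPT flavour of pmap_enter_ma(); walks the page tree
 * through the direct map instead of a recursive slot
 */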
static int
pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
u_int flags)
{
pt_entry_t *ptes, opte, npte;
pt_entry_t *ptep;
struct vm_page *ptp;
struct vm_page *new_pg, *old_pg;
struct pmap_page *new_pp, *old_pp;
struct pv_entry *old_pve, *new_pve;
bool wired = (flags & PMAP_WIRED) != 0;
bool accessed;
struct pmap_ptparray pt;
int error;
bool getptp, samepage, new_embedded;
rb_tree_t *tree;
KASSERT(pmap_initialized);
KASSERT(va < VM_MAXUSER_ADDRESS);
npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
if (wired)
npte |= EPT_WIRED;
if (flags & VM_PROT_ALL) {
npte |= EPT_A;
if (flags & VM_PROT_WRITE) {
KASSERT((npte & EPT_W) != 0);
npte |= EPT_D;
}
}
new_pg = PHYS_TO_VM_PAGE(pa);
if (new_pg != NULL) {
/* This is a managed page */
npte |= EPT_PVLIST;
new_pp = VM_PAGE_TO_PP(new_pg);
} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
/* This is an unmanaged pv-tracked page */
npte |= EPT_PVLIST;
} else {
new_pp = NULL;
}
/* Begin by locking the pmap. */
mutex_enter(&pmap->pm_lock);
/* Look up the PTP. Allocate if none present. */
ptp = NULL;
getptp = false;
if (pmap != pmap_kernel()) {
ptp = pmap_find_ptp(pmap, va, 1);
if (ptp == NULL) {
getptp = true;
error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
if (error != 0) {
if (flags & PMAP_CANFAIL) {
mutex_exit(&pmap->pm_lock);
return error;
}
panic("%s: get ptp failed, error=%d", __func__,
error);
}
}
tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
} else {
/* Embedded PV entries rely on this. */
KASSERT(va != 0);
tree = &pmap_kernel_rb;
}
/*
* Look up the old PV entry at this VA (if any), and insert a new PV
* entry if required for the new mapping. Temporarily track the old
* and new mappings concurrently. Only after the old mapping is
* evicted from the pmap will we remove its PV entry. Otherwise,
* our picture of modified/accessed state for either page could get
* out of sync (we need any P->V operation for either page to stall
* on pmap->pm_lock until done here).
*/
new_pve = NULL;
old_pve = NULL;
samepage = false;
new_embedded = false;
if (new_pp != NULL) {
error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
&old_pve, &samepage, &new_embedded, tree);
/*
* If a new pv_entry was needed and none was available, we
* can go no further.
*/
if (error != 0) {
if (flags & PMAP_CANFAIL) {
if (getptp) {
pmap_unget_ptp(pmap, &pt);
}
mutex_exit(&pmap->pm_lock);
return error;
}
panic("%s: alloc pve failed", __func__);
}
} else {
old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
}
/* Map PTEs into address space. */
kpreempt_disable();
/* Install any newly allocated PTPs. */
if (getptp) {
pmap_ept_install_ptp(pmap, &pt, va);
}
/* Check if there is an existing mapping. */
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
ptep = &ptes[pl1_pi(va)];
opte = *ptep;
bool have_oldpa = pmap_ept_valid_entry(opte);
paddr_t oldpa = pmap_pte2pa(opte);
/*
* Update the pte.
*/
do {
opte = *ptep;
/*
* if the same page, inherit PTE_A and PTE_D.
*/
if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
npte |= opte & (EPT_A | EPT_D);
}
} while (pmap_pte_cas(ptep, opte, npte) != opte);
/*
* Done with the PTEs: they can now be unmapped.
*/
kpreempt_enable();
/*
* Update statistics and PTP's reference count.
*/
pmap_ept_stats_update_bypte(pmap, npte, opte);
if (ptp != NULL) {
if (!have_oldpa) {
ptp->wire_count++;
}
/* Remember minimum VA in PTP. */
pmap_ptp_range_set(ptp, va);
}
KASSERT(ptp == NULL || ptp->wire_count > 1);
/*
* If the same page, we can skip pv_entry handling.
*/
if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
if ((npte & EPT_PVLIST) != 0) {
KASSERT(samepage);
pmap_check_pv(pmap, ptp, new_pp, va, true);
}
goto same_pa;
} else if ((npte & EPT_PVLIST) != 0) {
KASSERT(!samepage);
}
/*
* If old page is pv-tracked, remove pv_entry from its list.
*/
if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
old_pp = VM_PAGE_TO_PP(old_pg);
} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
panic("%s: EPT_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR
" pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
__func__, va, oldpa, atop(pa));
}
pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
pmap_ept_to_pp_attrs(opte));
} else {
KASSERT(old_pve == NULL);
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
}
/*
* If new page is dynamically PV tracked, insert to tree.
*/
if (new_pve != NULL) {
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
old_pve = rb_tree_insert_node(tree, new_pve);
KASSERT(old_pve == new_pve);
pmap_check_pv(pmap, ptp, new_pp, va, true);
}
same_pa:
/*
* shootdown tlb if necessary.
*/
if (pmap_ept_has_ad) {
accessed = (~opte & (EPT_R | EPT_A)) == 0;
} else {
accessed = (opte & EPT_R) != 0;
}
if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
}
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
return 0;
}
/* Pay close attention, this returns L2. */
static int
pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
{
pt_entry_t *pteva;
paddr_t ptepa;
int i, index;
KASSERT(mutex_owned(&pmap->pm_lock));
ptepa = pmap->pm_pdirpa[0];
for (i = PTP_LEVELS; i > 1; i--) {
pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
index = pl_pi(va, i);
if (!pmap_ept_valid_entry(pteva[index]))
return i;
ptepa = pmap_pte2pa(pteva[index]);
}
if (lastpde != NULL) {
*lastpde = pteva[index];
}
return 0;
}
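/*
 * pmap_ept_extract: EPT flavour of pmap_extract()
 */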
static bool
pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
pt_entry_t *ptes, pte;
pd_entry_t pde;
paddr_t ptppa, pa;
bool rv;
#ifdef __HAVE_DIRECT_MAP
if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
if (pap != NULL) {
*pap = PMAP_DIRECT_UNMAP(va);
}
return true;
}
#endif
rv = false;
pa = 0;
mutex_enter(&pmap->pm_lock);
kpreempt_disable();
if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
ptppa = pmap_pte2pa(pde);
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
pte = ptes[pl1_pi(va)];
if (__predict_true((pte & EPT_R) != 0)) {
pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
rv = true;
}
}
kpreempt_enable();
mutex_exit(&pmap->pm_lock);
if (pap != NULL) {
*pap = pa;
}
return rv;
}
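/*
 * pmap_ept_remove_pte: EPT flavour of pmap_remove_pte(); returns true if
 * a mapping was removed
 */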
static bool
pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
vaddr_t va)
{
struct pv_entry *pve;
struct vm_page *pg;
struct pmap_page *pp;
pt_entry_t opte;
bool accessed;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
if (!pmap_ept_valid_entry(*pte)) {
/* VA not mapped. */
return false;
}
/* Atomically save the old PTE and zap it. */
opte = pmap_pte_testset(pte, 0);
if (!pmap_ept_valid_entry(opte)) {
return false;
}
pmap_ept_stats_update_bypte(pmap, 0, opte);
if (ptp) {
/*
* Dropping a PTE. Make sure that the PDE is flushed.
*/
ptp->wire_count--;
if (ptp->wire_count <= 1) {
opte |= EPT_A;
}
}
if (pmap_ept_has_ad) {
accessed = (opte & EPT_A) != 0;
} else {
accessed = true;
}
if (accessed) {
pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
}
/*
* If we are not on a pv list - we are done.
*/
if ((opte & EPT_PVLIST) == 0) {
KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
"managed page without EPT_PVLIST for %#"PRIxVADDR, va);
KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
"pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
return true;
}
if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
pp = VM_PAGE_TO_PP(pg);
} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
paddr_t pa = pmap_pte2pa(opte);
panic("%s: EPT_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
__func__, va, pa, atop(pa));
}
/* Sync R/M bits. */
pve = pmap_lookup_pv(pmap, ptp, pp, va);
pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
return true;
}
static void
pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
vaddr_t startva, vaddr_t endva)
{
pt_entry_t *pte = (pt_entry_t *)ptpva;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
/*
* mappings are very often sparse, so clip the given range to the
* range of PTEs that are known present in the PTP.
*/
pmap_ptp_range_clip(ptp, &startva, &pte);
/*
* note that ptpva points to the PTE that maps startva. this may
* or may not be the first PTE in the PTP.
*
* we loop through the PTP while there are still PTEs to look at
* and the wire_count is greater than 1 (because we use the wire_count
* to keep track of the number of real PTEs in the PTP).
*/
while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
(void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
startva += PAGE_SIZE;
pte++;
}
}
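/*
 * pmap_ept_remove: EPT flavour of pmap_remove()
 */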
static void
pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
pt_entry_t *ptes;
pd_entry_t pde;
paddr_t ptppa;
vaddr_t blkendva, va = sva;
struct vm_page *ptp;
mutex_enter(&pmap->pm_lock);
kpreempt_disable();
for (/* null */ ; va < eva ; va = blkendva) {
int lvl;
/* determine range of block */
blkendva = x86_round_pdr(va+1);
if (blkendva > eva)
blkendva = eva;
lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
if (lvl != 0) {
/* Skip a range corresponding to an invalid pde. */
blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
continue;
}
/* PA of the PTP */
ptppa = pmap_pte2pa(pde);
ptp = pmap_find_ptp(pmap, va, 1);
KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
__func__);
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
blkendva);
/* If PTP is no longer being used, free it. */
if (ptp && ptp->wire_count <= 1) {
pmap_ept_free_ptp(pmap, ptp, va);
}
}
kpreempt_enable();
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
}
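/*
 * pmap_ept_sync_pv: EPT flavour of pmap_sync_pv(); clears the requested
 * pte bits and returns the old attributes via 'oattrs'/'optep'
 */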
static int
pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
uint8_t *oattrs, pt_entry_t *optep)
{
struct pmap *pmap;
pt_entry_t *ptep;
pt_entry_t opte;
pt_entry_t npte;
pt_entry_t expect;
bool need_shootdown;
expect = pmap_pa2pte(pa) | EPT_R;
pmap = ptp_to_pmap(ptp);
if (clearbits != ~0) {
KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
clearbits = pmap_pp_attrs_to_ept(clearbits);
}
ptep = pmap_map_pte(pmap, ptp, va);
do {
opte = *ptep;
KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
KASSERT(opte == 0 || (opte & EPT_R) != 0);
if ((opte & (PTE_FRAME | EPT_R)) != expect) {
/*
* We lost a race with a V->P operation like
* pmap_remove(). Wait for the competitor
* reflecting pte bits into mp_attrs.
*/
pmap_unmap_pte();
return EAGAIN;
}
/*
* Check if there's anything to do on this PTE.
*/
if ((opte & clearbits) == 0) {
need_shootdown = false;
break;
}
/*
* We need a shootdown if the PTE is cached (EPT_A) ...
* ... Unless we are clearing only the EPT_W bit and
* it isn't cached as RW (EPT_D).
*/
if (pmap_ept_has_ad) {
need_shootdown = (opte & EPT_A) != 0 &&
!(clearbits == EPT_W && (opte & EPT_D) == 0);
} else {
need_shootdown = true;
}
npte = opte & ~clearbits;
/*
* If we need a shootdown anyway, clear EPT_A and EPT_D.
*/
if (need_shootdown) {
npte &= ~(EPT_A | EPT_D);
}
KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
KASSERT(npte == 0 || (opte & EPT_R) != 0);
} while (pmap_pte_cas(ptep, opte, npte) != opte);
if (need_shootdown) {
pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
}
pmap_unmap_pte();
*oattrs = pmap_ept_to_pp_attrs(opte);
if (optep != NULL)
*optep = opte;
return 0;
}
static void
pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
vaddr_t va)
{
KASSERT(mutex_owned(&pmap->pm_lock));
pmap_ept_stats_update_bypte(pmap, 0, opte);
ptp->wire_count--;
if (ptp->wire_count <= 1) {
pmap_ept_free_ptp(pmap, ptp, va);
}
}
static void
pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
pt_entry_t bit_rem;
pt_entry_t *ptes, *spte;
pt_entry_t opte, npte;
pd_entry_t pde;
paddr_t ptppa;
vaddr_t va;
bool modified;
bit_rem = 0;
if (!(prot & VM_PROT_WRITE))
bit_rem = EPT_W;
sva &= PTE_FRAME;
eva &= PTE_FRAME;
/* Acquire pmap. */
mutex_enter(&pmap->pm_lock);
kpreempt_disable();
for (va = sva; va < eva; va += PAGE_SIZE) {
if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
continue;
}
ptppa = pmap_pte2pa(pde);
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
spte = &ptes[pl1_pi(va)];
do {
opte = *spte;
if (!pmap_ept_valid_entry(opte)) {
goto next;
}
npte = (opte & ~bit_rem);
} while (pmap_pte_cas(spte, opte, npte) != opte);
if (pmap_ept_has_ad) {
modified = (opte & EPT_D) != 0;
} else {
modified = true;
}
if (modified) {
vaddr_t tva = x86_ptob(spte - ptes);
pmap_tlb_shootdown(pmap, tva, 0,
TLBSHOOT_WRITE_PROTECT);
}
next:;
}
kpreempt_enable();
mutex_exit(&pmap->pm_lock);
}
static void
pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
{
pt_entry_t *ptes, *ptep, opte;
pd_entry_t pde;
paddr_t ptppa;
/* Acquire pmap. */
mutex_enter(&pmap->pm_lock);
kpreempt_disable();
if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
}
ptppa = pmap_pte2pa(pde);
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
ptep = &ptes[pl1_pi(va)];
opte = *ptep;
KASSERT(pmap_ept_valid_entry(opte));
if (opte & EPT_WIRED) {
pt_entry_t npte = opte & ~EPT_WIRED;
opte = pmap_pte_testset(ptep, npte);
pmap_ept_stats_update_bypte(pmap, npte, opte);
} else {
printf("%s: wiring for pmap %p va %#" PRIxVADDR
"did not change!\n", __func__, pmap, va);
}
/* Release pmap. */
kpreempt_enable();
mutex_exit(&pmap->pm_lock);
}
/* -------------------------------------------------------------------------- */
void
pmap_ept_transform(struct pmap *pmap)
{
pmap->pm_enter = pmap_ept_enter;
pmap->pm_extract = pmap_ept_extract;
pmap->pm_remove = pmap_ept_remove;
pmap->pm_sync_pv = pmap_ept_sync_pv;
pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
pmap->pm_write_protect = pmap_ept_write_protect;
pmap->pm_unwire = pmap_ept_unwire;
memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
}
#endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
/* $NetBSD: proc.h,v 1.373 2023/10/04 20:52:07 ad Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2020, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)proc.h 8.15 (Berkeley) 5/19/95
*/
#ifndef _SYS_PROC_H_
#define _SYS_PROC_H_
#include <sys/lwp.h>
#if defined(_KMEMUSER) || defined(_KERNEL)
#if defined(_KERNEL_OPT)
#include "opt_multiprocessor.h"
#include "opt_kstack.h"
#include "opt_lockdebug.h"
#endif
#include <machine/proc.h> /* Machine-dependent proc substruct */
#include <machine/pcb.h>
#include <sys/aio.h>
#include <sys/idtype.h>
#include <sys/rwlock.h>
#include <sys/mqueue.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/queue.h>
#include <sys/radixtree.h>
#include <sys/signalvar.h>
#include <sys/siginfo.h>
#include <sys/event.h>
#include <sys/specificdata.h>
#ifdef _KERNEL
#include <sys/resourcevar.h>
#else
#include <sys/time.h>
#include <sys/resource.h>
#endif
/*
* One structure allocated per session.
*/
struct session {
int s_count; /* Ref cnt; pgrps in session */
u_int s_flags;
#define S_LOGIN_SET 1 /* s_login set in this session */
struct proc *s_leader; /* Session leader */
struct vnode *s_ttyvp; /* Vnode of controlling terminal */
struct tty *s_ttyp; /* Controlling terminal */
char s_login[MAXLOGNAME]; /* Setlogin() name */
pid_t s_sid; /* Session ID (pid of leader) */
};
/*
* One structure allocated per process group.
*/
struct pgrp {
LIST_HEAD(, proc) pg_members; /* Pointer to pgrp members */
struct session *pg_session; /* Pointer to session */
pid_t pg_id; /* Pgrp id */
int pg_jobc; /*
* Number of processes qualifying
* pgrp for job control
*/
};
/*
* Autoloadable syscall definition
*/
struct sc_autoload {
u_int al_code;
const char *al_module;
};
/*
* One structure allocated per emulation.
*/
struct exec_package;
struct ras;
struct kauth_cred;
struct emul {
const char *e_name; /* Symbolic name */
const char *e_path; /* Extra emulation path (NULL if none)*/
#ifndef __HAVE_MINIMAL_EMUL
int e_flags; /* Miscellaneous flags, see above */
/* Syscall handling function */
const int *e_errno; /* Errno array */
int e_nosys; /* Offset of the nosys() syscall */
int e_nsysent; /* Number of system call entries */
#endif
struct sysent *e_sysent; /* System call array */
const uint32_t *e_nomodbits; /* sys_nosys/sys_nomodule flags
* for syscall_disestablish() */
const char * const *e_syscallnames; /* System call name array */
struct sc_autoload *e_sc_autoload; /* List of autoloadable syscalls */
/* Signal sending function */
void (*e_sendsig)(const struct ksiginfo *,
const sigset_t *);
void (*e_trapsignal)(struct lwp *, struct ksiginfo *);
char *e_sigcode; /* Start of sigcode */
char *e_esigcode; /* End of sigcode */
/* Set registers before execution */
struct uvm_object **e_sigobject;/* shared sigcode object */
void (*e_setregs)(struct lwp *, struct exec_package *,
vaddr_t);
/* Per-process hooks */
void (*e_proc_exec)(struct proc *, struct exec_package *);
void (*e_proc_fork)(struct proc *, struct lwp *, int);
void (*e_proc_exit)(struct proc *);
void (*e_lwp_fork)(struct lwp *, struct lwp *);
void (*e_lwp_exit)(struct lwp *);
#ifdef __HAVE_SYSCALL_INTERN
void (*e_syscall_intern)(struct proc *);
#else
void (*e_syscall)(void);
#endif
/* Emulation specific sysctl data */
struct sysctlnode *e_sysctlovly;
vaddr_t (*e_vm_default_addr)(struct proc *, vaddr_t, vsize_t,
int);
/* Emulation-specific hook for userspace page faults */
int (*e_usertrap)(struct lwp *, vaddr_t, void *);
size_t e_ucsize; /* size of ucontext_t */
void (*e_startlwp)(void *);
/* Dtrace syscall probe */
void (*e_dtrace_syscall)(uint32_t, register_t,
const struct sysent *, const void *,
const register_t *, int);
/* Emulation specific support for ktracing signal posts */
void (*e_ktrpsig)(int, sig_t, const sigset_t *,
const struct ksiginfo *);
};
/*
* Emulation miscellaneous flags
*/
#define EMUL_HAS_SYS___syscall 0x001 /* Has SYS___syscall */
/*
* Description of a process.
*
* This structure contains the information needed to manage a thread of
* control, known in UN*X as a process; it has references to substructures
* containing descriptions of things that the process uses, but may share
* with related processes. The process structure and the substructures
* are always addressable except for those marked "(PROC ONLY)" below,
* which might be addressable only on a processor on which the process
* is running.
*
* Field markings and the corresponding locks:
*
* a: p_auxlock
* k: ktrace_mutex
* l: proc_lock
* t: p_stmutex
* p: p_lock
* (: updated atomically
* :: unlocked, stable
*/
struct vmspace;
struct proc {
LIST_ENTRY(proc) p_list; /* l: List of all processes */
kmutex_t *p_lock; /* :: general mutex */
kcondvar_t p_waitcv; /* p: wait, stop CV on children */
kcondvar_t p_lwpcv; /* p: wait, stop CV on LWPs */
/* Substructures: */
struct kauth_cred *p_cred; /* p: Master copy of credentials */
struct filedesc *p_fd; /* :: Ptr to open files structure */
struct cwdinfo *p_cwdi; /* :: cdir/rdir/cmask info */
struct pstats *p_stats; /* :: Accounting/stats (PROC ONLY) */
struct plimit *p_limit; /* :: Process limits */
struct vmspace *p_vmspace; /* :: Address space */
struct sigacts *p_sigacts; /* :: Process sigactions */
struct aioproc *p_aio; /* p: Asynchronous I/O data */
u_int p_mqueue_cnt; /* (: Count of open message queues */
specificdata_reference
p_specdataref; /* subsystem proc-specific data */
int p_exitsig; /* l: signal to send to parent on exit */
int p_flag; /* p: PK_* flags */
int p_sflag; /* p: PS_* flags */
int p_stflag; /* t: PST_* flags */
short p_slflag; /* l, p: PSL_* flags */
char p_stat; /* l: S* process status. */
char p_lflag; /* l: PL_* flags */
char p_trace_enabled;/* p: cached by syscall_intern() */
char p_pad1[3]; /* unused */
pid_t p_pid; /* :: Process identifier. */
LIST_ENTRY(proc) p_pglist; /* l: List of processes in pgrp. */
struct proc *p_pptr; /* l: Pointer to parent process. */
LIST_ENTRY(proc) p_sibling; /* l: List of sibling processes. */
LIST_HEAD(, proc) p_children; /* l: List of children. */
LIST_HEAD(, lwp) p_lwps; /* p: List of LWPs. */
struct ras *p_raslist; /* a: List of RAS entries */
/* The following fields are all zeroed upon creation in fork. */
#define p_startzero p_nlwps
int p_nlwps; /* p: Number of LWPs */
int p_nzlwps; /* p: Number of zombie LWPs */
int p_nrlwps; /* p: Number running/sleeping LWPs */
int p_nlwpwait; /* p: Number of LWPs in lwp_wait1() */
int p_ndlwps; /* p: Number of detached LWPs */
u_int p_nstopchild; /* l: Count of stopped/dead children */
u_int p_waited; /* l: parent has waited on child */
struct lwp *p_zomblwp; /* p: detached LWP to be reaped */
struct lwp *p_vforklwp; /* p: parent LWP waiting at vfork() */
/* scheduling */
void *p_sched_info; /* p: Scheduler-specific structure */
fixpt_t p_estcpu; /* p: Time avg. value of p_cpticks */
fixpt_t p_estcpu_inherited; /* p: cpu inherited from children */
unsigned int p_forktime;
fixpt_t p_pctcpu; /* p: %cpu from dead LWPs */
struct proc *p_opptr; /* l: save parent during ptrace. */
struct ptimers *p_timers; /* Timers: real, virtual, profiling */
struct bintime p_rtime; /* p: real time */
u_quad_t p_uticks; /* t: Statclock hits in user mode */
u_quad_t p_sticks; /* t: Statclock hits in system mode */
u_quad_t p_iticks; /* t: Statclock hits processing intr */
uint64_t p_xutime; /* p: utime exposed to userspace */
uint64_t p_xstime; /* p: stime exposed to userspace */
int p_traceflag; /* k: Kernel trace points */
void *p_tracep; /* k: Trace private data */
struct vnode *p_textvp; /* :: Vnode of executable */
struct emul *p_emul; /* :: emulation information */
void *p_emuldata; /* :: per-proc emul data, or NULL */
const struct execsw *p_execsw; /* :: exec package information */
struct klist p_klist; /* p: knotes attached to proc */
LIST_HEAD(, lwp) p_sigwaiters; /* p: LWPs waiting for signals */
sigpend_t p_sigpend; /* p: pending signals */
struct lcproc *p_lwpctl; /* p, a: _lwp_ctl() information */
pid_t p_ppid; /* :: cached parent pid */
pid_t p_oppid; /* :: cached original parent pid */
char *p_path; /* :: full pathname of executable */
/*
* End area that is zeroed on creation
*/
#define p_endzero p_startcopy
/*
* The following fields are all copied upon creation in fork.
*/
#define p_startcopy p_sigctx
struct sigctx p_sigctx; /* p: Shared signal state */
u_char p_nice; /* p: Process "nice" value */
char p_comm[MAXCOMLEN+1];
/* p: basename of last exec file */
struct pgrp *p_pgrp; /* l: Pointer to process group */
vaddr_t p_psstrp; /* :: address of process's ps_strings */
u_int p_pax; /* :: PAX flags */
int p_xexit; /* p: exit code */
/*
* End area that is copied on creation
*/
#define p_endcopy p_xsig
u_short p_xsig; /* p: stop signal */
u_short p_acflag; /* p: Acc. flags; see struct lwp also */
struct mdproc p_md; /* p: Any machine-dependent fields */
vaddr_t p_stackbase; /* :: ASLR randomized stack base */
struct kdtrace_proc *p_dtrace; /* :: DTrace-specific data. */
/*
* Locks in their own cache line towards the end.
*/
kmutex_t p_auxlock /* :: secondary, longer term lock */
__aligned(COHERENCY_UNIT);
kmutex_t p_stmutex; /* :: mutex on profiling state */
krwlock_t p_reflock; /* :: lock for debugger, procfs */
};
#define p_rlimit p_limit->pl_rlimit
#define p_session p_pgrp->pg_session
#define p_pgid p_pgrp->pg_id
#endif /* _KMEMUSER || _KERNEL */
/*
* Status values.
*/
#define SIDL 1 /* Process being created by fork */
#define SACTIVE 2 /* Process is not stopped */
#define SDYING 3 /* About to die */
#define SSTOP 4 /* Process debugging or suspension */
#define SZOMB 5 /* Awaiting collection by parent */
#define SDEAD 6 /* Almost a zombie */
#define P_ZOMBIE(p) \
((p)->p_stat == SZOMB || (p)->p_stat == SDYING || (p)->p_stat == SDEAD)
/*
* These flags are kept in p_flag and are protected by p_lock. Access from
* process context only.
*/
#define PK_ADVLOCK 0x00000001 /* Process may hold a POSIX advisory lock */
#define PK_SYSTEM 0x00000002 /* System process (kthread) */
#define PK_SYSVSEM 0x00000004 /* Used SysV semaphores */
#define PK_SUGID 0x00000100 /* Had set id privileges since last exec */
#define PK_KMEM 0x00000200 /* Has kmem access */
#define PK_EXEC 0x00004000 /* Process called exec */
#define PK_NOCLDWAIT 0x00020000 /* No zombies if child dies */
#define PK_32 0x00040000 /* 32-bit process (used on 64-bit kernels) */
#define PK_CLDSIGIGN 0x00080000 /* Process is ignoring SIGCHLD */
#define PK_MARKER 0x80000000 /* Is a dummy marker process */
/*
* These flags are kept in p_sflag and are protected by p_lock. Access from
* process context only.
*/
#define PS_NOCLDSTOP 0x00000008 /* No SIGCHLD when children stop */
#define PS_RUMP_LWPEXIT 0x00000400 /* LWPs in RUMP kernel should exit for GC */
#define PS_WCORE 0x00001000 /* Process needs to dump core */
#define PS_WEXIT 0x00002000 /* Working on exiting */
#define PS_STOPFORK 0x00800000 /* Child will be stopped on fork(2) */
#define PS_STOPEXEC 0x01000000 /* Will be stopped on exec(2) */
#define PS_STOPEXIT 0x02000000 /* Will be stopped at process exit */
#define PS_COREDUMP 0x20000000 /* Process core-dumped */
#define PS_CONTINUED 0x40000000 /* Process is continued */
#define PS_STOPPING 0x80000000 /* Transitioning SACTIVE -> SSTOP */
/*
* These flags are kept in p_slflag and are protected by the proc_lock
* and p_lock. Access from process context only.
*/
#define PSL_TRACEFORK 0x00000001 /* traced process wants fork events */
#define PSL_TRACEVFORK 0x00000002 /* traced process wants vfork events */
#define PSL_TRACEVFORK_DONE \
0x00000004 /* traced process wants vfork done events */
#define PSL_TRACELWP_CREATE \
0x00000008 /* traced process wants LWP create events */
#define PSL_TRACELWP_EXIT \
0x00000010 /* traced process wants LWP exit events */
#define PSL_TRACEPOSIX_SPAWN \
0x00000020 /* traced process wants posix_spawn events */
#define PSL_TRACED 0x00000040 /* Debugged process being traced */
#define PSL_TRACEDCHILD 0x00000080 /* Report process birth */
#define PSL_CHTRACED 0x00000100 /* Child has been traced & reparented */
#define PSL_SYSCALL 0x00000200 /* process has PT_SYSCALL enabled */
#define PSL_SYSCALLEMU 0x00000400 /* cancel in-progress syscall */
/*
* Kept in p_stflag and protected by p_stmutex.
*/
#define PST_PROFIL 0x00000020 /* Has started profiling */
/*
* Kept in p_lflag and protected by the proc_lock. Access
* from process context only.
*/
#define PL_CONTROLT 0x00000001 /* Has a controlling terminal */
#define PL_PPWAIT 0x00000002 /* Parent is waiting for child exec/exit */
#define PL_SIGCOMPAT 0x00000004 /* Has used compat signal trampoline */
#define PL_ORPHANPG 0x00000008 /* Member of an orphaned pgrp */
#if defined(_KMEMUSER) || defined(_KERNEL)
/*
* Macro to compute the exit signal to be delivered.
*/
#define P_EXITSIG(p) \
(((p)->p_slflag & PSL_TRACED) ? SIGCHLD : (p)->p_exitsig)
/*
* Compute a wait(2) 16 bit exit status code
*/
#define P_WAITSTATUS(p) W_EXITCODE((p)->p_xexit, ((p)->p_xsig | \
(((p)->p_sflag & PS_COREDUMP) ? WCOREFLAG : 0)))
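/*
 * For example, a process terminated by SIGSEGV that dumped core has
 * p_xexit == 0, p_xsig == SIGSEGV and PS_COREDUMP set, so the macro
 * evaluates to W_EXITCODE(0, SIGSEGV | WCOREFLAG): the signal number,
 * with the core-dump flag OR'd in, ends up in the status word.
 */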
LIST_HEAD(proclist, proc); /* A list of processes */
/*
* This structure associates a proclist with its lock.
*/
struct proclist_desc {
struct proclist *pd_list; /* The list */
/*
* XXX Add a pointer to the proclist's lock eventually.
*/
};
#ifdef _KERNEL
/*
* We use process IDs <= PID_MAX until there are > 16k processes.
* NO_PGID is used to represent "no process group" for a tty.
*/
#define PID_MAX 30000
#define NO_PGID ((pid_t)-1)
#define SESS_LEADER(p) ((p)->p_session->s_leader == (p))
/*
* Flags passed to fork1().
*/
#define FORK_PPWAIT 0x0001 /* Block parent until child exit */
#define FORK_SHAREVM 0x0002 /* Share vmspace with parent */
#define FORK_SHARECWD 0x0004 /* Share cdir/rdir/cmask */
#define FORK_SHAREFILES 0x0008 /* Share file descriptors */
#define FORK_SHARESIGS 0x0010 /* Share signal actions */
#define FORK_NOWAIT 0x0020 /* Make init the parent of the child */
#define FORK_CLEANFILES 0x0040 /* Start with a clean descriptor set */
#define FORK_SYSTEM 0x0080 /* Fork a kernel thread */
extern struct proc proc0; /* Process slot for swapper */
extern u_int nprocs; /* Current number of procs */
extern int maxproc; /* Max number of procs */
#define vmspace_kernel() (proc0.p_vmspace)
extern kmutex_t proc_lock;
extern struct proclist allproc; /* List of all processes */
extern struct proclist zombproc; /* List of zombie processes */
extern struct proc *initproc; /* Process slots for init, pager */
extern const struct proclist_desc proclists[];
int proc_find_locked(struct lwp *, struct proc **, pid_t);
proc_t * proc_find_raw(pid_t);
proc_t * proc_find(pid_t); /* Find process by ID */
proc_t * proc_find_lwpid(pid_t); /* Find process by LWP ID */
struct lwp * proc_find_lwp(proc_t *, pid_t); /* Find LWP in proc by ID */
struct lwp * proc_find_lwp_unlocked(proc_t *, pid_t);
/* Find LWP, acquire proc */
struct lwp * proc_find_lwp_acquire_proc(pid_t, proc_t **);
struct pgrp * pgrp_find(pid_t); /* Find process group by ID */
void procinit(void);
void procinit_sysctl(void);
int proc_enterpgrp(struct proc *, pid_t, pid_t, bool);
void proc_leavepgrp(struct proc *);
void proc_sesshold(struct session *);
void proc_sessrele(struct session *);
void fixjobc(struct proc *, struct pgrp *, int);
int tsleep(wchan_t, pri_t, const char *, int);
int mtsleep(wchan_t, pri_t, const char *, int, kmutex_t *);
void wakeup(wchan_t);
int kpause(const char *, bool, int, kmutex_t *);
void exit1(struct lwp *, int, int) __dead;
int kill1(struct lwp *l, pid_t pid, ksiginfo_t *ksi, register_t *retval);
int do_sys_wait(int *, int *, int, struct rusage *);
int do_sys_waitid(idtype_t, id_t, int *, int *, int, struct wrusage *,
siginfo_t *);
struct proc *proc_alloc(void);
void proc0_init(void);
pid_t proc_alloc_pid(struct proc *);
void proc_free_pid(pid_t);
pid_t proc_alloc_lwpid(struct proc *, struct lwp *);
void proc_free_lwpid(struct proc *, pid_t);
void proc_free_mem(struct proc *);
void exit_lwps(struct lwp *l);
int fork1(struct lwp *, int, int, void *, size_t,
void (*)(void *), void *, register_t *);
int pgid_in_session(struct proc *, pid_t);
void cpu_lwp_fork(struct lwp *, struct lwp *, void *, size_t,
void (*)(void *), void *);
void cpu_lwp_free(struct lwp *, int);
void cpu_lwp_free2(struct lwp *);
void cpu_spawn_return(struct lwp*);
#ifdef __HAVE_SYSCALL_INTERN
void syscall_intern(struct proc *);
#endif
void md_child_return(struct lwp *);
void child_return(void *);
int proc_isunder(struct proc *, struct lwp *);
int proc_uidmatch(kauth_cred_t, kauth_cred_t);
int proc_vmspace_getref(struct proc *, struct vmspace **);
void proc_crmod_leave(kauth_cred_t, kauth_cred_t, bool);
void proc_crmod_enter(void);
int proc_getauxv(struct proc *, void **, size_t *);
int proc_specific_key_create(specificdata_key_t *, specificdata_dtor_t);
void proc_specific_key_delete(specificdata_key_t);
void proc_initspecific(struct proc *);
void proc_finispecific(struct proc *);
void * proc_getspecific(struct proc *, specificdata_key_t);
void proc_setspecific(struct proc *, specificdata_key_t, void *);
int proc_compare(const struct proc *, const struct lwp *,
const struct proc *, const struct lwp *);
/*
* Special handlers for delivering EVFILT_PROC notifications. These
* exist to handle some of the special locking considerations around
* processes.
*/
void knote_proc_exec(struct proc *);
void knote_proc_fork(struct proc *, struct proc *);
void knote_proc_exit(struct proc *);
int proclist_foreach_call(struct proclist *,
int (*)(struct proc *, void *arg), void *);
static __inline struct proc *
_proclist_skipmarker(struct proc *p0)
{
struct proc *p = p0;
while (p != NULL && p->p_flag & PK_MARKER)
p = LIST_NEXT(p, p_list);
return p;
}
#define PROC_PTRSZ(p) (((p)->p_flag & PK_32) ? sizeof(int) : sizeof(void *))
#define PROC_REGSZ(p) (((p)->p_flag & PK_32) ? \
sizeof(process_reg32) : sizeof(struct reg))
#define PROC_FPREGSZ(p) (((p)->p_flag & PK_32) ? \
sizeof(process_fpreg32) : sizeof(struct fpreg))
#define PROC_DBREGSZ(p) (((p)->p_flag & PK_32) ? \
sizeof(process_dbreg32) : sizeof(struct dbreg))
#ifndef PROC_MACHINE_ARCH
#define PROC_MACHINE_ARCH(p) machine_arch
#endif
/*
* PROCLIST_FOREACH: iterate on the given proclist, skipping PK_MARKER ones.
*/
#define PROCLIST_FOREACH(var, head) \
for ((var) = LIST_FIRST(head); \
((var) = _proclist_skipmarker(var)) != NULL; \
(var) = LIST_NEXT(var, p_list))
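/*
 * Example usage (illustrative sketch only, not compiled;
 * count_user_procs() is a hypothetical helper, not part of this
 * header): walk every process on the allproc list with proc_lock
 * held, letting PROCLIST_FOREACH skip the PK_MARKER placeholders.
 */
#if 0
static int
count_user_procs(void)
{
        struct proc *p;
        int n = 0;

        mutex_enter(&proc_lock);
        PROCLIST_FOREACH(p, &allproc) {
                if ((p->p_flag & PK_SYSTEM) == 0)
                        n++;
        }
        mutex_exit(&proc_lock);
        return n;
}
#endif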
#ifdef KSTACK_CHECK_MAGIC
void kstack_setup_magic(const struct lwp *);
void kstack_check_magic(const struct lwp *);
#else
#define kstack_setup_magic(x)
#define kstack_check_magic(x)
#endif
extern struct emul emul_netbsd;
#endif /* _KERNEL */
/*
* Kernel stack parameters.
*
* KSTACK_LOWEST_ADDR: return the lowest address of the LWP's kernel stack,
* excluding red-zone.
*
* KSTACK_SIZE: the size of the kernel stack for an LWP, excluding red-zone.
*
* if <machine/proc.h> provides the MD definition, it will be used.
*/
#ifndef KSTACK_LOWEST_ADDR
#define KSTACK_LOWEST_ADDR(l) ((void *)ALIGN((struct pcb *)((l)->l_addr) + 1))
#endif
#ifndef KSTACK_SIZE
#define KSTACK_SIZE (USPACE - ALIGN(sizeof(struct pcb)))
#endif
#endif /* _KMEMUSER || _KERNEL */
#endif /* !_SYS_PROC_H_ */
/* $NetBSD: uvm_amap.c,v 1.129 2023/09/10 14:54:34 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_amap.c: amap operations
*/
/*
* this file contains functions that perform operations on amaps. see
* uvm_amap.h for a brief explanation of the role of amaps in uvm.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_amap.c,v 1.129 2023/09/10 14:54:34 ad Exp $");
#include "opt_uvmhist.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_swap.h>
/*
* cache for allocation of vm_amap structures. note that in order to
* avoid an endless loop, the amap cache's allocator cannot allocate
* memory from an amap (it currently goes through the kernel uobj, so
* we are ok).
*/
static struct pool_cache uvm_amap_cache;
static kmutex_t amap_list_lock __cacheline_aligned;
static LIST_HEAD(, vm_amap) amap_list;
/*
* local functions
*/
static int
amap_roundup_slots(int slots)
{
return kmem_roundup_size(slots * sizeof(int)) / sizeof(int);
}
#ifdef UVM_AMAP_PPREF
/*
* what is ppref? ppref is an _optional_ amap feature which is used
* to keep track of reference counts on a per-page basis. it is enabled
* when UVM_AMAP_PPREF is defined.
*
* when enabled, an array of ints is allocated for the pprefs. this
* array is allocated only when a partial reference is added to the
* map (either by unmapping part of the amap, or gaining a reference
* to only a part of an amap). if the allocation of the array fails
* (KM_NOSLEEP), then we set the array pointer to PPREF_NONE to indicate
* that we tried to do ppref's but couldn't alloc the array so just
* give up (after all, this is an optional feature!).
*
* the array is divided into page sized "chunks." for chunks of length 1,
* the chunk reference count plus one is stored in that chunk's slot.
* for chunks of length > 1 the first slot contains (the reference count
* plus one) * -1. [the negative value indicates that the length is
* greater than one.] the second slot of the chunk contains the length
* of the chunk. here is an example:
*
* actual REFS: 2 2 2 2 3 1 1 0 0 0 4 4 0 1 1 1
* ppref: -3 4 x x 4 -2 2 -1 3 x -5 2 1 -2 3 x
* <----------><-><----><-------><----><-><------->
* (x = don't care)
*
* this allows one int to contain the ref count for the whole
* chunk. note that the "plus one" part is needed because a reference
* count of zero is neither positive nor negative (need a way to tell
* if we've got one zero or a bunch of them).
*
* here are some in-line functions to help us.
*/
/*
* pp_getreflen: get the reference and length for a specific offset
*
* => ppref's amap must be locked
*/
static inline void
pp_getreflen(int *ppref, int offset, int *refp, int *lenp)
{
if (ppref[offset] > 0) { /* chunk size must be 1 */
*refp = ppref[offset] - 1; /* don't forget to adjust */
*lenp = 1;
} else {
*refp = (ppref[offset] * -1) - 1;
*lenp = ppref[offset+1];
}
}
/*
* pp_setreflen: set the reference and length for a specific offset
*
* => ppref's amap must be locked
*/
static inline void
pp_setreflen(int *ppref, int offset, int ref, int len)
{
if (len == 0)
return;
if (len == 1) {
ppref[offset] = ref + 1;
} else {
ppref[offset] = (ref + 1) * -1;
ppref[offset+1] = len;
}
}
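/*
 * Illustrative sketch (not compiled; ppref_decode_example() is a
 * hypothetical helper): decoding the sample ppref array from the
 * comment above with pp_getreflen(). The first chunk stores -3/4,
 * i.e. a reference count of 2 spanning 4 slots; the single-slot
 * chunk at offset 4 stores 4, i.e. a reference count of 3.
 */
#if 0
static void
ppref_decode_example(void)
{
        /* encodes "actual REFS: 2 2 2 2 3" from the example above */
        int ppref[] = { -3, 4, 0, 0, 4 };
        int offset = 0, ref, len;

        while (offset < 5) {
                pp_getreflen(ppref, offset, &ref, &len);
                printf("slots %d..%d: ref %d\n", offset,
                    offset + len - 1, ref);
                offset += len;
        }
}
#endif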
#endif /* UVM_AMAP_PPREF */
/*
* amap_alloc1: allocate an amap, but do not initialise the overlay.
*
* => Note: lock is not set.
*/
static struct vm_amap *
amap_alloc1(int slots, int padslots, int flags)
{
const bool nowait = (flags & UVM_FLAG_NOWAIT) != 0;
const km_flag_t kmflags = nowait ? KM_NOSLEEP : KM_SLEEP;
struct vm_amap *amap;
krwlock_t *newlock, *oldlock;
int totalslots;
amap = pool_cache_get(&uvm_amap_cache, nowait ? PR_NOWAIT : PR_WAITOK);
if (amap == NULL) {
return NULL;
}
KASSERT(amap->am_lock != NULL);
KASSERT(amap->am_nused == 0);
/* Try to privatize the lock if currently shared. */
if (rw_obj_refcnt(amap->am_lock) > 1) {
newlock = rw_obj_tryalloc();
if (newlock != NULL) {
oldlock = amap->am_lock;
mutex_enter(&amap_list_lock);
amap->am_lock = newlock;
mutex_exit(&amap_list_lock);
rw_obj_free(oldlock);
}
}
totalslots = amap_roundup_slots(slots + padslots);
amap->am_ref = 1;
amap->am_flags = 0;
#ifdef UVM_AMAP_PPREF
amap->am_ppref = NULL;
#endif
amap->am_maxslot = totalslots;
amap->am_nslot = slots;
/*
* Note: since allocations are likely big, we expect to reduce the
* memory fragmentation by allocating them in separate blocks.
*/
amap->am_slots = kmem_alloc(totalslots * sizeof(int), kmflags);
if (amap->am_slots == NULL)
goto fail1;
amap->am_bckptr = kmem_alloc(totalslots * sizeof(int), kmflags);
if (amap->am_bckptr == NULL)
goto fail2;
amap->am_anon = kmem_alloc(totalslots * sizeof(struct vm_anon *),
kmflags);
if (amap->am_anon == NULL)
goto fail3;
return amap;
fail3:
kmem_free(amap->am_bckptr, totalslots * sizeof(int));
fail2:
kmem_free(amap->am_slots, totalslots * sizeof(int));
fail1:
pool_cache_put(&uvm_amap_cache, amap);
/*
* XXX hack to tell the pagedaemon how many pages we need,
* since we can need more than it would normally free.
*/
if (nowait) {
extern u_int uvm_extrapages;
atomic_add_int(&uvm_extrapages,
((sizeof(int) * 2 + sizeof(struct vm_anon *)) *
totalslots) >> PAGE_SHIFT);
}
return NULL;
}
/*
* amap_alloc: allocate an amap to manage "sz" bytes of anonymous VM
*
* => caller should ensure sz is a multiple of PAGE_SIZE
* => reference count to new amap is set to one
* => new amap is returned unlocked
*/
struct vm_amap *
amap_alloc(vaddr_t sz, vaddr_t padsz, int waitf)
{
struct vm_amap *amap;
int slots, padslots;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
AMAP_B2SLOT(slots, sz);
AMAP_B2SLOT(padslots, padsz);
amap = amap_alloc1(slots, padslots, waitf);
if (amap) {
memset(amap->am_anon, 0,
amap->am_maxslot * sizeof(struct vm_anon *));
}
UVMHIST_LOG(maphist,"<- done, amap = %#jx, sz=%jd", (uintptr_t)amap,
sz, 0, 0);
return(amap);
}
/*
* amap_ctor: pool_cache constructor for new amaps
*
* => carefully synchronize with amap_swap_off()
*/
static int
amap_ctor(void *arg, void *obj, int flags)
{
struct vm_amap *amap = obj;
if ((flags & PR_NOWAIT) != 0) {
amap->am_lock = rw_obj_tryalloc();
if (amap->am_lock == NULL) {
return ENOMEM;
}
} else {
amap->am_lock = rw_obj_alloc();
}
amap->am_nused = 0;
amap->am_flags = 0;
mutex_enter(&amap_list_lock);
LIST_INSERT_HEAD(&amap_list, amap, am_list);
mutex_exit(&amap_list_lock);
return 0;
}
/*
* amap_dtor: pool_cache destructor for amaps
*
* => carefully synchronize with amap_swap_off()
*/
static void
amap_dtor(void *arg, void *obj)
{
struct vm_amap *amap = obj;
KASSERT(amap->am_nused == 0);
mutex_enter(&amap_list_lock);
LIST_REMOVE(amap, am_list);
mutex_exit(&amap_list_lock);
rw_obj_free(amap->am_lock);
}
/*
* uvm_amap_init: initialize the amap system.
*/
void
uvm_amap_init(void)
{
mutex_init(&amap_list_lock, MUTEX_DEFAULT, IPL_NONE);
pool_cache_bootstrap(&uvm_amap_cache, sizeof(struct vm_amap),
COHERENCY_UNIT, 0, 0, "amappl", NULL, IPL_NONE,
amap_ctor, amap_dtor, NULL);
}
/*
* amap_free: free an amap
*
* => the amap must be unlocked
* => the amap should have a zero reference count and be empty
*/
void
amap_free(struct vm_amap *amap)
{
int slots;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(amap->am_ref == 0);
KASSERT(amap->am_nused == 0);
KASSERT((amap->am_flags & AMAP_SWAPOFF) == 0);
slots = amap->am_maxslot;
kmem_free(amap->am_slots, slots * sizeof(*amap->am_slots));
kmem_free(amap->am_bckptr, slots * sizeof(*amap->am_bckptr));
kmem_free(amap->am_anon, slots * sizeof(*amap->am_anon));
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE)
kmem_free(amap->am_ppref, slots * sizeof(*amap->am_ppref));
#endif
pool_cache_put(&uvm_amap_cache, amap);
UVMHIST_LOG(maphist,"<- done, freed amap = %#jx", (uintptr_t)amap,
0, 0, 0);
}
/*
* amap_extend: extend the size of an amap (if needed)
*
* => called from uvm_map when we want to extend an amap to cover
* a new mapping (rather than allocate a new one)
* => amap should be unlocked (we will lock it)
* => to safely extend an amap it should have a reference count of
* one (thus it can't be shared)
*/
int
amap_extend(struct vm_map_entry *entry, vsize_t addsize, int flags)
{
struct vm_amap *amap = entry->aref.ar_amap;
int slotoff = entry->aref.ar_pageoff;
int slotmapped, slotadd, slotneed, slotadded, slotalloc;
int slotadj, slotarea, slotendoff;
int oldnslots;
#ifdef UVM_AMAP_PPREF
int *newppref, *oldppref;
#endif
int i, *newsl, *newbck, *oldsl, *oldbck;
struct vm_anon **newover, **oldover;
const km_flag_t kmflags =
(flags & AMAP_EXTEND_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, " (entry=%#jx, addsize=%#jx, flags=%#jx)",
(uintptr_t)entry, addsize, flags, 0);
/*
* first, determine how many slots we need in the amap. don't
* forget that ar_pageoff could be non-zero: this means that
* there are some unused slots before us in the amap.
*/
amap_lock(amap, RW_WRITER);
KASSERT(amap_refs(amap) == 1); /* amap can't be shared */
AMAP_B2SLOT(slotmapped, entry->end - entry->start); /* slots mapped */
AMAP_B2SLOT(slotadd, addsize); /* slots to add */
if (flags & AMAP_EXTEND_FORWARDS) {
slotneed = slotoff + slotmapped + slotadd;
slotadj = 0;
slotarea = 0;
} else {
slotneed = slotadd + slotmapped;
slotadj = slotadd - slotoff;
slotarea = amap->am_maxslot - slotmapped;
}
/*
* Because this amap only has 1 ref, we know that there is
* only one vm_map_entry pointing to it, and the one entry is
* using slots between slotoff and slotoff + slotmapped. If
* we have been using ppref then we know that only slots in
* the one map entry's range can have anons, since ppref
* allowed us to free any anons outside that range as other map
* entries which used this amap were removed. But without ppref,
* we couldn't know which slots were still needed by other map
* entries, so we couldn't free any anons as we removed map
* entries, and so any slot from 0 to am_nslot can have an
* anon. But now that we know there is only one map entry
* left and we know its range, we can free up any anons
* outside that range. This is necessary because the rest of
* this function assumes that there are no anons in the amap
* outside of the one map entry's range.
*/
slotendoff = slotoff + slotmapped;
if (amap->am_ppref == PPREF_NONE) {
amap_wiperange(amap, 0, slotoff);
amap_wiperange(amap, slotendoff, amap->am_nslot - slotendoff);
}
for (i = 0; i < slotoff; i++) {
KASSERT(amap->am_anon[i] == NULL);
}
for (i = slotendoff; i < amap->am_nslot - slotendoff; i++) {
KASSERT(amap->am_anon[i] == NULL);
}
/*
* case 1: we already have enough slots in the map and thus
* only need to bump the reference counts on the slots we are
* adding.
*/
if (flags & AMAP_EXTEND_FORWARDS) {
if (amap->am_nslot >= slotneed) {
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
amap_pp_adjref(amap, slotoff + slotmapped,
slotadd, 1);
}
#endif
amap_unlock(amap);
UVMHIST_LOG(maphist,
"<- done (case 1f), amap = %#jx, sltneed=%jd",
(uintptr_t)amap, slotneed, 0, 0);
return 0;
}
} else {
if (slotadj <= 0) {
slotoff -= slotadd;
entry->aref.ar_pageoff = slotoff;
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
amap_pp_adjref(amap, slotoff, slotadd, 1);
}
#endif
amap_unlock(amap);
UVMHIST_LOG(maphist,
"<- done (case 1b), amap = %#jx, sltneed=%jd",
(uintptr_t)amap, slotneed, 0, 0);
return 0;
}
}
/*
* case 2: we pre-allocated slots for use and we just need to
* bump nslot up to account for these slots.
*/
if (amap->am_maxslot >= slotneed) {
if (flags & AMAP_EXTEND_FORWARDS) {
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
if ((slotoff + slotmapped) < amap->am_nslot)
amap_pp_adjref(amap,
slotoff + slotmapped,
(amap->am_nslot -
(slotoff + slotmapped)), 1);
pp_setreflen(amap->am_ppref, amap->am_nslot, 1,
slotneed - amap->am_nslot);
}
#endif
amap->am_nslot = slotneed;
amap_unlock(amap);
/*
* no need to zero am_anon since that was done at
* alloc time and we never shrink an allocation.
*/
UVMHIST_LOG(maphist,"<- done (case 2f), amap = %#jx, "
"slotneed=%jd", (uintptr_t)amap, slotneed, 0, 0);
return 0;
} else {
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
/*
* Slide up the ref counts on the pages that
* are actually in use.
*/
memmove(amap->am_ppref + slotarea,
amap->am_ppref + slotoff,
slotmapped * sizeof(int));
/*
* Mark the (adjusted) gap at the front as
* referenced/not referenced.
*/
pp_setreflen(amap->am_ppref,
0, 0, slotarea - slotadd);
pp_setreflen(amap->am_ppref,
slotarea - slotadd, 1, slotadd);
}
#endif
/*
* Slide the anon pointers up and clear out
* the space we just made.
*/
memmove(amap->am_anon + slotarea,
amap->am_anon + slotoff,
slotmapped * sizeof(struct vm_anon*));
memset(amap->am_anon + slotoff, 0,
(slotarea - slotoff) * sizeof(struct vm_anon *));
/*
* Slide the backpointers up, but don't bother
* wiping out the old slots.
*/
memmove(amap->am_bckptr + slotarea,
amap->am_bckptr + slotoff,
slotmapped * sizeof(int));
/*
* Adjust all the useful active slot numbers.
*/
for (i = 0; i < amap->am_nused; i++)
amap->am_slots[i] += (slotarea - slotoff);
/*
* We just filled all the empty space in the
* front of the amap by activating a few new
* slots.
*/
amap->am_nslot = amap->am_maxslot;
entry->aref.ar_pageoff = slotarea - slotadd;
amap_unlock(amap);
UVMHIST_LOG(maphist,"<- done (case 2b), amap = %#jx, "
"slotneed=%jd", (uintptr_t)amap, slotneed, 0, 0);
return 0;
}
}
/*
* Case 3: we need to allocate a new amap and copy all the amap
* data over from old amap to the new one. Drop the lock before
* performing allocation.
*
* Note: since allocations are likely big, we expect to reduce the
* memory fragmentation by allocating them in separate blocks.
*/
amap_unlock(amap);
if (slotneed >= UVM_AMAP_LARGE) {
return E2BIG;
}
slotalloc = amap_roundup_slots(slotneed);
#ifdef UVM_AMAP_PPREF
newppref = NULL;
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
/* Will be handled later if fails. */
newppref = kmem_alloc(slotalloc * sizeof(*newppref), kmflags);
}
#endif
newsl = kmem_alloc(slotalloc * sizeof(*newsl), kmflags);
newbck = kmem_alloc(slotalloc * sizeof(*newbck), kmflags);
newover = kmem_alloc(slotalloc * sizeof(*newover), kmflags);
if (newsl == NULL || newbck == NULL || newover == NULL) {
#ifdef UVM_AMAP_PPREF
if (newppref != NULL) {
kmem_free(newppref, slotalloc * sizeof(*newppref));
}
#endif
if (newsl != NULL) {
kmem_free(newsl, slotalloc * sizeof(*newsl));
}
if (newbck != NULL) {
kmem_free(newbck, slotalloc * sizeof(*newbck));
}
if (newover != NULL) {
kmem_free(newover, slotalloc * sizeof(*newover));
}
return ENOMEM;
}
amap_lock(amap, RW_WRITER);
KASSERT(amap->am_maxslot < slotneed);
/*
* Copy everything over to new allocated areas.
*/
slotadded = slotalloc - amap->am_nslot;
if (!(flags & AMAP_EXTEND_FORWARDS))
slotarea = slotalloc - slotmapped;
/* do am_slots */
oldsl = amap->am_slots;
if (flags & AMAP_EXTEND_FORWARDS)
memcpy(newsl, oldsl, sizeof(int) * amap->am_nused);
else
for (i = 0; i < amap->am_nused; i++)
newsl[i] = oldsl[i] + slotarea - slotoff;
amap->am_slots = newsl;
/* do am_anon */
oldover = amap->am_anon;
if (flags & AMAP_EXTEND_FORWARDS) {
memcpy(newover, oldover,
sizeof(struct vm_anon *) * amap->am_nslot);
memset(newover + amap->am_nslot, 0,
sizeof(struct vm_anon *) * slotadded);
} else {
memcpy(newover + slotarea, oldover + slotoff,
sizeof(struct vm_anon *) * slotmapped);
memset(newover, 0,
sizeof(struct vm_anon *) * slotarea);
}
amap->am_anon = newover;
/* do am_bckptr */
oldbck = amap->am_bckptr;
if (flags & AMAP_EXTEND_FORWARDS)
memcpy(newbck, oldbck, sizeof(int) * amap->am_nslot);
else
memcpy(newbck + slotarea, oldbck + slotoff,
sizeof(int) * slotmapped);
amap->am_bckptr = newbck;
#ifdef UVM_AMAP_PPREF
/* do ppref */
oldppref = amap->am_ppref;
if (newppref) {
if (flags & AMAP_EXTEND_FORWARDS) {
memcpy(newppref, oldppref,
sizeof(int) * amap->am_nslot);
memset(newppref + amap->am_nslot, 0,
sizeof(int) * slotadded);
} else {
memcpy(newppref + slotarea, oldppref + slotoff,
sizeof(int) * slotmapped);
}
amap->am_ppref = newppref;
if ((flags & AMAP_EXTEND_FORWARDS) &&
(slotoff + slotmapped) < amap->am_nslot)
amap_pp_adjref(amap, slotoff + slotmapped,
(amap->am_nslot - (slotoff + slotmapped)), 1);
if (flags & AMAP_EXTEND_FORWARDS)
pp_setreflen(newppref, amap->am_nslot, 1,
slotneed - amap->am_nslot);
else {
pp_setreflen(newppref, 0, 0,
slotalloc - slotneed);
pp_setreflen(newppref, slotalloc - slotneed, 1,
slotneed - slotmapped);
}
} else {
if (amap->am_ppref)
amap->am_ppref = PPREF_NONE;
}
#endif
/* update master values */
if (flags & AMAP_EXTEND_FORWARDS)
amap->am_nslot = slotneed;
else {
entry->aref.ar_pageoff = slotarea - slotadd;
amap->am_nslot = slotalloc;
}
oldnslots = amap->am_maxslot;
amap->am_maxslot = slotalloc;
amap_unlock(amap);
kmem_free(oldsl, oldnslots * sizeof(*oldsl));
kmem_free(oldbck, oldnslots * sizeof(*oldbck));
kmem_free(oldover, oldnslots * sizeof(*oldover));
#ifdef UVM_AMAP_PPREF
if (oldppref && oldppref != PPREF_NONE)
kmem_free(oldppref, oldnslots * sizeof(*oldppref));
#endif
UVMHIST_LOG(maphist,"<- done (case 3), amap = %#jx, slotneed=%jd",
(uintptr_t)amap, slotneed, 0, 0);
return 0;
}
/*
* amap_share_protect: change protection of anons in a shared amap
*
* for shared amaps, given the current data structure layout, it is
* not possible for us to directly locate all maps referencing the
* shared anon (to change the protection). in order to protect data
* in shared maps we use pmap_page_protect(). [this is useful for IPC
* mechanisms like map entry passing that may want to write-protect
* all mappings of a shared amap.] we traverse am_anon or am_slots
* depending on the current state of the amap.
*
* => entry's map and amap must be locked by the caller
*/
void
amap_share_protect(struct vm_map_entry *entry, vm_prot_t prot)
{
struct vm_amap *amap = entry->aref.ar_amap;
u_int slots, lcv, slot, stop;
struct vm_anon *anon;
KASSERT(rw_write_held(amap->am_lock));
AMAP_B2SLOT(slots, (entry->end - entry->start));
stop = entry->aref.ar_pageoff + slots;
if (slots < amap->am_nused) {
/*
* Cheaper to traverse am_anon.
*/
for (lcv = entry->aref.ar_pageoff ; lcv < stop ; lcv++) {
anon = amap->am_anon[lcv];
if (anon == NULL) {
continue;
}
if (anon->an_page) {
pmap_page_protect(anon->an_page, prot);
}
}
return;
}
/*
* Cheaper to traverse am_slots.
*/
for (lcv = 0 ; lcv < amap->am_nused ; lcv++) {
slot = amap->am_slots[lcv];
if (slot < entry->aref.ar_pageoff || slot >= stop) {
continue;
}
anon = amap->am_anon[slot];
if (anon->an_page) {
pmap_page_protect(anon->an_page, prot);
}
}
}
/*
* amap_wipeout: wipeout all anon's in an amap; then free the amap!
*
* => Called from amap_unref(), when reference count drops to zero.
* => amap must be locked.
*/
void
amap_wipeout(struct vm_amap *amap)
{
u_int lcv;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(amap=%#jx)", (uintptr_t)amap, 0,0,0);
KASSERT(rw_write_held(amap->am_lock));
KASSERT(amap->am_ref == 0);
if (__predict_false(amap->am_flags & AMAP_SWAPOFF)) {
/*
* Note: amap_swap_off() will call us again.
*/
amap_unlock(amap);
return;
}
for (lcv = 0 ; lcv < amap->am_nused ; lcv++) {
struct vm_anon *anon;
u_int slot;
slot = amap->am_slots[lcv];
anon = amap->am_anon[slot];
KASSERT(anon != NULL);
KASSERT(anon->an_ref != 0);
KASSERT(anon->an_lock == amap->am_lock);
UVMHIST_LOG(maphist," processing anon %#jx, ref=%jd",
(uintptr_t)anon, anon->an_ref, 0, 0);
/*
* Drop the reference.
*/
if (__predict_true(--anon->an_ref == 0)) {
uvm_anfree(anon);
}
if (__predict_false((lcv & 31) == 31)) {
preempt_point();
}
}
/*
* Finally, destroy the amap.
*/
amap->am_nused = 0;
amap_unlock(amap);
amap_free(amap);
UVMHIST_LOG(maphist,"<- done!", 0,0,0,0);
}
/*
* amap_copy: ensure that a map entry's "needs_copy" flag is false
* by copying the amap if necessary.
*
* => an entry with a null amap pointer will get a new (blank) one.
* => the map that the map entry belongs to must be locked by caller.
* => the amap currently attached to "entry" (if any) must be unlocked.
* => if canchunk is true, then we may clip the entry into a chunk
* => "startva" and "endva" are used only if canchunk is true. they are
* used to limit chunking (e.g. if you have a large space that you
* know you are going to need to allocate amaps for, there is no point
* in allowing that to be chunked)
*/
void
amap_copy(struct vm_map *map, struct vm_map_entry *entry, int flags,
vaddr_t startva, vaddr_t endva)
{
const int waitf = (flags & AMAP_COPY_NOWAIT) ? UVM_FLAG_NOWAIT : 0;
struct vm_amap *amap, *srcamap;
u_int slots, lcv;
krwlock_t *oldlock;
vsize_t len;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, " (map=%#jx, entry=%#jx, flags=%#jx)",
(uintptr_t)map, (uintptr_t)entry, flags, -2);
KASSERT(map != kernel_map); /* we use nointr pool */
srcamap = entry->aref.ar_amap;
len = entry->end - entry->start;
/*
* Is there an amap to copy? If not, create one.
*/
if (srcamap == NULL) {
const bool canchunk = (flags & AMAP_COPY_NOCHUNK) == 0;
/*
* Check to see if we have a large amap that we can
* chunk. We align startva/endva to chunk-sized
* boundaries and then clip to them.
*/
if (canchunk && atop(len) >= UVM_AMAP_LARGE) {
vsize_t chunksize;
/* Convert slots to bytes. */
chunksize = UVM_AMAP_CHUNK << PAGE_SHIFT;
startva = (startva / chunksize) * chunksize;
endva = roundup(endva, chunksize);
UVMHIST_LOG(maphist,
" chunk amap ==> clip %#jx->%#jx to %#jx->%#jx",
entry->start, entry->end, startva, endva);
UVM_MAP_CLIP_START(map, entry, startva);
/* Watch out for endva wrap-around! */
if (endva >= startva) {
UVM_MAP_CLIP_END(map, entry, endva);
}
}
if ((flags & AMAP_COPY_NOMERGE) == 0 &&
uvm_mapent_trymerge(map, entry, UVM_MERGE_COPYING)) {
return;
}
UVMHIST_LOG(maphist, "<- done [creating new amap %#jx->%#jx]",
entry->start, entry->end, 0, 0);
/*
* Allocate an initialised amap and install it.
* Note: we must update the length after clipping.
*/
len = entry->end - entry->start;
entry->aref.ar_pageoff = 0;
entry->aref.ar_amap = amap_alloc(len, 0, waitf);
if (entry->aref.ar_amap != NULL) {
entry->etype &= ~UVM_ET_NEEDSCOPY;
}
return;
}
/*
* First check and see if we are the only map entry referencing
* the amap we currently have. If so, then just take it over instead
* of copying it. Note that we are reading am_ref without lock held
* as the value can only be one if we have the only reference
* to the amap (via our locked map). If the value is greater than
* one, then allocate amap and re-check the value.
*/
if (srcamap->am_ref == 1) {
entry->etype &= ~UVM_ET_NEEDSCOPY;
UVMHIST_LOG(maphist, "<- done [ref cnt = 1, took it over]",
0, 0, 0, 0);
return;
}
UVMHIST_LOG(maphist," amap=%#jx, ref=%jd, must copy it",
(uintptr_t)srcamap, srcamap->am_ref, 0, 0);
/*
* Allocate a new amap (note: not initialised, etc).
*/
AMAP_B2SLOT(slots, len);
amap = amap_alloc1(slots, 0, waitf);
if (amap == NULL) {
UVMHIST_LOG(maphist, " amap_alloc1 failed", 0,0,0,0);
return;
}
/*
* Make the new amap share the source amap's lock, and then lock
* both. We must do this before we set am_nused != 0, otherwise
* amap_swap_off() can become interested in the amap.
*/
oldlock = amap->am_lock;
mutex_enter(&amap_list_lock);
amap->am_lock = srcamap->am_lock;
mutex_exit(&amap_list_lock);
rw_obj_hold(amap->am_lock);
rw_obj_free(oldlock);
amap_lock(srcamap, RW_WRITER);
/*
* Re-check the reference count with the lock held. If it has
* dropped to one - we can take over the existing map.
*/
if (srcamap->am_ref == 1) {
/* Just take over the existing amap. */
entry->etype &= ~UVM_ET_NEEDSCOPY;
amap_unlock(srcamap);
/* Destroy the new (unused) amap. */
amap->am_ref--;
amap_free(amap);
return;
}
/*
* Copy the slots. Zero the padded part.
*/
UVMHIST_LOG(maphist, " copying amap now",0, 0, 0, 0);
for (lcv = 0 ; lcv < slots; lcv++) {
amap->am_anon[lcv] =
srcamap->am_anon[entry->aref.ar_pageoff + lcv];
if (amap->am_anon[lcv] == NULL)
continue;
KASSERT(amap->am_anon[lcv]->an_lock == srcamap->am_lock);
KASSERT(amap->am_anon[lcv]->an_ref > 0);
KASSERT(amap->am_nused < amap->am_maxslot);
amap->am_anon[lcv]->an_ref++;
amap->am_bckptr[lcv] = amap->am_nused;
amap->am_slots[amap->am_nused] = lcv;
amap->am_nused++;
}
memset(&amap->am_anon[lcv], 0,
(amap->am_maxslot - lcv) * sizeof(struct vm_anon *));
/*
* Drop our reference to the old amap (srcamap) and unlock.
* Since the reference count on srcamap is greater than one,
* (we checked above), it cannot drop to zero while it is locked.
*/
srcamap->am_ref--;
KASSERT(srcamap->am_ref > 0);
if (srcamap->am_ref == 1 && (srcamap->am_flags & AMAP_SHARED) != 0) {
srcamap->am_flags &= ~AMAP_SHARED;
}
#ifdef UVM_AMAP_PPREF
if (srcamap->am_ppref && srcamap->am_ppref != PPREF_NONE) {
amap_pp_adjref(srcamap, entry->aref.ar_pageoff,
len >> PAGE_SHIFT, -1);
}
#endif
amap_unlock(srcamap);
/*
* Install new amap.
*/
entry->aref.ar_pageoff = 0;
entry->aref.ar_amap = amap;
entry->etype &= ~UVM_ET_NEEDSCOPY;
UVMHIST_LOG(maphist, "<- done",0, 0, 0, 0);
}
/*
* amap_cow_now: resolve all copy-on-write faults in an amap now for fork(2)
*
* called during fork(2) when the parent process has a wired map
* entry. in that case we want to avoid write-protecting pages
* in the parent's map (e.g. like what you'd do for a COW page)
* so we resolve the COW here.
*
* => assume parent's entry was wired, thus all pages are resident.
* => assume pages that are loaned out (loan_count) are already mapped
* read-only in all maps, and thus no need for us to worry about them
* => assume both parent and child vm_map's are locked
* => caller passes child's map/entry in to us
* => if we run out of memory we will unlock the amap and sleep _with_ the
* parent and child vm_map's locked(!). we have to do this since
* we are in the middle of a fork(2) and we can't let the parent
* map change until we are done copying all the map entries.
* => XXXCDC: out of memory should cause fork to fail, but there is
* currently no easy way to do this (needs fix)
*/
void
amap_cow_now(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_amap *amap = entry->aref.ar_amap;
struct vm_anon *anon, *nanon;
struct vm_page *pg, *npg;
u_int lcv, slot;
/*
* note that if we unlock the amap then we must ReStart the "lcv" for
* loop because some other process could reorder the anons in the
* am_anon[] array on us while the lock is dropped.
*/
ReStart:
amap_lock(amap, RW_WRITER);
for (lcv = 0 ; lcv < amap->am_nused ; lcv++) {
slot = amap->am_slots[lcv];
anon = amap->am_anon[slot];
KASSERT(anon->an_lock == amap->am_lock);
/*
* If anon has only one reference - we must have already
* copied it. This can happen if we needed to sleep waiting
* for memory in a previous run through this loop. The new
* page might even have been paged out, since it is not wired.
*/
if (anon->an_ref == 1) {
KASSERT(anon->an_page != NULL || anon->an_swslot != 0);
continue;
}
/*
* The old page must be resident since the parent is wired.
*/
pg = anon->an_page;
KASSERT(pg != NULL);
KASSERT(pg->wire_count > 0);
/*
* If the page is loaned then it must already be mapped
* read-only and we don't need to copy it.
*/
if (pg->loan_count != 0) {
continue;
}
KASSERT(pg->uanon == anon);
KASSERT(pg->uobject == NULL);
/*
* If the page is busy, then we have to unlock, wait for
* it and then restart.
*/
if (pg->flags & PG_BUSY) {
uvm_pagewait(pg, amap->am_lock, "cownow");
goto ReStart;
}
/*
* Perform a copy-on-write.
* First - get a new anon and a page.
*/
nanon = uvm_analloc();
if (nanon) {
nanon->an_lock = amap->am_lock;
npg = uvm_pagealloc(NULL, 0, nanon, 0);
} else {
npg = NULL;
}
if (nanon == NULL || npg == NULL) {
amap_unlock(amap);
if (nanon) {
nanon->an_lock = NULL;
nanon->an_ref--;
KASSERT(nanon->an_ref == 0);
uvm_anfree(nanon);
}
uvm_wait("cownowpage");
goto ReStart;
}
/*
* Copy the data and replace anon with the new one.
* Also, set up its lock (shared with the amap's lock).
*/
uvm_pagecopy(pg, npg);
anon->an_ref--;
KASSERT(anon->an_ref > 0);
amap->am_anon[slot] = nanon;
/*
* Drop PG_BUSY on new page. Since its owner was write
* locked all this time - it cannot be PG_RELEASED or
* waited on.
*/
uvm_pagelock(npg);
uvm_pageactivate(npg);
uvm_pageunlock(npg);
npg->flags &= ~(PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(npg, NULL);
}
amap_unlock(amap);
}
/*
* amap_splitref: split a single reference into two separate references
*
* => called from uvm_map's clip routines
* => origref's map should be locked
* => origref->ar_amap should be unlocked (we will lock)
*/
void
amap_splitref(struct vm_aref *origref, struct vm_aref *splitref, vaddr_t offset)
{
struct vm_amap *amap = origref->ar_amap;
u_int leftslots;
KASSERT(splitref->ar_amap == origref->ar_amap);
AMAP_B2SLOT(leftslots, offset);
KASSERT(leftslots != 0);
amap_lock(amap, RW_WRITER);
KASSERT(amap->am_nslot - origref->ar_pageoff - leftslots > 0);
#ifdef UVM_AMAP_PPREF
/* Establish ppref before we add a duplicate reference to the amap. */
if (amap->am_ppref == NULL) {
amap_pp_establish(amap, origref->ar_pageoff);
}
#endif
/* Note: not a share reference. */
amap->am_ref++;
splitref->ar_pageoff = origref->ar_pageoff + leftslots;
amap_unlock(amap);
}
#ifdef UVM_AMAP_PPREF
/*
* amap_pp_establish: add a ppref array to an amap, if possible.
*
* => amap should be locked by caller.
*/
void
amap_pp_establish(struct vm_amap *amap, vaddr_t offset)
{
const size_t sz = amap->am_maxslot * sizeof(*amap->am_ppref);
KASSERT(rw_write_held(amap->am_lock));
amap->am_ppref = kmem_zalloc(sz, KM_NOSLEEP);
if (amap->am_ppref == NULL) {
/* Failure - just do not use ppref. */
amap->am_ppref = PPREF_NONE;
return;
}
pp_setreflen(amap->am_ppref, 0, 0, offset);
pp_setreflen(amap->am_ppref, offset, amap->am_ref,
amap->am_nslot - offset);
}
/*
* amap_pp_adjref: adjust reference count to a part of an amap using the
* per-page reference count array.
*
* => caller must check that ppref != PPREF_NONE before calling.
* => map and amap must be locked.
*/
void
amap_pp_adjref(struct vm_amap *amap, int curslot, vsize_t slotlen, int adjval)
{
int stopslot, *ppref, lcv, prevlcv;
int ref, len, prevref, prevlen;
KASSERT(rw_write_held(amap->am_lock));
stopslot = curslot + slotlen;
ppref = amap->am_ppref;
prevlcv = 0;
/*
* Advance to the correct place in the array, fragment if needed.
*/
for (lcv = 0 ; lcv < curslot ; lcv += len) {
pp_getreflen(ppref, lcv, &ref, &len);
if (lcv + len > curslot) { /* goes past start? */
pp_setreflen(ppref, lcv, ref, curslot - lcv);
pp_setreflen(ppref, curslot, ref, len - (curslot - lcv));
len = curslot - lcv; /* new length of entry @ lcv */
}
prevlcv = lcv;
}
if (lcv == 0) {
/*
* Ensure that the "prevref == ref" test below always
* fails, since we are starting from the beginning of
* the ppref array; that is, there is no previous chunk.
*/
prevref = -1;
prevlen = 0;
} else {
pp_getreflen(ppref, prevlcv, &prevref, &prevlen);
}
/*
* Now adjust reference counts in range. Merge the first
* changed entry with the last unchanged entry if possible.
*/
KASSERT(lcv == curslot);
for (/* lcv already set */; lcv < stopslot ; lcv += len) {
pp_getreflen(ppref, lcv, &ref, &len);
if (lcv + len > stopslot) { /* goes past end? */
pp_setreflen(ppref, lcv, ref, stopslot - lcv);
pp_setreflen(ppref, stopslot, ref,
len - (stopslot - lcv));
len = stopslot - lcv;
}
ref += adjval;
KASSERT(ref >= 0);
KASSERT(ref <= amap->am_ref);
if (lcv == prevlcv + prevlen && ref == prevref) {
pp_setreflen(ppref, prevlcv, ref, prevlen + len);
} else {
pp_setreflen(ppref, lcv, ref, len);
}
if (ref == 0) {
amap_wiperange(amap, lcv, len);
}
}
}
/*
* amap_wiperange: wipe out a range of an amap.
* Note: different from amap_wipeout because the amap is kept intact.
*
* => Both map and amap must be locked by caller.
*/
void
amap_wiperange(struct vm_amap *amap, int slotoff, int slots)
{
u_int lcv, stop, slotend;
bool byanon;
KASSERT(rw_write_held(amap->am_lock));
/*
* We can either traverse the amap by am_anon or by am_slots.
* Determine which way is less expensive.
*/
if (slots < amap->am_nused) {
byanon = true;
lcv = slotoff;
stop = slotoff + slots;
slotend = 0;
} else {
byanon = false;
lcv = 0;
stop = amap->am_nused;
slotend = slotoff + slots;
}
while (lcv < stop) {
struct vm_anon *anon;
u_int curslot, ptr, last;
if (byanon) {
curslot = lcv++; /* lcv advances here */
if (amap->am_anon[curslot] == NULL)
continue;
} else {
curslot = amap->am_slots[lcv];
if (curslot < slotoff || curslot >= slotend) {
lcv++; /* lcv advances here */
continue;
}
stop--; /* drop stop, since anon will be removed */
}
anon = amap->am_anon[curslot];
KASSERT(anon->an_lock == amap->am_lock);
/*
* Remove anon from the amap.
*/
amap->am_anon[curslot] = NULL;
ptr = amap->am_bckptr[curslot];
last = amap->am_nused - 1;
if (ptr != last) {
amap->am_slots[ptr] = amap->am_slots[last];
amap->am_bckptr[amap->am_slots[ptr]] = ptr;
}
amap->am_nused--;
/*
* Drop its reference count.
*/
KASSERT(anon->an_lock == amap->am_lock);
if (--anon->an_ref == 0) {
uvm_anfree(anon);
}
}
}
#endif
#if defined(VMSWAP)
/*
* amap_swap_off: pagein anonymous pages in amaps and drop swap slots.
*
* => called with swap_syscall_lock held.
* => note that we don't always traverse all anons.
* eg. amaps being wiped out, released anons.
* => return true if failed.
*/
bool
amap_swap_off(int startslot, int endslot)
{
struct vm_amap *am;
struct vm_amap *am_next;
struct vm_amap marker_prev;
struct vm_amap marker_next;
bool rv = false;
#if defined(DIAGNOSTIC)
memset(&marker_prev, 0, sizeof(marker_prev));
memset(&marker_next, 0, sizeof(marker_next));
#endif /* defined(DIAGNOSTIC) */
mutex_enter(&amap_list_lock);
for (am = LIST_FIRST(&amap_list); am != NULL && !rv; am = am_next) {
int i;
LIST_INSERT_BEFORE(am, &marker_prev, am_list);
LIST_INSERT_AFTER(am, &marker_next, am_list);
/* amap_list_lock prevents the lock pointer from changing. */
if (!amap_lock_try(am, RW_WRITER)) {
(void)kpause("amapswpo", false, 1, &amap_list_lock);
am_next = LIST_NEXT(&marker_prev, am_list);
if (am_next == &marker_next) {
am_next = LIST_NEXT(am_next, am_list);
} else {
KASSERT(LIST_NEXT(am_next, am_list) ==
&marker_next);
}
LIST_REMOVE(&marker_prev, am_list);
LIST_REMOVE(&marker_next, am_list);
continue;
}
mutex_exit(&amap_list_lock);
/* If am_nused == 0, the amap could be free - careful. */
for (i = 0; i < am->am_nused; i++) {
int slot;
int swslot;
struct vm_anon *anon;
slot = am->am_slots[i];
anon = am->am_anon[slot];
KASSERT(anon->an_lock == am->am_lock);
swslot = anon->an_swslot;
if (swslot < startslot || endslot <= swslot) {
continue;
}
am->am_flags |= AMAP_SWAPOFF;
rv = uvm_anon_pagein(am, anon);
amap_lock(am, RW_WRITER);
am->am_flags &= ~AMAP_SWAPOFF;
if (amap_refs(am) == 0) {
amap_wipeout(am);
am = NULL;
break;
}
if (rv) {
break;
}
i = 0;
}
if (am) {
amap_unlock(am);
}
mutex_enter(&amap_list_lock);
KASSERT(LIST_NEXT(&marker_prev, am_list) == &marker_next ||
LIST_NEXT(LIST_NEXT(&marker_prev, am_list), am_list) ==
&marker_next);
am_next = LIST_NEXT(&marker_next, am_list);
LIST_REMOVE(&marker_prev, am_list);
LIST_REMOVE(&marker_next, am_list);
}
mutex_exit(&amap_list_lock);
return rv;
}
#endif /* defined(VMSWAP) */
/*
* amap_lookup: look up a page in an amap.
*
* => amap should be locked by caller.
*/
struct vm_anon *
amap_lookup(struct vm_aref *aref, vaddr_t offset)
{
struct vm_amap *amap = aref->ar_amap;
struct vm_anon *an;
u_int slot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(rw_lock_held(amap->am_lock));
AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
an = amap->am_anon[slot];
UVMHIST_LOG(maphist,
"<- done (amap=%#jx, offset=%#jx, result=%#jx)",
(uintptr_t)amap, offset, (uintptr_t)an, 0);
KASSERT(slot < amap->am_nslot);
KASSERT(an == NULL || an->an_ref != 0);
KASSERT(an == NULL || an->an_lock == amap->am_lock);
return an;
}
/*
* amap_lookups: look up a range of pages in an amap.
*
* => amap should be locked by caller.
*/
void
amap_lookups(struct vm_aref *aref, vaddr_t offset, struct vm_anon **anons,
int npages)
{
struct vm_amap *amap = aref->ar_amap;
u_int slot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(rw_lock_held(amap->am_lock));
AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
UVMHIST_LOG(maphist, " slot=%u, npages=%d, nslot=%d",
slot, npages, amap->am_nslot, 0);
KASSERT((slot + (npages - 1)) < amap->am_nslot);
memcpy(anons, &amap->am_anon[slot], npages * sizeof(struct vm_anon *));
#if defined(DIAGNOSTIC)
for (int i = 0; i < npages; i++) {
struct vm_anon * const an = anons[i];
if (an == NULL) {
continue;
}
KASSERT(an->an_ref != 0);
KASSERT(an->an_lock == amap->am_lock);
}
#endif
UVMHIST_LOG(maphist, "<- done", 0, 0, 0, 0);
}
/*
* amap_add: add (or replace) a page to an amap.
*
* => amap should be locked by caller.
* => anon must have the lock associated with this amap.
*/
void
amap_add(struct vm_aref *aref, vaddr_t offset, struct vm_anon *anon,
bool replace)
{
struct vm_amap *amap = aref->ar_amap;
u_int slot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(rw_write_held(amap->am_lock));
KASSERT(anon->an_lock == amap->am_lock);
AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
KASSERT(slot < amap->am_nslot);
if (replace) {
struct vm_anon *oanon = amap->am_anon[slot];
KASSERT(oanon != NULL);
if (oanon->an_page && (amap->am_flags & AMAP_SHARED) != 0) {
pmap_page_protect(oanon->an_page, VM_PROT_NONE);
/*
* XXX: suppose page is supposed to be wired somewhere?
*/
}
} else {
KASSERT(amap->am_anon[slot] == NULL);
KASSERT(amap->am_nused < amap->am_maxslot);
amap->am_bckptr[slot] = amap->am_nused;
amap->am_slots[amap->am_nused] = slot;
amap->am_nused++;
}
amap->am_anon[slot] = anon;
UVMHIST_LOG(maphist,
"<- done (amap=%#jx, offset=%#x, anon=%#jx, rep=%d)",
(uintptr_t)amap, offset, (uintptr_t)anon, replace);
}
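/*
 * Illustrative usage sketch: installing an anon into an amap. The amap's
 * rwlock must be write-held and the anon must already share that lock;
 * the helper name below is hypothetical.
 */
#if 0
static void
example_amap_add(struct vm_aref *aref, vaddr_t off, struct vm_anon *anon)
{
struct vm_amap *amap = aref->ar_amap;
amap_lock(amap, RW_WRITER);
KASSERT(anon->an_lock == amap->am_lock);
amap_add(aref, off, anon, false); /* false: add, do not replace */
amap_unlock(amap);
}
#endif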
/*
* amap_unadd: remove a page from an amap.
*
* => amap should be locked by caller.
*/
void
amap_unadd(struct vm_aref *aref, vaddr_t offset)
{
struct vm_amap *amap = aref->ar_amap;
u_int slot, ptr, last;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(rw_write_held(amap->am_lock));
AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
KASSERT(slot < amap->am_nslot);
KASSERT(amap->am_anon[slot] != NULL);
KASSERT(amap->am_anon[slot]->an_lock == amap->am_lock);
amap->am_anon[slot] = NULL;
ptr = amap->am_bckptr[slot];
last = amap->am_nused - 1;
if (ptr != last) {
/* Move the last entry to keep the slots contiguous. */
amap->am_slots[ptr] = amap->am_slots[last];
amap->am_bckptr[amap->am_slots[ptr]] = ptr;
}
amap->am_nused--;
UVMHIST_LOG(maphist, "<- done (amap=%#jx, slot=%#jx)",
(uintptr_t)amap, slot,0, 0);
}
/*
* amap_adjref_anons: adjust the reference count(s) on amap and its anons.
*/
static void
amap_adjref_anons(struct vm_amap *amap, vaddr_t offset, vsize_t len,
int refv, bool all)
{
#ifdef UVM_AMAP_PPREF
KASSERT(rw_write_held(amap->am_lock));
/*
* We must establish the ppref array before changing am_ref
* so that the ppref values match the current amap refcount.
*/
if (amap->am_ppref == NULL) {
amap_pp_establish(amap, offset);
}
#endif
amap->am_ref += refv;
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
amap_pp_adjref(amap, offset, len, refv);
}
#endif
amap_unlock(amap);
}
/*
* amap_ref: gain a reference to an amap.
*
* => amap must not be locked (we will lock).
* => "offset" and "len" are in units of pages.
* => Called at fork time to gain the child's reference.
*/
void
amap_ref(struct vm_amap *amap, vaddr_t offset, vsize_t len, int flags)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
amap_lock(amap, RW_WRITER);
if (flags & AMAP_SHARED) {
amap->am_flags |= AMAP_SHARED;
}
amap_adjref_anons(amap, offset, len, 1, (flags & AMAP_REFALL) != 0);
UVMHIST_LOG(maphist,"<- done! amap=%#jx", (uintptr_t)amap, 0, 0, 0);
}
/*
* amap_unref: remove a reference to an amap.
*
* => All pmap-level references to this amap must be already removed.
* => Called from uvm_unmap_detach(); entry is already removed from the map.
* => We will lock amap, so it must be unlocked.
*/
void
amap_unref(struct vm_amap *amap, vaddr_t offset, vsize_t len, bool all)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
amap_lock(amap, RW_WRITER);
UVMHIST_LOG(maphist," amap=%#jx refs=%d, nused=%d",
(uintptr_t)amap, amap->am_ref, amap->am_nused, 0);
KASSERT(amap->am_ref > 0);
if (amap->am_ref == 1) {
/*
* If the last reference - wipeout and destroy the amap.
*/
amap->am_ref--;
amap_wipeout(amap);
UVMHIST_LOG(maphist,"<- done (was last ref)!", 0, 0, 0, 0);
return;
}
/*
* Otherwise, drop the reference count(s) on anons.
*/
if (amap->am_ref == 2 && (amap->am_flags & AMAP_SHARED) != 0) {
amap->am_flags &= ~AMAP_SHARED;
}
amap_adjref_anons(amap, offset, len, -1, all);
UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
}
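/*
 * Illustrative sketch: gaining and later dropping a reference on an amap
 * over the same page range. The offset/len values are placeholders; the
 * "all" argument of amap_unref() mirrors the AMAP_REFALL behaviour of
 * amap_ref().
 */
#if 0
static void
example_amap_ref_cycle(struct vm_amap *amap, vaddr_t off, vsize_t len)
{
amap_ref(amap, off, len, AMAP_SHARED | AMAP_REFALL);
/* ... the new mapping is in use ... */
amap_unref(amap, off, len, true);
}
#endif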
/* $NetBSD: subr_xcall.c,v 1.38 2024/03/01 04:32:38 mrg Exp $ */
/*-
* Copyright (c) 2007-2010, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran and Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Cross call support
*
* Background
*
* Sometimes it is necessary to modify hardware state that is tied
* directly to individual CPUs (such as a CPU's local timer), and
* these updates can not be done remotely by another CPU. The LWP
* requesting the update may be unable to guarantee that it will be
* running on the CPU where the update must occur, when the update
* occurs.
*
* Additionally, it's sometimes necessary to modify per-CPU software
* state from a remote CPU. Where these update operations are so
* rare or the access to the per-CPU data so frequent that the cost
* of using locking or atomic operations to provide coherency is
* prohibitive, another way must be found.
*
* Cross calls help to solve these types of problem by allowing
* any LWP in the system to request that an arbitrary function be
* executed on a specific CPU.
*
* Implementation
*
* A slow mechanism for making low priority cross calls is
* provided. The function to be executed runs on the remote CPU
* within a bound kthread. No queueing is provided, and the
* implementation uses global state. The function being called may
* block briefly on locks, but in doing so must be careful to not
* interfere with other cross calls in the system. The function is
* called with thread context and not from a soft interrupt, so it
* can ensure that it is not interrupting other code running on the
* CPU, and so has exclusive access to the CPU. Since this facility
* is heavyweight, it's expected that it will not be used often.
*
* Cross calls must not allocate memory, as the pagedaemon uses cross
* calls (and memory allocation may need to wait on the pagedaemon).
*
* A low-overhead mechanism for high priority calls (XC_HIGHPRI) is
* also provided. The function to be executed runs in software
* interrupt context at IPL_SOFTSERIAL level, and is expected to
* be very lightweight, e.g. avoid blocking.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_xcall.c,v 1.38 2024/03/01 04:32:38 mrg Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/xcall.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/evcnt.h>
#include <sys/kthread.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#ifdef _RUMPKERNEL
#include "rump_private.h"
#endif
/* Cross-call state box. */
typedef struct {
kmutex_t xc_lock;
kcondvar_t xc_busy;
xcfunc_t xc_func;
void * xc_arg1;
void * xc_arg2;
uint64_t xc_headp;
uint64_t xc_donep;
unsigned int xc_ipl;
} xc_state_t;
/* Bit indicating high (1) or low (0) priority. */
#define XC_PRI_BIT (1ULL << 63)
/* Low priority xcall structures. */
static xc_state_t xc_low_pri __cacheline_aligned;
/* High priority xcall structures. */
static xc_state_t xc_high_pri __cacheline_aligned;
static void * xc_sihs[4] __cacheline_aligned;
/* Event counters. */
static struct evcnt xc_unicast_ev __cacheline_aligned;
static struct evcnt xc_broadcast_ev __cacheline_aligned;
static void xc_init(void);
static void xc_thread(void *);
static inline uint64_t xc_highpri(xcfunc_t, void *, void *, struct cpu_info *,
unsigned int);
static inline uint64_t xc_lowpri(xcfunc_t, void *, void *, struct cpu_info *);
/* The internal form of IPL */
#define XC_IPL_MASK 0xff00
/*
* Assign 0 to XC_IPL_SOFTSERIAL to treat IPL_SOFTSERIAL as the default value
* (just XC_HIGHPRI).
*/
#define XC_IPL_SOFTSERIAL 0
#define XC_IPL_SOFTNET 1
#define XC_IPL_SOFTBIO 2
#define XC_IPL_SOFTCLOCK 3
#define XC_IPL_MAX XC_IPL_SOFTCLOCK
CTASSERT(XC_IPL_MAX <= __arraycount(xc_sihs));
/*
* xc_init:
*
* Initialize low and high priority cross-call structures.
*/
static void
xc_init(void)
{
xc_state_t *xclo = &xc_low_pri, *xchi = &xc_high_pri;
memset(xclo, 0, sizeof(xc_state_t));
mutex_init(&xclo->xc_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&xclo->xc_busy, "xclow");
memset(xchi, 0, sizeof(xc_state_t));
mutex_init(&xchi->xc_lock, MUTEX_DEFAULT, IPL_SOFTSERIAL);
cv_init(&xchi->xc_busy, "xchigh");
/* Set up a softint for each IPL_SOFT*. */
#define SETUP_SOFTINT(xipl, sipl) do { \
xc_sihs[(xipl)] = softint_establish( (sipl) | SOFTINT_MPSAFE,\
xc__highpri_intr, NULL); \
KASSERT(xc_sihs[(xipl)] != NULL); \
} while (0)
SETUP_SOFTINT(XC_IPL_SOFTSERIAL, SOFTINT_SERIAL);
/*
* If an IPL_SOFTXXX has the same value as the previous one, we don't use
* the IPL (see xc_encode_ipl), so we don't need to allocate a softint
* for it.
*/
#if IPL_SOFTNET != IPL_SOFTSERIAL
SETUP_SOFTINT(XC_IPL_SOFTNET, SOFTINT_NET);
#endif
#if IPL_SOFTBIO != IPL_SOFTNET
SETUP_SOFTINT(XC_IPL_SOFTBIO, SOFTINT_BIO);
#endif
#if IPL_SOFTCLOCK != IPL_SOFTBIO
SETUP_SOFTINT(XC_IPL_SOFTCLOCK, SOFTINT_CLOCK);
#endif
#undef SETUP_SOFTINT
evcnt_attach_dynamic(&xc_unicast_ev, EVCNT_TYPE_MISC, NULL,
"crosscall", "unicast");
evcnt_attach_dynamic(&xc_broadcast_ev, EVCNT_TYPE_MISC, NULL,
"crosscall", "broadcast");
}
/*
* Encode an IPL to a form that can be embedded into flags of xc_broadcast
* or xc_unicast.
*/
unsigned int
xc_encode_ipl(int ipl)
{
switch (ipl) {
case IPL_SOFTSERIAL:
return __SHIFTIN(XC_IPL_SOFTSERIAL, XC_IPL_MASK);
/* IPL_SOFT* can be the same value (e.g., on sparc or mips). */
#if IPL_SOFTNET != IPL_SOFTSERIAL
case IPL_SOFTNET:
return __SHIFTIN(XC_IPL_SOFTNET, XC_IPL_MASK);
#endif
#if IPL_SOFTBIO != IPL_SOFTNET
case IPL_SOFTBIO:
return __SHIFTIN(XC_IPL_SOFTBIO, XC_IPL_MASK);
#endif
#if IPL_SOFTCLOCK != IPL_SOFTBIO
case IPL_SOFTCLOCK:
return __SHIFTIN(XC_IPL_SOFTCLOCK, XC_IPL_MASK);
#endif
}
panic("Invalid IPL: %d", ipl);
}
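/*
 * Illustrative example: a high priority cross call that wants its softint
 * to run at IPL_SOFTNET would embed the encoded IPL in the flags word,
 * e.g. xc_broadcast(XC_HIGHPRI | xc_encode_ipl(IPL_SOFTNET), func, a1, a2).
 * xc_extract_ipl() below recovers the XC_IPL_* value from those flags.
 */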
/*
* Extract an XC_IPL from flags of xc_broadcast or xc_unicast.
*/
static inline unsigned int
xc_extract_ipl(unsigned int flags)
{
return __SHIFTOUT(flags, XC_IPL_MASK);
}
/*
* xc_init_cpu:
*
* Initialize the cross-call subsystem. Called once for each CPU
* in the system as they are attached.
*/
void
xc_init_cpu(struct cpu_info *ci)
{
static bool again = false;
int error __diagused;
if (!again) {
/* Autoconfiguration will prevent re-entry. */
xc_init();
again = true;
}
cv_init(&ci->ci_data.cpu_xcall, "xcall");
error = kthread_create(PRI_XCALL, KTHREAD_MPSAFE, ci, xc_thread,
NULL, NULL, "xcall/%u", ci->ci_index);
KASSERT(error == 0);
}
/*
* xc_broadcast:
*
* Trigger a call on all CPUs in the system.
*/
uint64_t
xc_broadcast(unsigned int flags, xcfunc_t func, void *arg1, void *arg2)
{
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
ASSERT_SLEEPABLE();
if (__predict_false(!mp_online)) {
int s, bound;
if (flags & XC_HIGHPRI)
s = splsoftserial();
else
bound = curlwp_bind();
(*func)(arg1, arg2);
if (flags & XC_HIGHPRI)
splx(s);
else
curlwp_bindx(bound);
return 0;
}
if ((flags & XC_HIGHPRI) != 0) {
int ipl = xc_extract_ipl(flags);
return xc_highpri(func, arg1, arg2, NULL, ipl);
} else {
return xc_lowpri(func, arg1, arg2, NULL);
}
}
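/*
 * Illustrative usage sketch: run a function once on every CPU and wait for
 * completion. The example_* names are hypothetical.
 */
#if 0
static void
example_func(void *arg1, void *arg2)
{
/* Runs on each CPU; for a low priority call, in thread context. */
}

static void
example_run_everywhere(void)
{
uint64_t ticket;
ticket = xc_broadcast(0, example_func, NULL, NULL);
xc_wait(ticket); /* block until every CPU has run example_func */
}
#endif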
static void
xc_nop(void *arg1, void *arg2)
{
return;
}
/*
* xc_barrier:
*
* Broadcast a nop to all CPUs in the system.
*/
void
xc_barrier(unsigned int flags)
{
uint64_t where;
where = xc_broadcast(flags, xc_nop, NULL, NULL);
xc_wait(where);
}
/*
* xc_unicast:
*
* Trigger a call on one CPU.
*/
uint64_t
xc_unicast(unsigned int flags, xcfunc_t func, void *arg1, void *arg2,
struct cpu_info *ci)
{
KASSERT(ci != NULL);
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
ASSERT_SLEEPABLE();
if (__predict_false(!mp_online)) {
int s, bound;
KASSERT(ci == curcpu());
if (flags & XC_HIGHPRI)
s = splsoftserial();
else
bound = curlwp_bind();
(*func)(arg1, arg2);
if (flags & XC_HIGHPRI)
splx(s);
else
curlwp_bindx(bound);
return 0;
}
if ((flags & XC_HIGHPRI) != 0) {
int ipl = xc_extract_ipl(flags);
return xc_highpri(func, arg1, arg2, ci, ipl);
} else {
return xc_lowpri(func, arg1, arg2, ci);
}
}
/*
* xc_wait:
*
* Wait for a cross call to complete.
*/
void
xc_wait(uint64_t where)
{
xc_state_t *xc;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
ASSERT_SLEEPABLE();
if (__predict_false(!mp_online)) {
return;
}
/* Determine whether it is high or low priority cross-call. */
if ((where & XC_PRI_BIT) != 0) {
xc = &xc_high_pri;
where &= ~XC_PRI_BIT;
} else {
xc = &xc_low_pri;
}
#ifdef __HAVE_ATOMIC64_LOADSTORE
/* Fast path, if already done. */
if (atomic_load_acquire(&xc->xc_donep) >= where) {
return;
}
#endif
/* Slow path: block until awoken. */
mutex_enter(&xc->xc_lock);
while (xc->xc_donep < where) {
cv_wait(&xc->xc_busy, &xc->xc_lock);
}
mutex_exit(&xc->xc_lock);
}
/*
* xc_lowpri:
*
* Trigger a low priority call on one or more CPUs.
*/
static inline uint64_t
xc_lowpri(xcfunc_t func, void *arg1, void *arg2, struct cpu_info *ci)
{
xc_state_t *xc = &xc_low_pri;
CPU_INFO_ITERATOR cii;
uint64_t where;
mutex_enter(&xc->xc_lock);
while (xc->xc_headp != xc->xc_donep) {
cv_wait(&xc->xc_busy, &xc->xc_lock);
}
xc->xc_arg1 = arg1;
xc->xc_arg2 = arg2;
xc->xc_func = func;
if (ci == NULL) {
xc_broadcast_ev.ev_count++;
for (CPU_INFO_FOREACH(cii, ci)) {
if ((ci->ci_schedstate.spc_flags & SPCF_RUNNING) == 0)
continue;
xc->xc_headp += 1;
ci->ci_data.cpu_xcall_pending = true;
cv_signal(&ci->ci_data.cpu_xcall);
}
} else {
xc_unicast_ev.ev_count++;
xc->xc_headp += 1;
ci->ci_data.cpu_xcall_pending = true;
cv_signal(&ci->ci_data.cpu_xcall);
}
KASSERT(xc->xc_donep < xc->xc_headp);
where = xc->xc_headp;
mutex_exit(&xc->xc_lock);
/* Return a low priority ticket. */
KASSERT((where & XC_PRI_BIT) == 0);
return where;
}
/*
* xc_thread:
*
* One thread per-CPU to dispatch low priority calls.
*/
static void
xc_thread(void *cookie)
{
struct cpu_info *ci = curcpu();
xc_state_t *xc = &xc_low_pri;
void *arg1, *arg2;
xcfunc_t func;
struct lwp *l = curlwp;
KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d",
l, l->l_nopreempt);
mutex_enter(&xc->xc_lock);
for (;;) {
while (!ci->ci_data.cpu_xcall_pending) {
if (xc->xc_headp == xc->xc_donep) {
cv_broadcast(&xc->xc_busy);
}
cv_wait(&ci->ci_data.cpu_xcall, &xc->xc_lock);
KASSERT(ci == curcpu());
}
ci->ci_data.cpu_xcall_pending = false;
func = xc->xc_func;
arg1 = xc->xc_arg1;
arg2 = xc->xc_arg2;
mutex_exit(&xc->xc_lock);
KASSERT(func != NULL);
(*func)(arg1, arg2);
KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d func %p",
l, l->l_nopreempt, func);
mutex_enter(&xc->xc_lock);
#ifdef __HAVE_ATOMIC64_LOADSTORE
atomic_store_release(&xc->xc_donep, xc->xc_donep + 1);
#else
xc->xc_donep++;
#endif
}
/* NOTREACHED */
}
/*
* xc_ipi_handler:
*
* Handler of cross-call IPI.
*/
void
xc_ipi_handler(void)
{
xc_state_t *xc = &xc_high_pri;
KASSERT(xc->xc_ipl < __arraycount(xc_sihs));
KASSERT(xc_sihs[xc->xc_ipl] != NULL);
/* Executes xc__highpri_intr() via software interrupt. */
softint_schedule(xc_sihs[xc->xc_ipl]);
}
/*
* xc__highpri_intr:
*
* A software interrupt handler for high priority calls.
*/
void
xc__highpri_intr(void *dummy)
{
xc_state_t *xc = &xc_high_pri;
void *arg1, *arg2;
xcfunc_t func;
KASSERTMSG(!cpu_intr_p(), "high priority xcall for function %p",
xc->xc_func);
/*
* Lock-less fetch of function and its arguments.
* Safe since it cannot change at this point.
*/
func = xc->xc_func;
arg1 = xc->xc_arg1;
arg2 = xc->xc_arg2;
KASSERT(func != NULL);
(*func)(arg1, arg2);
/*
* Note the request as done, and if we have reached the head,
* cross-call has been processed - notify waiters, if any.
*/
mutex_enter(&xc->xc_lock);
KASSERT(xc->xc_donep < xc->xc_headp);
#ifdef __HAVE_ATOMIC64_LOADSTORE
atomic_store_release(&xc->xc_donep, xc->xc_donep + 1);
#else
xc->xc_donep++;
#endif
if (xc->xc_donep == xc->xc_headp) {
cv_broadcast(&xc->xc_busy);
}
mutex_exit(&xc->xc_lock);
}
/*
* xc_highpri:
*
* Trigger a high priority call on one or more CPUs.
*/
static inline uint64_t
xc_highpri(xcfunc_t func, void *arg1, void *arg2, struct cpu_info *ci,
unsigned int ipl)
{
xc_state_t *xc = &xc_high_pri;
uint64_t where;
mutex_enter(&xc->xc_lock);
while (xc->xc_headp != xc->xc_donep) {
cv_wait(&xc->xc_busy, &xc->xc_lock);
}
xc->xc_func = func;
xc->xc_arg1 = arg1;
xc->xc_arg2 = arg2;
xc->xc_headp += (ci ? 1 : ncpu);
xc->xc_ipl = ipl;
where = xc->xc_headp;
mutex_exit(&xc->xc_lock);
/*
* Send the IPI once lock is released.
* Note: it will handle the local CPU case.
*/
#ifdef _RUMPKERNEL
rump_xc_highpri(ci);
#else
#ifdef MULTIPROCESSOR
kpreempt_disable();
if (curcpu() == ci) {
/* Unicast: local CPU. */
xc_ipi_handler();
} else if (ci) {
/* Unicast: remote CPU. */
xc_send_ipi(ci);
} else {
/* Broadcast: all, including local. */
xc_send_ipi(NULL);
xc_ipi_handler();
}
kpreempt_enable();
#else
KASSERT(ci == NULL || curcpu() == ci);
xc_ipi_handler();
#endif
#endif
/* Indicate a high priority ticket. */
return (where | XC_PRI_BIT);
}
/* $NetBSD: vfs_hooks.c,v 1.6 2009/03/15 17:14:40 cegger Exp $ */
/*-
* Copyright (c) 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* VFS hooks.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_hooks.c,v 1.6 2009/03/15 17:14:40 cegger Exp $");
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/mount.h>
#include <sys/mutex.h>
LIST_HEAD(vfs_hooks_head, vfs_hooks) vfs_hooks_head =
LIST_HEAD_INITIALIZER(vfs_hooks_head);
kmutex_t vfs_hooks_lock;
void
vfs_hooks_init(void)
{
mutex_init(&vfs_hooks_lock, MUTEX_DEFAULT, IPL_NONE);
}
int
vfs_hooks_attach(struct vfs_hooks *vfs_hooks)
{
mutex_enter(&vfs_hooks_lock);
LIST_INSERT_HEAD(&vfs_hooks_head, vfs_hooks, vfs_hooks_list);
mutex_exit(&vfs_hooks_lock);
return (0);
}
int
vfs_hooks_detach(struct vfs_hooks *vfs_hooks)
{
struct vfs_hooks *hp;
int ret = 0;
mutex_enter(&vfs_hooks_lock);
LIST_FOREACH(hp, &vfs_hooks_head, vfs_hooks_list) {
if (hp == vfs_hooks) {
LIST_REMOVE(hp, vfs_hooks_list);
break;
}
}
if (hp == NULL)
ret = ESRCH;
mutex_exit(&vfs_hooks_lock);
return (ret);
}
/*
* Macro to be used in one of the vfs_hooks_* function for hooks that
* return an error code. Calls will stop as soon as one of the hooks
* fails.
*/
#define VFS_HOOKS_W_ERROR(func, fargs, hook, hargs) \
int \
func fargs \
{ \
int error; \
struct vfs_hooks *hp; \
\
error = EJUSTRETURN; \
\
mutex_enter(&vfs_hooks_lock); \
LIST_FOREACH(hp, &vfs_hooks_head, vfs_hooks_list) { \
if (hp-> hook != NULL) { \
error = hp-> hook hargs; \
if (error != 0) \
break; \
} \
} \
mutex_exit(&vfs_hooks_lock); \
\
return error; \
}
/*
* Macro to be used in one of the vfs_hooks_* function for hooks that
* do not return any error code. All hooks will be executed
* unconditionally.
*/
#define VFS_HOOKS_WO_ERROR(func, fargs, hook, hargs) \
void \
func fargs \
{ \
struct vfs_hooks *hp; \
\
mutex_enter(&vfs_hooks_lock); \
LIST_FOREACH(hp, &vfs_hooks_head, vfs_hooks_list) { \
if (hp-> hook != NULL) \
hp-> hook hargs; \
} \
mutex_exit(&vfs_hooks_lock); \
}
/*
* Routines to iterate over VFS hooks lists and execute them.
*/
VFS_HOOKS_WO_ERROR(vfs_hooks_unmount, (struct mount *mp), vh_unmount, (mp));
VFS_HOOKS_W_ERROR(vfs_hooks_reexport, (struct mount *mp, const char *path, void *data), vh_reexport, (mp, path, data));
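/*
 * Illustrative usage sketch: a file system registering a hook set at load
 * time and removing it again later. The example_* names are hypothetical.
 */
#if 0
static void
example_unmount_hook(struct mount *mp)
{
/* Runs for every registered hook set when a mount goes away. */
}

static struct vfs_hooks example_hooks = {
.vh_unmount = example_unmount_hook,
};

static void
example_register(void)
{
vfs_hooks_attach(&example_hooks);
/* ... */
vfs_hooks_detach(&example_hooks);
}
#endif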
/* $NetBSD: mld6.c,v 1.101 2019/09/25 09:53:38 ozaki-r Exp $ */
/* $KAME: mld6.c,v 1.25 2001/01/16 14:14:18 itojun Exp $ */
/*
* Copyright (C) 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)igmp.c 8.1 (Berkeley) 7/19/93
*/
/*
* Copyright (c) 1988 Stephen Deering.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)igmp.c 8.1 (Berkeley) 7/19/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: mld6.c,v 1.101 2019/09/25 09:53:38 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/callout.h>
#include <sys/cprng.h>
#include <sys/rwlock.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/icmp6_private.h>
#include <netinet6/mld6_var.h>
static krwlock_t in6_multilock __cacheline_aligned;
/*
* Protocol constants
*/
/*
* time between repetitions of a node's initial report of interest in a
* multicast address(in seconds)
*/
#define MLD_UNSOLICITED_REPORT_INTERVAL 10
static struct ip6_pktopts ip6_opts;
static void mld_start_listening(struct in6_multi *);
static void mld_stop_listening(struct in6_multi *);
static struct mld_hdr *mld_allocbuf(struct mbuf **, struct in6_multi *, int);
static void mld_sendpkt(struct in6_multi *, int, const struct in6_addr *);
static void mld_starttimer(struct in6_multi *);
static void mld_stoptimer(struct in6_multi *);
static u_long mld_timerresid(struct in6_multi *);
static void in6m_ref(struct in6_multi *);
static void in6m_unref(struct in6_multi *);
static void in6m_destroy(struct in6_multi *);
void
mld_init(void)
{
static u_int8_t hbh_buf[8];
struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf;
u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD);
/* ip6h_nxt will be filled in later */
hbh->ip6h_len = 0; /* (8 >> 3) - 1 */
/* XXX: grotty hard coding... */
hbh_buf[2] = IP6OPT_PADN; /* 2 byte padding */
hbh_buf[3] = 0;
hbh_buf[4] = IP6OPT_RTALERT;
hbh_buf[5] = IP6OPT_RTALERT_LEN - 2;
memcpy(&hbh_buf[6], (void *)&rtalert_code, sizeof(u_int16_t));
ip6_opts.ip6po_hbh = hbh;
/* We will specify the hoplimit by a multicast option. */
ip6_opts.ip6po_hlim = -1;
ip6_opts.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER;
rw_init(&in6_multilock);
}
static void
mld_starttimer(struct in6_multi *in6m)
{
struct timeval now;
KASSERT(rw_write_held(&in6_multilock));
KASSERTMSG(in6m->in6m_timer != IN6M_TIMER_UNDEF,
"in6m_timer=%d", in6m->in6m_timer);
microtime(&now);
in6m->in6m_timer_expire.tv_sec = now.tv_sec + in6m->in6m_timer / hz;
in6m->in6m_timer_expire.tv_usec = now.tv_usec +
(in6m->in6m_timer % hz) * (1000000 / hz);
if (in6m->in6m_timer_expire.tv_usec > 1000000) {
in6m->in6m_timer_expire.tv_sec++;
in6m->in6m_timer_expire.tv_usec -= 1000000;
}
/* start or restart the timer */
callout_schedule(&in6m->in6m_timer_ch, in6m->in6m_timer);
}
/*
* mld_stoptimer releases in6_multilock when calling callout_halt.
* The caller must ensure in6m won't be freed while releasing the lock.
*/
static void
mld_stoptimer(struct in6_multi *in6m)
{
KASSERT(rw_write_held(&in6_multilock));
if (in6m->in6m_timer == IN6M_TIMER_UNDEF)
return;
rw_exit(&in6_multilock);
callout_halt(&in6m->in6m_timer_ch, NULL);
rw_enter(&in6_multilock, RW_WRITER);
in6m->in6m_timer = IN6M_TIMER_UNDEF;
}
static void
mld_timeo(void *arg)
{
struct in6_multi *in6m = arg;
KASSERTMSG(in6m->in6m_refcount > 0, "in6m_refcount=%d",
in6m->in6m_refcount);
KERNEL_LOCK_UNLESS_NET_MPSAFE();
rw_enter(&in6_multilock, RW_WRITER);
if (in6m->in6m_timer == IN6M_TIMER_UNDEF)
goto out;
in6m->in6m_timer = IN6M_TIMER_UNDEF;
switch (in6m->in6m_state) {
case MLD_REPORTPENDING:
mld_start_listening(in6m);
break;
default:
mld_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
break;
}
out:
rw_exit(&in6_multilock);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
static u_long
mld_timerresid(struct in6_multi *in6m)
{
struct timeval now, diff;
microtime(&now);
if (now.tv_sec > in6m->in6m_timer_expire.tv_sec ||
(now.tv_sec == in6m->in6m_timer_expire.tv_sec &&
now.tv_usec > in6m->in6m_timer_expire.tv_usec)) {
return (0);
}
diff = in6m->in6m_timer_expire;
diff.tv_sec -= now.tv_sec;
diff.tv_usec -= now.tv_usec;
if (diff.tv_usec < 0) {
diff.tv_sec--;
diff.tv_usec += 1000000;
}
/* return the remaining time in milliseconds */
return diff.tv_sec * 1000 + diff.tv_usec / 1000;
}
static void
mld_start_listening(struct in6_multi *in6m)
{
struct in6_addr all_in6;
KASSERT(rw_write_held(&in6_multilock));
/*
* RFC2710 page 10:
* The node never sends a Report or Done for the link-scope all-nodes
* address.
* MLD messages are never sent for multicast addresses whose scope is 0
* (reserved) or 1 (node-local).
*/
all_in6 = in6addr_linklocal_allnodes;
if (in6_setscope(&all_in6, in6m->in6m_ifp, NULL)) {
/* XXX: this should not happen! */
in6m->in6m_timer = 0;
in6m->in6m_state = MLD_OTHERLISTENER;
}
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_in6) ||
IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) {
in6m->in6m_timer = IN6M_TIMER_UNDEF;
in6m->in6m_state = MLD_OTHERLISTENER;
} else {
mld_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
in6m->in6m_timer = cprng_fast32() %
(MLD_UNSOLICITED_REPORT_INTERVAL * hz);
in6m->in6m_state = MLD_IREPORTEDLAST;
mld_starttimer(in6m);
}
}
static void
mld_stop_listening(struct in6_multi *in6m)
{
struct in6_addr allnode, allrouter;
KASSERT(rw_lock_held(&in6_multilock));
allnode = in6addr_linklocal_allnodes;
if (in6_setscope(&allnode, in6m->in6m_ifp, NULL)) {
/* XXX: this should not happen! */
return;
}
allrouter = in6addr_linklocal_allrouters;
if (in6_setscope(&allrouter, in6m->in6m_ifp, NULL)) {
/* XXX impossible */
return;
}
if (in6m->in6m_state == MLD_IREPORTEDLAST &&
(!IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &allnode)) &&
IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) >
IPV6_ADDR_SCOPE_INTFACELOCAL) {
mld_sendpkt(in6m, MLD_LISTENER_DONE, &allrouter);
}
}
void
mld_input(struct mbuf *m, int off)
{
struct ip6_hdr *ip6;
struct mld_hdr *mldh;
struct ifnet *ifp;
struct in6_multi *in6m = NULL;
struct in6_addr mld_addr, all_in6;
u_long timer = 0; /* timer value in the MLD query header */
struct psref psref;
ifp = m_get_rcvif_psref(m, &psref);
if (__predict_false(ifp == NULL))
goto out;
IP6_EXTHDR_GET(mldh, struct mld_hdr *, m, off, sizeof(*mldh));
if (mldh == NULL) {
ICMP6_STATINC(ICMP6_STAT_TOOSHORT);
goto out_nodrop;
}
ip6 = mtod(m, struct ip6_hdr *);
/* source address validation */
if (!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) {
/*
* RFC3590 allows the IPv6 unspecified address as the source
* address of MLD report and done messages. However, as this
* same document says, this special rule is for snooping
* switches and the RFC requires routers to discard MLD packets
* with the unspecified source address. The RFC only talks
* about hosts receiving an MLD query or report in Security
* Considerations, but this is probably the correct intention.
* RFC3590 does not talk about other cases than link-local and
* the unspecified source addresses, but we believe the same
* rule should be applied.
* As a result, we only allow link-local addresses as the
* source address; otherwise, simply discard the packet.
*/
#if 0
/*
* XXX: do not log in an input path to avoid log flooding,
* though RFC3590 says "SHOULD log" if the source of a query
* is the unspecified address.
*/
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufm[INET6_ADDRSTRLEN];
log(LOG_INFO,
"mld_input: src %s is not link-local (grp=%s)\n",
IN6_PRINT(ip6bufs,&ip6->ip6_src),
IN6_PRINT(ip6bufm, &mldh->mld_addr));
#endif
goto out;
}
/*
* make a copy for local work (in6_setscope() may modify the 1st arg)
*/
mld_addr = mldh->mld_addr;
if (in6_setscope(&mld_addr, ifp, NULL)) {
/* XXX: this should not happen! */
goto out;
}
/*
* In the MLD specification, there are 3 states and a flag.
*
* In Non-Listener state, we simply don't have a membership record.
* In Delaying Listener state, our timer is running (in6m->in6m_timer)
* In Idle Listener state, our timer is not running
* (in6m->in6m_timer==IN6M_TIMER_UNDEF)
*
* The flag is in6m->in6m_state; it is set to MLD_OTHERLISTENER if
* we have heard a report from another member, or MLD_IREPORTEDLAST
* if we sent the last report.
*/
switch (mldh->mld_type) {
case MLD_LISTENER_QUERY: {
struct in6_multi *next;
if (ifp->if_flags & IFF_LOOPBACK)
break;
if (!IN6_IS_ADDR_UNSPECIFIED(&mld_addr) &&
!IN6_IS_ADDR_MULTICAST(&mld_addr))
break; /* print error or log stat? */
all_in6 = in6addr_linklocal_allnodes;
if (in6_setscope(&all_in6, ifp, NULL)) {
/* XXX: this should not happen! */
break;
}
/*
* - Start the timers in all of our membership records
* that the query applies to for the interface on
* which the query arrived excl. those that belong
* to the "all-nodes" group (ff02::1).
* - Restart any timer that is already running but has
* a value longer than the requested timeout.
* - Use the value specified in the query message as
* the maximum timeout.
*/
timer = ntohs(mldh->mld_maxdelay);
rw_enter(&in6_multilock, RW_WRITER);
/*
* mld_stoptimer and mld_sendpkt release in6_multilock
* temporarily, so we have to prevent in6m from being freed
* while releasing the lock by having an extra reference to it.
*
* Also in6_purge_multi might remove items from the list of the
* ifp while releasing the lock. Fortunately in6_purge_multi is
* never executed as long as we have a psref of the ifp.
*/
LIST_FOREACH_SAFE(in6m, &ifp->if_multiaddrs, in6m_entry, next) {
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_in6) ||
IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) <
IPV6_ADDR_SCOPE_LINKLOCAL)
continue;
if (in6m->in6m_state == MLD_REPORTPENDING)
continue; /* we are not yet ready */
if (!IN6_IS_ADDR_UNSPECIFIED(&mld_addr) &&
!IN6_ARE_ADDR_EQUAL(&mld_addr, &in6m->in6m_addr))
continue;
if (timer == 0) {
in6m_ref(in6m);
/* send a report immediately */
mld_stoptimer(in6m);
mld_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
in6m->in6m_state = MLD_IREPORTEDLAST;
in6m_unref(in6m); /* May free in6m */
} else if (in6m->in6m_timer == IN6M_TIMER_UNDEF ||
mld_timerresid(in6m) > timer) {
in6m->in6m_timer =
1 + (cprng_fast32() % timer) * hz / 1000;
mld_starttimer(in6m);
}
}
rw_exit(&in6_multilock);
break;
}
case MLD_LISTENER_REPORT:
/*
* For fast leave to work, we have to know that we are the
* last person to send a report for this group. Reports
* can potentially get looped back if we are a multicast
* router, so discard reports sourced by me.
* Note that it is impossible to check IFF_LOOPBACK flag of
* ifp for this purpose, since ip6_mloopback passes the physical
* interface to looutput.
*/
if (m->m_flags & M_LOOP) /* XXX: grotty flag, but efficient */
break;
if (!IN6_IS_ADDR_MULTICAST(&mldh->mld_addr))
break;
/*
* If we belong to the group being reported, stop
* our timer for that group.
*/
rw_enter(&in6_multilock, RW_WRITER);
in6m = in6_lookup_multi(&mld_addr, ifp);
if (in6m) {
in6m_ref(in6m);
mld_stoptimer(in6m); /* transit to idle state */
in6m->in6m_state = MLD_OTHERLISTENER; /* clear flag */
in6m_unref(in6m);
in6m = NULL; /* in6m might be freed */
}
rw_exit(&in6_multilock);
break;
default: /* this is impossible */
#if 0
/*
* this case should be impossible because of filtering in
* icmp6_input(). But we explicitly disabled this part
* just in case.
*/
log(LOG_ERR, "mld_input: illegal type(%d)", mldh->mld_type);
#endif
break;
}
out:
m_freem(m);
out_nodrop:
m_put_rcvif_psref(ifp, &psref);
}
/*
* XXX mld_sendpkt must be called with in6_multilock held and
* will release in6_multilock before calling ip6_output and
* returning to avoid locking against myself in ip6_output.
*/
static void
mld_sendpkt(struct in6_multi *in6m, int type, const struct in6_addr *dst)
{
struct mbuf *mh;
struct mld_hdr *mldh;
struct ip6_hdr *ip6 = NULL;
struct ip6_moptions im6o;
struct in6_ifaddr *ia = NULL;
struct ifnet *ifp = in6m->in6m_ifp;
int ignflags;
struct psref psref;
int bound;
KASSERT(rw_write_held(&in6_multilock));
/*
* At first, find a link local address on the outgoing interface
* to use as the source address of the MLD packet.
* We do not reject tentative addresses for MLD report to deal with
* the case where we first join a link-local address.
*/
ignflags = (IN6_IFF_NOTREADY|IN6_IFF_ANYCAST) & ~IN6_IFF_TENTATIVE;
bound = curlwp_bind();
ia = in6ifa_ifpforlinklocal_psref(ifp, ignflags, &psref);
if (ia == NULL) {
curlwp_bindx(bound);
return;
}
if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) {
ia6_release(ia, &psref);
ia = NULL;
}
/* Allocate two mbufs to store IPv6 header and MLD header */
mldh = mld_allocbuf(&mh, in6m, type);
if (mldh == NULL) {
ia6_release(ia, &psref);
curlwp_bindx(bound);
return;
}
/* fill src/dst here */
ip6 = mtod(mh, struct ip6_hdr *);
ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
ip6->ip6_dst = dst ? *dst : in6m->in6m_addr;
ia6_release(ia, &psref);
curlwp_bindx(bound);
mldh->mld_addr = in6m->in6m_addr;
in6_clearscope(&mldh->mld_addr); /* XXX */
mldh->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr),
sizeof(struct mld_hdr));
/* construct multicast option */
memset(&im6o, 0, sizeof(im6o));
im6o.im6o_multicast_if_index = if_get_index(ifp);
im6o.im6o_multicast_hlim = 1;
/*
* Request loopback of the report if we are acting as a multicast
* router, so that the process-level routing daemon can hear it.
*/
im6o.im6o_multicast_loop = (ip6_mrouter != NULL);
/* increment output statistics */
ICMP6_STATINC(ICMP6_STAT_OUTHIST + type);
icmp6_ifstat_inc(ifp, ifs6_out_msg);
switch (type) {
case MLD_LISTENER_QUERY:
icmp6_ifstat_inc(ifp, ifs6_out_mldquery);
break;
case MLD_LISTENER_REPORT:
icmp6_ifstat_inc(ifp, ifs6_out_mldreport);
break;
case MLD_LISTENER_DONE:
icmp6_ifstat_inc(ifp, ifs6_out_mlddone);
break;
}
/* XXX we cannot call ip6_output with holding in6_multilock */
rw_exit(&in6_multilock);
ip6_output(mh, &ip6_opts, NULL, ia ? 0 : IPV6_UNSPECSRC,
&im6o, NULL, NULL);
rw_enter(&in6_multilock, RW_WRITER);
}
static struct mld_hdr *
mld_allocbuf(struct mbuf **mh, struct in6_multi *in6m, int type)
{
struct mbuf *md;
struct mld_hdr *mldh;
struct ip6_hdr *ip6;
/*
* Allocate mbufs to store ip6 header and MLD header.
* We allocate 2 mbufs and make chain in advance because
* it is more convenient when inserting the hop-by-hop option later.
*/
MGETHDR(*mh, M_DONTWAIT, MT_HEADER);
if (*mh == NULL)
return NULL;
MGET(md, M_DONTWAIT, MT_DATA);
if (md == NULL) {
m_free(*mh);
*mh = NULL;
return NULL;
}
(*mh)->m_next = md;
md->m_next = NULL;
m_reset_rcvif((*mh));
(*mh)->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
(*mh)->m_len = sizeof(struct ip6_hdr);
m_align(*mh, sizeof(struct ip6_hdr));
/* fill in the ip6 header */
ip6 = mtod(*mh, struct ip6_hdr *);
memset(ip6, 0, sizeof(*ip6));
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6_plen will be set later */
ip6->ip6_nxt = IPPROTO_ICMPV6;
/* ip6_hlim will be set by im6o.im6o_multicast_hlim */
/* ip6_src/dst will be set by mld_sendpkt() or mld_sendbuf() */
/* fill in the MLD header as much as possible */
md->m_len = sizeof(struct mld_hdr);
mldh = mtod(md, struct mld_hdr *);
memset(mldh, 0, sizeof(struct mld_hdr));
mldh->mld_type = type;
return mldh;
}
static void
in6m_ref(struct in6_multi *in6m)
{
KASSERT(rw_write_held(&in6_multilock));
in6m->in6m_refcount++;
}
static void
in6m_unref(struct in6_multi *in6m)
{
KASSERT(rw_write_held(&in6_multilock));
if (--in6m->in6m_refcount == 0)
in6m_destroy(in6m);
}
/*
* Add an address to the list of IP6 multicast addresses for a given interface.
*/
struct in6_multi *
in6_addmulti(struct in6_addr *maddr6, struct ifnet *ifp, int *errorp,
int timer)
{
struct sockaddr_in6 sin6;
struct in6_multi *in6m;
*errorp = 0;
rw_enter(&in6_multilock, RW_WRITER);
/*
* See if address already in list.
*/
in6m = in6_lookup_multi(maddr6, ifp);
if (in6m != NULL) {
/*
* Found it; just increment the reference count.
*/
in6m->in6m_refcount++;
} else {
/*
* New address; allocate a new multicast record
* and link it into the interface's multicast list.
*/
in6m = malloc(sizeof(*in6m), M_IPMADDR, M_NOWAIT|M_ZERO);
if (in6m == NULL) {
*errorp = ENOBUFS;
goto out;
}
in6m->in6m_addr = *maddr6;
in6m->in6m_ifp = ifp;
in6m->in6m_refcount = 1;
in6m->in6m_timer = IN6M_TIMER_UNDEF;
callout_init(&in6m->in6m_timer_ch, CALLOUT_MPSAFE);
callout_setfunc(&in6m->in6m_timer_ch, mld_timeo, in6m);
LIST_INSERT_HEAD(&ifp->if_multiaddrs, in6m, in6m_entry);
/*
* Ask the network driver to update its multicast reception
* filter appropriately for the new address.
*/
sockaddr_in6_init(&sin6, maddr6, 0, 0, 0);
*errorp = if_mcast_op(ifp, SIOCADDMULTI, sin6tosa(&sin6));
if (*errorp) {
callout_destroy(&in6m->in6m_timer_ch);
LIST_REMOVE(in6m, in6m_entry);
free(in6m, M_IPMADDR);
in6m = NULL;
goto out;
}
in6m->in6m_timer = timer;
if (in6m->in6m_timer > 0) {
in6m->in6m_state = MLD_REPORTPENDING;
mld_starttimer(in6m);
goto out;
}
/*
* Let MLD6 know that we have joined a new IP6 multicast
* group.
*/
mld_start_listening(in6m);
}
out:
rw_exit(&in6_multilock);
return in6m;
}
static void
in6m_destroy(struct in6_multi *in6m)
{
struct sockaddr_in6 sin6;
KASSERT(rw_write_held(&in6_multilock));
KASSERTMSG(in6m->in6m_refcount == 0, "in6m_refcount=%d",
in6m->in6m_refcount);
/*
* Unlink from list if it's listed. This must be done before
* mld_stop_listening because it releases in6_multilock and that allows
* someone to look up the in6m being removed from the list and add a
* reference to the entry unexpectedly.
*/
if (in6_lookup_multi(&in6m->in6m_addr, in6m->in6m_ifp) != NULL)
LIST_REMOVE(in6m, in6m_entry);
/*
* No remaining claims to this record; let MLD6 know
* that we are leaving the multicast group.
*/
mld_stop_listening(in6m);
/*
* Delete all references of this multicasting group from
* the membership arrays
*/
in6_purge_mcast_references(in6m);
/*
* Notify the network driver to update its multicast
* reception filter.
*/
sockaddr_in6_init(&sin6, &in6m->in6m_addr, 0, 0, 0);
if_mcast_op(in6m->in6m_ifp, SIOCDELMULTI, sin6tosa(&sin6));
/* Tell mld_timeo we're halting the timer */
in6m->in6m_timer = IN6M_TIMER_UNDEF;
rw_exit(&in6_multilock);
callout_halt(&in6m->in6m_timer_ch, NULL);
callout_destroy(&in6m->in6m_timer_ch);
free(in6m, M_IPMADDR);
rw_enter(&in6_multilock, RW_WRITER);
}
/*
* Delete a multicast address record.
*/
void
in6_delmulti_locked(struct in6_multi *in6m)
{
KASSERT(rw_write_held(&in6_multilock));
KASSERTMSG(in6m->in6m_refcount > 0, "in6m_refcount=%d",
in6m->in6m_refcount);
/*
* The caller should have a reference to in6m, so we don't need to worry
* about mld_stoptimer releasing the lock.
*/
mld_stoptimer(in6m);
if (--in6m->in6m_refcount == 0)
in6m_destroy(in6m);
}
void
in6_delmulti(struct in6_multi *in6m)
{
rw_enter(&in6_multilock, RW_WRITER);
in6_delmulti_locked(in6m);
rw_exit(&in6_multilock);
}
/*
* Look up the in6_multi record for a given IP6 multicast address
* on a given interface. If no matching record is found, "in6m"
* returns NULL.
*/
struct in6_multi *
in6_lookup_multi(const struct in6_addr *addr, const struct ifnet *ifp)
{
struct in6_multi *in6m;
KASSERT(rw_lock_held(&in6_multilock));
LIST_FOREACH(in6m, &ifp->if_multiaddrs, in6m_entry) {
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, addr))
break;
}
return in6m;
}
void
in6_lookup_and_delete_multi(const struct in6_addr *addr,
const struct ifnet *ifp)
{
struct in6_multi *in6m;
rw_enter(&in6_multilock, RW_WRITER);
in6m = in6_lookup_multi(addr, ifp);
if (in6m != NULL)
in6_delmulti_locked(in6m);
rw_exit(&in6_multilock);
}
bool
in6_multi_group(const struct in6_addr *addr, const struct ifnet *ifp)
{
bool ingroup;
rw_enter(&in6_multilock, RW_READER);
ingroup = in6_lookup_multi(addr, ifp) != NULL;
rw_exit(&in6_multilock);
return ingroup;
}
/*
* Purge in6_multi records associated to the interface.
*/
void
in6_purge_multi(struct ifnet *ifp)
{
struct in6_multi *in6m, *next;
rw_enter(&in6_multilock, RW_WRITER);
LIST_FOREACH_SAFE(in6m, &ifp->if_multiaddrs, in6m_entry, next) {
LIST_REMOVE(in6m, in6m_entry);
/*
* Normally multicast addresses are already purged at this
* point. Remaining references aren't accessible via ifp,
* so what we can do here is to prevent ifp from being
* accessed via in6m by removing it from the list of ifp.
*/
mld_stoptimer(in6m);
}
rw_exit(&in6_multilock);
}
void
in6_multi_lock(int op)
{
rw_enter(&in6_multilock, op);
}
void
in6_multi_unlock(void)
{
rw_exit(&in6_multilock);
}
bool
in6_multi_locked(int op)
{
switch (op) {
case RW_READER:
return rw_read_held(&in6_multilock);
case RW_WRITER:
return rw_write_held(&in6_multilock);
default:
return rw_lock_held(&in6_multilock);
}
}
struct in6_multi_mship *
in6_joingroup(struct ifnet *ifp, struct in6_addr *addr, int *errorp, int timer)
{
struct in6_multi_mship *imm;
imm = malloc(sizeof(*imm), M_IPMADDR, M_NOWAIT|M_ZERO);
if (imm == NULL) {
*errorp = ENOBUFS;
return NULL;
}
imm->i6mm_maddr = in6_addmulti(addr, ifp, errorp, timer);
if (!imm->i6mm_maddr) {
/* *errorp is already set */
free(imm, M_IPMADDR);
return NULL;
}
return imm;
}
int
in6_leavegroup(struct in6_multi_mship *imm)
{
struct in6_multi *in6m;
rw_enter(&in6_multilock, RW_WRITER);
in6m = imm->i6mm_maddr;
imm->i6mm_maddr = NULL;
if (in6m != NULL) {
in6_delmulti_locked(in6m);
}
rw_exit(&in6_multilock);
free(imm, M_IPMADDR);
return 0;
}
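/*
 * Illustrative usage sketch: joining an IPv6 multicast group on an
 * interface and leaving it again through the membership handle. The
 * example_* name is hypothetical; a timer of 0 starts listening
 * immediately.
 */
#if 0
static int
example_join_leave(struct ifnet *ifp, struct in6_addr *group)
{
struct in6_multi_mship *imm;
int error;
imm = in6_joingroup(ifp, group, &error, 0);
if (imm == NULL)
return error;
/* ... traffic for the group is now received ... */
return in6_leavegroup(imm);
}
#endif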
/*
* DEPRECATED: keep it just to avoid breaking old sysctl users.
*/
static int
in6_mkludge_sysctl(SYSCTLFN_ARGS)
{
if (namelen != 1)
return EINVAL;
*oldlenp = 0;
return 0;
}
static int
in6_multicast_sysctl(SYSCTLFN_ARGS)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct in6_ifaddr *ia6;
struct in6_multi *in6m;
uint32_t tmp;
int error;
size_t written;
struct psref psref, psref_ia;
int bound, s;
if (namelen != 1)
return EINVAL;
rw_enter(&in6_multilock, RW_READER);
bound = curlwp_bind();
ifp = if_get_byindex(name[0], &psref);
if (ifp == NULL) {
curlwp_bindx(bound);
rw_exit(&in6_multilock);
return ENODEV;
}
if (oldp == NULL) {
*oldlenp = 0;
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
LIST_FOREACH(in6m, &ifp->if_multiaddrs, in6m_entry) {
*oldlenp += 2 * sizeof(struct in6_addr) +
sizeof(uint32_t);
}
}
pserialize_read_exit(s);
if_put(ifp, &psref);
curlwp_bindx(bound);
rw_exit(&in6_multilock);
return 0;
}
error = 0;
written = 0;
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa_acquire(ifa, &psref_ia);
pserialize_read_exit(s);
ia6 = ifatoia6(ifa);
LIST_FOREACH(in6m, &ifp->if_multiaddrs, in6m_entry) {
if (written + 2 * sizeof(struct in6_addr) +
sizeof(uint32_t) > *oldlenp)
goto done;
/*
* XXX return the first IPv6 address to keep backward
* compatibility; however, multicast addresses no longer
* belong to any particular IPv6 address, so this should be
* unnecessary.
*/
error = sysctl_copyout(l, &ia6->ia_addr.sin6_addr,
oldp, sizeof(struct in6_addr));
if (error)
goto done;
oldp = (char *)oldp + sizeof(struct in6_addr);
written += sizeof(struct in6_addr);
error = sysctl_copyout(l, &in6m->in6m_addr,
oldp, sizeof(struct in6_addr));
if (error)
goto done;
oldp = (char *)oldp + sizeof(struct in6_addr);
written += sizeof(struct in6_addr);
tmp = in6m->in6m_refcount;
error = sysctl_copyout(l, &tmp, oldp, sizeof(tmp));
if (error)
goto done;
oldp = (char *)oldp + sizeof(tmp);
written += sizeof(tmp);
}
s = pserialize_read_enter();
break;
}
pserialize_read_exit(s);
done:
ifa_release(ifa, &psref_ia);
if_put(ifp, &psref);
curlwp_bindx(bound);
rw_exit(&in6_multilock);
*oldlenp = written;
return error;
}
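/*
 * Note on the export format (derived from the copyout sequence above):
 * each record written by in6_multicast_sysctl() is two struct in6_addr
 * values followed by a uint32_t, i.e. the interface's first IPv6 address,
 * the multicast group address, and the group's reference count.
 */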
void
in6_sysctl_multicast_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "multicast",
SYSCTL_DESCR("Multicast information"),
in6_multicast_sysctl, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "multicast_kludge",
SYSCTL_DESCR("multicast kludge information"),
in6_mkludge_sysctl, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_CREATE, CTL_EOL);
}
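/*
 * Illustrative userland sketch, compiled out and not part of this file:
 * the "multicast" node above is created dynamically (CTL_CREATE), so a
 * consumer would resolve it by name and append the interface index as
 * the final name component.  Each record written by in6_multicast_sysctl()
 * is two struct in6_addr (interface address kept for compatibility, then
 * the group) followed by a uint32_t reference count.  The helper name is
 * an assumption made only for the example.
 */
#if 0
#include <sys/sysctl.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
print_in6_multi(const char *ifname)
{
	int mib[CTL_MAXNAME];
	size_t miblen = CTL_MAXNAME, len;
	char *buf, *p;

	if (sysctlnametomib("net.inet6.multicast", mib, &miblen) == -1)
		return;
	mib[miblen] = (int)if_nametoindex(ifname);

	/* First pass with oldp == NULL just reports the needed size. */
	if (sysctl(mib, miblen + 1, NULL, &len, NULL, 0) == -1 || len == 0)
		return;
	if ((buf = malloc(len)) == NULL)
		return;
	if (sysctl(mib, miblen + 1, buf, &len, NULL, 0) == -1) {
		free(buf);
		return;
	}

	for (p = buf; (size_t)(p - buf) + 2 * sizeof(struct in6_addr) +
	    sizeof(uint32_t) <= len;) {
		struct in6_addr group;
		uint32_t refs;
		char gstr[INET6_ADDRSTRLEN];

		/* Skip the interface address kept for compatibility. */
		p += sizeof(struct in6_addr);
		memcpy(&group, p, sizeof(group));
		p += sizeof(group);
		memcpy(&refs, p, sizeof(refs));
		p += sizeof(refs);
		if (inet_ntop(AF_INET6, &group, gstr, sizeof(gstr)) != NULL)
			printf("%s refcount %u\n", gstr, refs);
	}
	free(buf);
}
#endif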
/* $NetBSD: joy.c,v 1.21 2017/10/28 04:53:55 riastradh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1995 Jean-Marc Zucconi
* All rights reserved.
*
* Ported to NetBSD by Matthieu Herrb <matthieu@laas.fr>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: joy.c,v 1.21 2017/10/28 04:53:55 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/device.h>
#include <sys/errno.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/vnode.h>
#include <sys/bus.h>
#include <sys/joystick.h>
#include <dev/ic/joyvar.h>
#include "ioconf.h"
/*
* The game port can manage 4 buttons and 4 variable resistors (usually 2
* joysticks, each with 2 buttons and 2 pots.) via the port at address 0x201.
* Getting the state of the buttons is done by reading the game port;
* buttons 1-4 correspond to bits 4-7 and resistors 1-4 (X1, Y1, X2, Y2)
* to bits 0-3. If button 1 (resp. 2, 3, 4) is pressed, bit 4 (resp. 5,
* 6, 7) is set to 0. To get the value of a resistor, write the value 0xff
* to the port and wait until the corresponding bit returns to 0.
*/
#define JOYPART(d) (minor(d) & 1)
#define JOYUNIT(d) (minor(d) >> 1)
#ifndef JOY_TIMEOUT
#define JOY_TIMEOUT 2000 /* 2 milliseconds */
#endif
static dev_type_open(joyopen);
static dev_type_close(joyclose);
static dev_type_read(joyread);
static dev_type_ioctl(joyioctl);
const struct cdevsw joy_cdevsw = {
.d_open = joyopen,
.d_close = joyclose,
.d_read = joyread,
.d_write = nowrite,
.d_ioctl = joyioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
void
joyattach(struct joy_softc *sc)
{
if (sc->sc_lock == NULL) {
panic("joyattach: no lock");
}
sc->timeout[0] = 0;
sc->timeout[1] = 0;
mutex_enter(sc->sc_lock);
bus_space_write_1(sc->sc_iot, sc->sc_ioh, 0, 0xff);
DELAY(10000); /* 10 ms delay */
aprint_normal_dev(sc->sc_dev, "joystick %sconnected\n",
(bus_space_read_1(sc->sc_iot, sc->sc_ioh, 0) & 0x0f) == 0x0f ?
"not " : "");
mutex_exit(sc->sc_lock);
}
int
joydetach(struct joy_softc *sc, int flags)
{
int maj, mn;
maj = cdevsw_lookup_major(&joy_cdevsw);
mn = device_unit(sc->sc_dev) << 1;
vdevgone(maj, mn, mn, VCHR);
vdevgone(maj, mn + 1, mn + 1, VCHR);
return 0;
}
static int
joyopen(dev_t dev, int flag, int mode, struct lwp *l)
{
int unit = JOYUNIT(dev);
int i = JOYPART(dev);
struct joy_softc *sc;
sc = device_lookup_private(&joy_cd, unit);
if (sc == NULL)
return ENXIO;
mutex_enter(sc->sc_lock);
if (sc->timeout[i]) {
mutex_exit(sc->sc_lock);
return EBUSY;
}
sc->x_off[i] = sc->y_off[i] = 0;
sc->timeout[i] = JOY_TIMEOUT;
mutex_exit(sc->sc_lock);
return 0;
}
static int
joyclose(dev_t dev, int flag, int mode, struct lwp *l)
{
int unit = JOYUNIT(dev);
int i = JOYPART(dev);
struct joy_softc *sc = device_lookup_private(&joy_cd, unit);
mutex_enter(sc->sc_lock);
sc->timeout[i] = 0;
mutex_exit(sc->sc_lock);
return 0;
}
static int
joyread(dev_t dev, struct uio *uio, int flag)
{
int unit = JOYUNIT(dev);
struct joy_softc *sc = device_lookup_private(&joy_cd, unit);
bus_space_tag_t iot = sc->sc_iot;
bus_space_handle_t ioh = sc->sc_ioh;
struct joystick c;
struct timeval start, now, diff;
int state = 0, x = 0, y = 0, i;
mutex_enter(sc->sc_lock);
bus_space_write_1(iot, ioh, 0, 0xff);
microtime(&start);
now = start; /* structure assignment */
i = sc->timeout[JOYPART(dev)];
for (;;) {
timersub(&now, &start, &diff);
if (diff.tv_sec > 0 || diff.tv_usec > i)
break;
state = bus_space_read_1(iot, ioh, 0);
if (JOYPART(dev) == 1)
state >>= 2;
if (!x && !(state & 0x01))
x = diff.tv_usec;
if (!y && !(state & 0x02))
y = diff.tv_usec;
if (x && y)
break;
microtime(&now);
}
mutex_exit(sc->sc_lock);
c.x = x ? sc->x_off[JOYPART(dev)] + x : 0x80000000;
c.y = y ? sc->y_off[JOYPART(dev)] + y : 0x80000000;
state >>= 4;
c.b1 = ~state & 1;
c.b2 = ~(state >> 1) & 1;
return uiomove(&c, sizeof(struct joystick), uio);
}
static int
joyioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
int unit = JOYUNIT(dev);
struct joy_softc *sc = device_lookup_private(&joy_cd, unit);
int i = JOYPART(dev), x, error;
mutex_enter(sc->sc_lock);
error = 0;
switch (cmd) {
case JOY_SETTIMEOUT:
x = *(int *)data;
if (x < 1 || x > 10000) { /* 10ms maximum! */
error = EINVAL;
break;
}
sc->timeout[i] = x;
break;
case JOY_GETTIMEOUT:
*(int *)data = sc->timeout[i];
break;
case JOY_SET_X_OFFSET:
sc->x_off[i] = *(int *)data;
break;
case JOY_SET_Y_OFFSET:
sc->y_off[i] = *(int *)data;
break;
case JOY_GET_X_OFFSET:
*(int *)data = sc->x_off[i];
break;
case JOY_GET_Y_OFFSET:
*(int *)data = sc->y_off[i];
break;
default:
error = ENXIO;
break;
}
mutex_exit(sc->sc_lock);
return error;
}
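/*
 * Illustrative userland sketch, compiled out and not part of the driver:
 * open the character device, optionally shorten the sampling timeout
 * (microseconds, 1..10000), and read one struct joystick sample.  The
 * device path is an assumption made for the example.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/joystick.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int
sample_joystick(void)
{
	struct joystick js;
	int fd, timeout = 5000;

	if ((fd = open("/dev/joy0", O_RDONLY)) == -1)
		return -1;
	if (ioctl(fd, JOY_SETTIMEOUT, &timeout) == -1 ||
	    read(fd, &js, sizeof(js)) != (ssize_t)sizeof(js)) {
		close(fd);
		return -1;
	}
	printf("x=%d y=%d b1=%d b2=%d\n", js.x, js.y, js.b1, js.b2);
	close(fd);
	return 0;
}
#endif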
/* $NetBSD: kern_rwlock.c,v 1.76 2023/10/15 10:28:48 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel reader/writer lock implementation, modeled after those
* found in Solaris, a description of which can be found in:
*
* Solaris Internals: Core Kernel Architecture, Jim Mauro and
* Richard McDougall.
*
* The NetBSD implementation differs from that described in the book, in
* that the locks are partially adaptive. Lock waiters spin wait while a
* lock is write held and the holder is still running on a CPU. The method
* of choosing which threads to awaken when a lock is released also differs,
* mainly to take account of the partially adaptive behaviour.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.76 2023/10/15 10:28:48 riastradh Exp $");
#include "opt_lockdebug.h"
#define __RWLOCK_PRIVATE
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
#include <dev/lockstat.h>
#include <machine/rwlock.h>
/*
* LOCKDEBUG
*/
#define RW_DEBUG_P(rw) (((rw)->rw_owner & RW_NODEBUG) == 0)
#define RW_WANTLOCK(rw, op) \
LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw), \
(uintptr_t)__builtin_return_address(0), op == RW_READER);
#define RW_LOCKED(rw, op) \
LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL, \
(uintptr_t)__builtin_return_address(0), op == RW_READER);
#define RW_UNLOCKED(rw, op) \
LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw), \
(uintptr_t)__builtin_return_address(0), op == RW_READER);
/*
* DIAGNOSTIC
*/
#if defined(DIAGNOSTIC)
#define RW_ASSERT(rw, cond) \
do { \
if (__predict_false(!(cond))) \
rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\
} while (/* CONSTCOND */ 0)
#else
#define RW_ASSERT(rw, cond) /* nothing */
#endif /* DIAGNOSTIC */
/*
* For platforms that do not provide stubs, or for the LOCKDEBUG case.
*/
#ifdef LOCKDEBUG
#undef __HAVE_RW_STUBS
#endif
#ifndef __HAVE_RW_STUBS
__strong_alias(rw_enter,rw_vector_enter);
__strong_alias(rw_exit,rw_vector_exit);
__strong_alias(rw_tryenter,rw_vector_tryenter);
#endif
static void rw_abort(const char *, size_t, krwlock_t *, const char *);
static void rw_dump(const volatile void *, lockop_printer_t);
static lwp_t *rw_owner(wchan_t);
lockops_t rwlock_lockops = {
.lo_name = "Reader / writer lock",
.lo_type = LOCKOPS_SLEEP,
.lo_dump = rw_dump,
};
/*
* Give rwlock holders an extra-high priority boost when blocking, due to
* direct handoff. XXX To be revisited.
*/
syncobj_t rw_syncobj = {
.sobj_name = "rwlock",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KTHREAD,
.sobj_unsleep = turnstile_unsleep,
.sobj_changepri = turnstile_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = rw_owner,
};
/*
* rw_cas:
*
* Do an atomic compare-and-swap on the lock word.
*/
static inline uintptr_t
rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
{
return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner,
(void *)o, (void *)n);
}
/*
* rw_swap:
*
* Do an atomic swap of the lock word. This is used only when it's
* known that the lock word is set up such that it can't be changed
* behind us (assert this), so there's no point considering the result.
*/
static inline void
rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
{
n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
(void *)n);
RW_ASSERT(rw, n == o);
RW_ASSERT(rw, (o & RW_HAS_WAITERS) != 0);
}
/*
* rw_dump:
*
* Dump the contents of a rwlock structure.
*/
static void
rw_dump(const volatile void *cookie, lockop_printer_t pr)
{
const volatile krwlock_t *rw = cookie;
pr("owner/count : %#018lx flags : %#018x\n",
(long)RW_OWNER(rw), (int)RW_FLAGS(rw));
}
/*
* rw_abort:
*
* Dump information about an error and panic the system. This
* generates a lot of machine code in the DIAGNOSTIC case, so
* we ask the compiler to not inline it.
*/
static void __noinline
rw_abort(const char *func, size_t line, krwlock_t *rw, const char *msg)
{
if (__predict_false(panicstr != NULL))
return;
LOCKDEBUG_ABORT(func, line, rw, &rwlock_lockops, msg);
}
/*
* rw_init:
*
* Initialize a rwlock for use.
*/
void
_rw_init(krwlock_t *rw, uintptr_t return_address)
{
#ifdef LOCKDEBUG
/* XXX only because the assembly stubs can't handle RW_NODEBUG */
if (LOCKDEBUG_ALLOC(rw, &rwlock_lockops, return_address))
rw->rw_owner = 0;
else
rw->rw_owner = RW_NODEBUG;
#else
rw->rw_owner = 0;
#endif
}
void
rw_init(krwlock_t *rw)
{
_rw_init(rw, (uintptr_t)__builtin_return_address(0));
}
/*
* rw_destroy:
*
* Tear down a rwlock.
*/
void
rw_destroy(krwlock_t *rw)
{
RW_ASSERT(rw, (rw->rw_owner & ~RW_NODEBUG) == 0);
LOCKDEBUG_FREE((rw->rw_owner & RW_NODEBUG) == 0, rw);
}
/*
* rw_oncpu:
*
* Return true if an rwlock owner is running on a CPU in the system.
* If the target is waiting on the kernel big lock, then we must
* release it. This is necessary to avoid deadlock.
*/
static bool
rw_oncpu(uintptr_t owner)
{
#ifdef MULTIPROCESSOR
struct cpu_info *ci;
lwp_t *l;
KASSERT(kpreempt_disabled());
if ((owner & (RW_WRITE_LOCKED|RW_HAS_WAITERS)) != RW_WRITE_LOCKED) {
return false;
}
/*
* See lwp_dtor() for why dereferencing the LWP pointer is safe.
* We must have kernel preemption disabled for that.
*/
l = (lwp_t *)(owner & RW_THREAD);
ci = l->l_cpu;
if (ci && ci->ci_curlwp == l) {
/* Target is running; do we need to block? */
return (ci->ci_biglock_wanted != l);
}
#endif
/* Not running. It may be safe to block now. */
return false;
}
/*
* rw_vector_enter:
*
* Acquire a rwlock.
*/
void
rw_vector_enter(krwlock_t *rw, const krw_t op)
{
uintptr_t owner, incr, need_wait, set_wait, curthread, next;
turnstile_t *ts;
int queue;
lwp_t *l;
LOCKSTAT_TIMER(slptime);
LOCKSTAT_TIMER(slpcnt);
LOCKSTAT_TIMER(spintime);
LOCKSTAT_COUNTER(spincnt);
LOCKSTAT_FLAG(lsflag);
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, !cpu_intr_p());
RW_ASSERT(rw, curthread != 0);
RW_WANTLOCK(rw, op);
if (__predict_true(panicstr == NULL)) {
KDASSERT(pserialize_not_in_read_section());
LOCKDEBUG_BARRIER(&kernel_lock, 1);
}
/*
* We play a slight trick here. If we're a reader, we want to
* increment the read count. If we're a writer, we want to
* set the owner field and the WRITE_LOCKED bit.
*
* In the latter case, we expect those bits to be zero,
* therefore we can use an add operation to set them, which
* means an add operation for both cases.
*/
if (__predict_true(op == RW_READER)) {
incr = RW_READ_INCR;
set_wait = RW_HAS_WAITERS;
need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
queue = TS_READER_Q;
} else {
RW_ASSERT(rw, op == RW_WRITER);
incr = curthread | RW_WRITE_LOCKED;
set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
need_wait = RW_WRITE_LOCKED | RW_THREAD;
queue = TS_WRITER_Q;
}
LOCKSTAT_ENTER(lsflag);
KPREEMPT_DISABLE(curlwp);
for (owner = rw->rw_owner;;) {
/*
* Read the lock owner field. If the need-to-wait
* indicator is clear, then try to acquire the lock.
*/
if ((owner & need_wait) == 0) {
next = rw_cas(rw, owner, (owner + incr) &
~RW_WRITE_WANTED);
if (__predict_true(next == owner)) {
/* Got it! */
membar_acquire();
break;
}
/*
* Didn't get it -- spin around again (we'll
* probably sleep on the next iteration).
*/
owner = next;
continue;
}
if (__predict_false(RW_OWNER(rw) == curthread)) {
rw_abort(__func__, __LINE__, rw,
"locking against myself");
}
/*
* If the lock owner is running on another CPU, and
* there are no existing waiters, then spin.
*/
if (rw_oncpu(owner)) {
LOCKSTAT_START_TIMER(lsflag, spintime);
u_int count = SPINLOCK_BACKOFF_MIN;
do {
KPREEMPT_ENABLE(curlwp);
SPINLOCK_BACKOFF(count);
KPREEMPT_DISABLE(curlwp);
owner = rw->rw_owner;
} while (rw_oncpu(owner));
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKSTAT_COUNT(spincnt, 1);
if ((owner & need_wait) == 0)
continue;
}
/*
* Grab the turnstile chain lock. Once we have that, we
* can adjust the waiter bits and sleep queue.
*/
ts = turnstile_lookup(rw);
/*
* Mark the rwlock as having waiters. If the set fails,
* then we may not need to sleep and should spin again.
* Reload rw_owner because turnstile_lookup() may have
* spun on the turnstile chain lock.
*/
owner = rw->rw_owner;
if ((owner & need_wait) == 0 || rw_oncpu(owner)) {
turnstile_exit(rw);
continue;
}
next = rw_cas(rw, owner, owner | set_wait);
/* XXX membar? */
if (__predict_false(next != owner)) {
turnstile_exit(rw);
owner = next;
continue;
}
LOCKSTAT_START_TIMER(lsflag, slptime);
turnstile_block(ts, queue, rw, &rw_syncobj);
LOCKSTAT_STOP_TIMER(lsflag, slptime);
LOCKSTAT_COUNT(slpcnt, 1);
/*
* No need for a memory barrier because of context switch.
* If not handed the lock, then spin again.
*/
if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
break;
owner = rw->rw_owner;
}
KPREEMPT_ENABLE(curlwp);
LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK |
(op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime,
(l->l_rwcallsite != 0 ? l->l_rwcallsite :
(uintptr_t)__builtin_return_address(0)));
LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime,
(l->l_rwcallsite != 0 ? l->l_rwcallsite :
(uintptr_t)__builtin_return_address(0)));
LOCKSTAT_EXIT(lsflag);
RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
(op == RW_READER && RW_COUNT(rw) != 0));
RW_LOCKED(rw, op);
}
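/*
 * Illustrative sketch, compiled out and not part of the original source:
 * the basic consumer pattern for the interface implemented above.  The
 * structure and field names are assumptions made only for the example.
 */
#if 0
struct example_counter {
	krwlock_t	ec_lock;
	int		ec_value;
};

static void
example_counter_init(struct example_counter *ec)
{
	rw_init(&ec->ec_lock);
	ec->ec_value = 0;
}

static int
example_counter_read(struct example_counter *ec)
{
	int value;

	rw_enter(&ec->ec_lock, RW_READER);	/* shared hold */
	value = ec->ec_value;
	rw_exit(&ec->ec_lock);
	return value;
}

static void
example_counter_bump(struct example_counter *ec)
{
	rw_enter(&ec->ec_lock, RW_WRITER);	/* exclusive hold */
	ec->ec_value++;
	rw_exit(&ec->ec_lock);
}

static void
example_counter_destroy(struct example_counter *ec)
{
	rw_destroy(&ec->ec_lock);
}
#endif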
/*
* rw_vector_exit:
*
* Release a rwlock.
*/
void
rw_vector_exit(krwlock_t *rw)
{
uintptr_t curthread, owner, decr, newown, next;
turnstile_t *ts;
int rcnt, wcnt;
lwp_t *l;
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, curthread != 0);
/*
* Again, we use a trick. Since we used an add operation to
* set the required lock bits, we can use a subtract to clear
* them, which makes the read-release and write-release path
* the same.
*/
owner = rw->rw_owner;
if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
RW_UNLOCKED(rw, RW_WRITER);
RW_ASSERT(rw, RW_OWNER(rw) == curthread);
decr = curthread | RW_WRITE_LOCKED;
} else {
RW_UNLOCKED(rw, RW_READER);
RW_ASSERT(rw, RW_COUNT(rw) != 0);
decr = RW_READ_INCR;
}
/*
* Compute what we expect the new value of the lock to be. Only
* proceed to do direct handoff if there are waiters, and if the
* lock would become unowned.
*/
membar_release();
for (;;) {
newown = (owner - decr);
if ((newown & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
break;
next = rw_cas(rw, owner, newown);
if (__predict_true(next == owner))
return;
owner = next;
}
/*
* Grab the turnstile chain lock. This gets the interlock
* on the sleep queue. Once we have that, we can adjust the
* waiter bits.
*/
ts = turnstile_lookup(rw);
owner = rw->rw_owner;
RW_ASSERT(rw, ts != NULL);
RW_ASSERT(rw, (owner & RW_HAS_WAITERS) != 0);
wcnt = TS_WAITERS(ts, TS_WRITER_Q);
rcnt = TS_WAITERS(ts, TS_READER_Q);
/*
* Give the lock away.
*
* If we are releasing a write lock, then prefer to wake all
* outstanding readers. Otherwise, wake one writer if there
* are outstanding readers, or all writers if there are no
* pending readers. If waking one specific writer, the writer
* is handed the lock here. If waking multiple writers, we
* set WRITE_WANTED to block out new readers, and let them
* do the work of acquiring the lock in rw_vector_enter().
*/
if (rcnt == 0 || decr == RW_READ_INCR) {
RW_ASSERT(rw, wcnt != 0);
RW_ASSERT(rw, (owner & RW_WRITE_WANTED) != 0);
if (rcnt != 0) {
/* Give the lock to the longest waiting writer. */
l = TS_FIRST(ts, TS_WRITER_Q);
newown = (uintptr_t)l | (owner & RW_NODEBUG);
newown |= RW_WRITE_LOCKED | RW_HAS_WAITERS;
if (wcnt > 1)
newown |= RW_WRITE_WANTED;
rw_swap(rw, owner, newown);
turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
} else {
/* Wake all writers and let them fight it out. */
newown = owner & RW_NODEBUG;
newown |= RW_WRITE_WANTED;
rw_swap(rw, owner, newown);
turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
}
} else {
RW_ASSERT(rw, rcnt != 0);
/*
* Give the lock to all blocked readers. If there
* is a writer waiting, new readers that arrive
* after the release will be blocked out.
*/
newown = owner & RW_NODEBUG;
newown += rcnt << RW_READ_COUNT_SHIFT;
if (wcnt != 0)
newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;
/* Wake up all sleeping readers. */
rw_swap(rw, owner, newown);
turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
}
}
/*
* rw_vector_tryenter:
*
* Try to acquire a rwlock.
*/
int
rw_vector_tryenter(krwlock_t *rw, const krw_t op)
{
uintptr_t curthread, owner, incr, need_wait, next;
lwp_t *l;
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, curthread != 0);
if (op == RW_READER) {
incr = RW_READ_INCR;
need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
} else {
RW_ASSERT(rw, op == RW_WRITER);
incr = curthread | RW_WRITE_LOCKED;
need_wait = RW_WRITE_LOCKED | RW_THREAD;
}
for (owner = rw->rw_owner;; owner = next) {
if (__predict_false((owner & need_wait) != 0))
return 0;
next = rw_cas(rw, owner, owner + incr);
if (__predict_true(next == owner)) {
/* Got it! */
break;
}
}
RW_WANTLOCK(rw, op);
RW_LOCKED(rw, op);
RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
(op == RW_READER && RW_COUNT(rw) != 0));
membar_acquire();
return 1;
}
/*
* rw_downgrade:
*
* Downgrade a write lock to a read lock.
*/
void
rw_downgrade(krwlock_t *rw)
{
uintptr_t owner, newown, next, curthread __diagused;
turnstile_t *ts;
int rcnt, wcnt;
lwp_t *l;
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, curthread != 0);
RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
RW_ASSERT(rw, RW_OWNER(rw) == curthread);
RW_UNLOCKED(rw, RW_WRITER);
membar_release();
for (owner = rw->rw_owner;; owner = next) {
/*
* If there are no waiters we can do this the easy way. Try
* swapping us down to one read hold. If it fails, the lock
* condition has changed and we most likely now have
* waiters.
*/
if ((owner & RW_HAS_WAITERS) == 0) {
newown = (owner & RW_NODEBUG);
next = rw_cas(rw, owner, newown + RW_READ_INCR);
if (__predict_true(next == owner)) {
RW_LOCKED(rw, RW_READER);
RW_ASSERT(rw,
(rw->rw_owner & RW_WRITE_LOCKED) == 0);
RW_ASSERT(rw, RW_COUNT(rw) != 0);
return;
}
continue;
}
/*
* Grab the turnstile chain lock. This gets the interlock
* on the sleep queue. Once we have that, we can adjust the
* waiter bits.
*/
ts = turnstile_lookup(rw);
RW_ASSERT(rw, ts != NULL);
rcnt = TS_WAITERS(ts, TS_READER_Q);
wcnt = TS_WAITERS(ts, TS_WRITER_Q);
if (rcnt == 0) {
/*
* If there are no readers, just preserve the
* waiters bits, swap us down to one read hold and
* return.
*/
RW_ASSERT(rw, wcnt != 0);
RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
RW_ASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);
newown = owner & RW_NODEBUG;
newown |= RW_READ_INCR | RW_HAS_WAITERS |
RW_WRITE_WANTED;
next = rw_cas(rw, owner, newown);
turnstile_exit(rw);
if (__predict_true(next == owner))
break;
} else {
/*
* Give the lock to all blocked readers. We may
* retain one read hold if downgrading. If there is
* a writer waiting, new readers will be blocked
* out.
*/
newown = owner & RW_NODEBUG;
newown += (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
if (wcnt != 0)
newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;
next = rw_cas(rw, owner, newown);
if (__predict_true(next == owner)) {
/* Wake up all sleeping readers. */
turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
break;
}
turnstile_exit(rw);
}
}
RW_WANTLOCK(rw, RW_READER);
RW_LOCKED(rw, RW_READER);
RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
RW_ASSERT(rw, RW_COUNT(rw) != 0);
}
/*
* rw_tryupgrade:
*
* Try to upgrade a read lock to a write lock. We must be the only
* reader.
*/
int
rw_tryupgrade(krwlock_t *rw)
{
uintptr_t owner, curthread, newown, next;
struct lwp *l;
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, curthread != 0);
RW_ASSERT(rw, rw_read_held(rw));
for (owner = RW_READ_INCR;; owner = next) {
newown = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
next = rw_cas(rw, owner, newown);
if (__predict_true(next == owner)) {
membar_acquire();
break;
}
RW_ASSERT(rw, (next & RW_WRITE_LOCKED) == 0);
if (__predict_false((next & RW_THREAD) != RW_READ_INCR)) {
RW_ASSERT(rw, (next & RW_THREAD) != 0);
return 0;
}
}
RW_UNLOCKED(rw, RW_READER);
RW_WANTLOCK(rw, RW_WRITER);
RW_LOCKED(rw, RW_WRITER);
RW_ASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
RW_ASSERT(rw, RW_OWNER(rw) == curthread);
return 1;
}
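/*
 * Illustrative sketch, compiled out and not part of the original source:
 * the usual pattern around rw_tryupgrade() and rw_downgrade().  If the
 * upgrade fails because other readers hold the lock, the caller must
 * drop the read hold, re-acquire as a writer, and re-validate anything
 * learned under the read hold.
 */
#if 0
static void
example_upgrade_downgrade(krwlock_t *lock)
{
	rw_enter(lock, RW_READER);
	/* ... inspect shared state ... */
	if (!rw_tryupgrade(lock)) {
		rw_exit(lock);
		rw_enter(lock, RW_WRITER);
		/* ... the state may have changed; re-check it here ... */
	}
	/* ... modify state under the write hold ... */
	rw_downgrade(lock);	/* keep a read hold, let other readers in */
	/* ... continue reading ... */
	rw_exit(lock);
}
#endif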
/*
* rw_read_held:
*
* Returns true if the rwlock is held for reading. Must only be
* used for diagnostic assertions, and never be used to make
* decisions about how to use a rwlock.
*/
int
rw_read_held(krwlock_t *rw)
{
uintptr_t owner;
if (rw == NULL)
return 0;
owner = rw->rw_owner;
return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0;
}
/*
* rw_write_held:
*
* Returns true if the rwlock is held for writing. Must only be
* used for diagnostic assertions, and never be used to make
* decisions about how to use a rwlock.
*/
int
rw_write_held(krwlock_t *rw)
{
if (rw == NULL)
return 0;
return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) ==
(RW_WRITE_LOCKED | (uintptr_t)curlwp);
}
/*
* rw_lock_held:
*
* Returns true if the rwlock is held for reading or writing. Must
* only be used for diagnostic assertions, and never be used to make
* decisions about how to use a rwlock.
*/
int
rw_lock_held(krwlock_t *rw)
{
if (rw == NULL)
return 0;
return (rw->rw_owner & RW_THREAD) != 0;
}
/*
* rw_lock_op:
*
* For a rwlock that is known to be held by the caller, return
* RW_READER or RW_WRITER to describe the hold type.
*/
krw_t
rw_lock_op(krwlock_t *rw)
{
RW_ASSERT(rw, rw_lock_held(rw));
return (rw->rw_owner & RW_WRITE_LOCKED) != 0 ? RW_WRITER : RW_READER;
}
/*
* rw_owner:
*
* Return the current owner of an RW lock, but only if it is write
* held. Used for priority inheritance.
*/
static lwp_t *
rw_owner(wchan_t obj)
{
krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
uintptr_t owner = rw->rw_owner;
if ((owner & RW_WRITE_LOCKED) == 0)
return NULL;
return (void *)(owner & RW_THREAD);
}
/* $NetBSD: vfs_bio.c,v 1.303 2022/03/30 14:54:29 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran, and by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
*/
/*-
* Copyright (c) 1994 Christopher G. Demetriou
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
*/
/*
* The buffer cache subsystem.
*
* Some references:
* Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
* Leffler, et al.: The Design and Implementation of the 4.3BSD
* UNIX Operating System (Addison-Wesley, 1989)
*
* Locking
*
* There are three locks:
* - bufcache_lock: protects global buffer cache state.
* - BC_BUSY: a long term per-buffer lock.
* - buf_t::b_objlock: lock on completion (biowait vs biodone).
*
* For buffers associated with vnodes (the most common case), b_objlock points
* to the vnode_t::v_interlock. Otherwise, it points to the generic buffer_lock.
*
* Lock order:
* bufcache_lock ->
* buf_t::b_objlock
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.303 2022/03/30 14:54:29 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_bufcache.h"
#include "opt_dtrace.h"
#include "opt_biohist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/intr.h>
#include <sys/cpu.h>
#include <sys/wapbl.h>
#include <sys/bitops.h>
#include <sys/cprng.h>
#include <sys/sdt.h>
#include <uvm/uvm.h> /* extern struct uvm uvm */
#include <miscfs/specfs/specdev.h>
SDT_PROVIDER_DEFINE(io);
SDT_PROBE_DEFINE4(io, kernel, , bbusy__start,
"struct buf *"/*bp*/,
"bool"/*intr*/, "int"/*timo*/, "kmutex_t *"/*interlock*/);
SDT_PROBE_DEFINE5(io, kernel, , bbusy__done,
"struct buf *"/*bp*/,
"bool"/*intr*/,
"int"/*timo*/,
"kmutex_t *"/*interlock*/,
"int"/*error*/);
SDT_PROBE_DEFINE0(io, kernel, , getnewbuf__start);
SDT_PROBE_DEFINE1(io, kernel, , getnewbuf__done, "struct buf *"/*bp*/);
SDT_PROBE_DEFINE3(io, kernel, , getblk__start,
"struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/);
SDT_PROBE_DEFINE4(io, kernel, , getblk__done,
"struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/,
"struct buf *"/*bp*/);
SDT_PROBE_DEFINE2(io, kernel, , brelse, "struct buf *"/*bp*/, "int"/*set*/);
SDT_PROBE_DEFINE1(io, kernel, , wait__start, "struct buf *"/*bp*/);
SDT_PROBE_DEFINE1(io, kernel, , wait__done, "struct buf *"/*bp*/);
#ifndef BUFPAGES
# define BUFPAGES 0
#endif
#ifdef BUFCACHE
# if (BUFCACHE < 5) || (BUFCACHE > 95)
# error BUFCACHE is not between 5 and 95
# endif
#else
# define BUFCACHE 15
#endif
u_int nbuf; /* desired number of buffer headers */
u_int bufpages = BUFPAGES; /* optional hardwired count */
u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */
/*
* Definitions for the buffer free lists.
*/
#define BQUEUES 3 /* number of free buffer queues */
#define BQ_LOCKED 0 /* super-blocks &c */
#define BQ_LRU 1 /* lru, useful buffers */
#define BQ_AGE 2 /* rubbish */
struct bqueue {
TAILQ_HEAD(, buf) bq_queue;
uint64_t bq_bytes;
buf_t *bq_marker;
};
static struct bqueue bufqueues[BQUEUES] __cacheline_aligned;
/* Function prototypes */
static void buf_setwm(void);
static int buf_trim(void);
static void *bufpool_page_alloc(struct pool *, int);
static void bufpool_page_free(struct pool *, void *);
static buf_t *bio_doread(struct vnode *, daddr_t, int, int);
static buf_t *getnewbuf(int, int, int);
static int buf_lotsfree(void);
static int buf_canrelease(void);
static u_long buf_mempoolidx(u_long);
static u_long buf_roundsize(u_long);
static void *buf_alloc(size_t);
static void buf_mrelease(void *, size_t);
static void binsheadfree(buf_t *, struct bqueue *);
static void binstailfree(buf_t *, struct bqueue *);
#ifdef DEBUG
static int checkfreelist(buf_t *, struct bqueue *, int);
#endif
static void biointr(void *);
static void biodone2(buf_t *);
static void sysctl_kern_buf_setup(void);
static void sysctl_vm_buf_setup(void);
/* Initialization for biohist */
#include <sys/biohist.h>
BIOHIST_DEFINE(biohist);
void
biohist_init(void)
{
BIOHIST_INIT(biohist, BIOHIST_SIZE);
}
/*
* Definitions for the buffer hash lists.
*/
#define BUFHASH(dvp, lbn) \
(&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long bufhash;
static int bufhash_stats(struct hashstat_sysctl *, bool);
static kcondvar_t needbuffer_cv;
/*
* Buffer queue lock.
*/
kmutex_t bufcache_lock __cacheline_aligned;
kmutex_t buffer_lock __cacheline_aligned;
/* Software ISR for completed transfers. */
static void *biodone_sih;
/* Buffer pool for I/O buffers. */
static pool_cache_t buf_cache;
static pool_cache_t bufio_cache;
#define MEMPOOL_INDEX_OFFSET (ilog2(DEV_BSIZE)) /* smallest pool is 512 bytes */
#define NMEMPOOLS (ilog2(MAXBSIZE) - MEMPOOL_INDEX_OFFSET + 1)
__CTASSERT((1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) == MAXBSIZE);
/* Buffer memory pools */
static struct pool bmempools[NMEMPOOLS];
static struct vm_map *buf_map;
/*
* Buffer memory pool allocator.
*/
static void *
bufpool_page_alloc(struct pool *pp, int flags)
{
return (void *)uvm_km_alloc(buf_map,
MAXBSIZE, MAXBSIZE,
((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT|UVM_KMF_TRYLOCK)
| UVM_KMF_WIRED);
}
static void
bufpool_page_free(struct pool *pp, void *v)
{
uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
}
static struct pool_allocator bufmempool_allocator = {
.pa_alloc = bufpool_page_alloc,
.pa_free = bufpool_page_free,
.pa_pagesz = MAXBSIZE,
};
/* Buffer memory management variables */
u_long bufmem_valimit;
u_long bufmem_hiwater;
u_long bufmem_lowater;
u_long bufmem;
/*
* MD code can call this to set a hard limit on the amount
* of virtual memory used by the buffer cache.
*/
int
buf_setvalimit(vsize_t sz)
{
/* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */
if (sz < NMEMPOOLS * MAXBSIZE)
return EINVAL;
bufmem_valimit = sz;
return 0;
}
static void
buf_setwm(void)
{
bufmem_hiwater = buf_memcalc();
/* lowater is approx. 2% of memory (with bufcache = 15) */
#define BUFMEM_WMSHIFT 3
#define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT)
if (bufmem_hiwater < BUFMEM_HIWMMIN)
/* Ensure a reasonable minimum value */
bufmem_hiwater = BUFMEM_HIWMMIN;
bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT;
}
#ifdef DEBUG
int debug_verify_freelist = 0;
static int
checkfreelist(buf_t *bp, struct bqueue *dp, int ison)
{
buf_t *b;
if (!debug_verify_freelist)
return 1;
TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) {
if (b == bp)
return ison ? 1 : 0;
}
return ison ? 0 : 1;
}
#endif
/*
* Insq/Remq for the buffer hash lists.
* Call with buffer queue locked.
*/
static void
binsheadfree(buf_t *bp, struct bqueue *dp)
{
KASSERT(mutex_owned(&bufcache_lock));
KASSERT(bp->b_freelistindex == -1);
TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist);
dp->bq_bytes += bp->b_bufsize;
bp->b_freelistindex = dp - bufqueues;
}
static void
binstailfree(buf_t *bp, struct bqueue *dp)
{
KASSERT(mutex_owned(&bufcache_lock));
KASSERTMSG(bp->b_freelistindex == -1, "double free of buffer? "
"bp=%p, b_freelistindex=%d\n", bp, bp->b_freelistindex);
TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist);
dp->bq_bytes += bp->b_bufsize;
bp->b_freelistindex = dp - bufqueues;
}
void
bremfree(buf_t *bp)
{
struct bqueue *dp;
int bqidx = bp->b_freelistindex;
KASSERT(mutex_owned(&bufcache_lock));
KASSERT(bqidx != -1);
dp = &bufqueues[bqidx];
KDASSERT(checkfreelist(bp, dp, 1));
KASSERT(dp->bq_bytes >= bp->b_bufsize);
TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist);
dp->bq_bytes -= bp->b_bufsize;
/* For the sysctl helper. */
if (bp == dp->bq_marker)
dp->bq_marker = NULL;
#if defined(DIAGNOSTIC)
bp->b_freelistindex = -1;
#endif /* defined(DIAGNOSTIC) */
}
/*
* note that for some ports this is used by pmap bootstrap code to
* determine kva size.
*/
u_long
buf_memcalc(void)
{
u_long n;
vsize_t mapsz = 0;
/*
* Determine the upper bound of memory to use for buffers.
*
* - If bufpages is specified, use that as the number
* of pages.
*
* - Otherwise, use bufcache as the percentage of
* physical memory.
*/
if (bufpages != 0) {
n = bufpages;
} else {
if (bufcache < 5) {
printf("forcing bufcache %d -> 5", bufcache);
bufcache = 5;
}
if (bufcache > 95) {
printf("forcing bufcache %d -> 95", bufcache);
bufcache = 95;
}
if (buf_map != NULL)
mapsz = vm_map_max(buf_map) - vm_map_min(buf_map);
n = calc_cache_size(mapsz, bufcache,
(buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT)
/ PAGE_SIZE;
}
n <<= PAGE_SHIFT;
if (bufmem_valimit != 0 && n > bufmem_valimit)
n = bufmem_valimit;
return (n);
}
/*
* Initialize buffers and hash links for buffers.
*/
void
bufinit(void)
{
struct bqueue *dp;
int use_std;
u_int i;
biodone_vfs = biodone;
mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&needbuffer_cv, "needbuf");
if (bufmem_valimit != 0) {
vaddr_t minaddr = 0, maxaddr;
buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
bufmem_valimit, 0, false, 0);
if (buf_map == NULL)
panic("bufinit: cannot allocate submap");
} else
buf_map = kernel_map;
/*
* Initialize buffer cache memory parameters.
*/
bufmem = 0;
buf_setwm();
/* On "small" machines use small pool page sizes where possible */
use_std = (physmem < atop(16*1024*1024));
/*
* Also use them on systems that can map the pool pages using
* a direct-mapped segment.
*/
#ifdef PMAP_MAP_POOLPAGE
use_std = 1;
#endif
buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
"bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
"biopl", NULL, IPL_BIO, NULL, NULL, NULL);
for (i = 0; i < NMEMPOOLS; i++) {
struct pool_allocator *pa;
struct pool *pp = &bmempools[i];
u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET);
char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */
if (__predict_false(size >= 1048576))
(void)snprintf(name, 8, "buf%um", size / 1048576);
else if (__predict_true(size >= 1024))
(void)snprintf(name, 8, "buf%uk", size / 1024);
else
(void)snprintf(name, 8, "buf%ub", size);
pa = (size <= PAGE_SIZE && use_std)
? &pool_allocator_nointr
: &bufmempool_allocator;
pool_init(pp, size, DEV_BSIZE, 0, 0, name, pa, IPL_NONE);
pool_setlowat(pp, 1);
pool_sethiwat(pp, 1);
}
/* Initialize the buffer queues */
for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
TAILQ_INIT(&dp->bq_queue);
dp->bq_bytes = 0;
}
/*
* Estimate hash table size based on the amount of memory we
* intend to use for the buffer cache. The average buffer
* size is dependent on our clients (i.e. filesystems).
*
* For now, use an empirical 3K per buffer.
*/
nbuf = (bufmem_hiwater / 1024) / 3;
bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash);
sysctl_kern_buf_setup();
sysctl_vm_buf_setup();
hashstat_register("bufhash", bufhash_stats);
}
void
bufinit2(void)
{
biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr,
NULL);
if (biodone_sih == NULL)
panic("bufinit2: can't establish soft interrupt");
}
static int
buf_lotsfree(void)
{
u_long guess;
/* Always allocate if less than the low water mark. */
if (bufmem < bufmem_lowater)
return 1;
/* Never allocate if greater than the high water mark. */
if (bufmem > bufmem_hiwater)
return 0;
/* If there's anything on the AGE list, it should be eaten. */
if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL)
return 0;
/*
* The probability of getting a new allocation falls off roughly
* linearly as the current size of the cache grows above the low
* water mark toward the high water mark. Divide the total first
* to avoid overflows in the product.
*/
guess = cprng_fast32() % 16;
if ((bufmem_hiwater - bufmem_lowater) / 16 * guess >=
(bufmem - bufmem_lowater))
return 1;
/* Otherwise don't allocate. */
return 0;
}
/*
* Return estimate of bytes we think need to be
* released to help resolve low memory conditions.
*
* => called with bufcache_lock held.
*/
static int
buf_canrelease(void)
{
int pagedemand, ninvalid = 0;
KASSERT(mutex_owned(&bufcache_lock));
if (bufmem < bufmem_lowater)
return 0;
if (bufmem > bufmem_hiwater)
return bufmem - bufmem_hiwater;
ninvalid += bufqueues[BQ_AGE].bq_bytes;
pagedemand = uvmexp.freetarg - uvm_availmem(false);
if (pagedemand < 0)
return ninvalid;
return MAX(ninvalid, MIN(2 * MAXBSIZE,
MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE)));
}
/*
* Buffer memory allocation helper functions
*/
static u_long
buf_mempoolidx(u_long size)
{
u_int n = 0;
size -= 1;
size >>= MEMPOOL_INDEX_OFFSET;
while (size) {
size >>= 1;
n += 1;
}
if (n >= NMEMPOOLS)
panic("buf mem pool index %d", n);
return n;
}
static u_long
buf_roundsize(u_long size)
{
/* Round up to nearest power of 2 */
return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET));
}
static void *
buf_alloc(size_t size)
{
u_int n = buf_mempoolidx(size);
void *addr;
while (1) {
addr = pool_get(&bmempools[n], PR_NOWAIT);
if (addr != NULL)
break;
/* No memory, see if we can free some. If so, try again */
mutex_enter(&bufcache_lock);
if (buf_drain(1) > 0) {
mutex_exit(&bufcache_lock);
continue;
}
if (curlwp == uvm.pagedaemon_lwp) {
mutex_exit(&bufcache_lock);
return NULL;
}
/* Wait for buffers to arrive on the LRU queue */
cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4);
mutex_exit(&bufcache_lock);
}
return addr;
}
static void
buf_mrelease(void *addr, size_t size)
{
pool_put(&bmempools[buf_mempoolidx(size)], addr);
}
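/*
 * Illustrative sketch, compiled out and not part of the original source:
 * how the pool index and rounding above work out for concrete request
 * sizes, assuming DEV_BSIZE == 512 so that the smallest pool is 512 bytes.
 */
#if 0
static void
example_buf_rounding(void)
{
	KASSERT(buf_mempoolidx(512) == 0);
	KASSERT(buf_roundsize(512) == 512);
	KASSERT(buf_mempoolidx(3000) == 3);	/* 512 << 3 == 4096 */
	KASSERT(buf_roundsize(3000) == 4096);
}
#endif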
/*
* bread()/breadn() helper.
*/
static buf_t *
bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
{
buf_t *bp;
struct mount *mp;
bp = getblk(vp, blkno, size, 0, 0);
/*
* getblk() may return NULL if we are the pagedaemon.
*/
if (bp == NULL) {
KASSERT(curlwp == uvm.pagedaemon_lwp);
return NULL;
}
/*
* If the buffer does not have valid data, start a read.
* Note that if the buffer is BC_INVAL, getblk() won't return it.
* Therefore, it's valid if its I/O has completed or been delayed.
*/
if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) {
/* Start I/O for the buffer. */
SET(bp->b_flags, B_READ | async);
if (async)
BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
else
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
VOP_STRATEGY(vp, bp);
/* Pay for the read. */
curlwp->l_ru.ru_inblock++;
} else if (async)
brelse(bp, 0);
if (vp->v_type == VBLK)
mp = spec_node_getmountedfs(vp);
else
mp = vp->v_mount;
/*
* Collect statistics on synchronous and asynchronous reads.
* Reads from block devices are charged to their associated
* filesystem (if any).
*/
if (mp != NULL) {
if (async == 0)
mp->mnt_stat.f_syncreads++;
else
mp->mnt_stat.f_asyncreads++;
}
return (bp);
}
/*
* Read a disk block.
* This algorithm described in Bach (p.54).
*/
int
bread(struct vnode *vp, daddr_t blkno, int size, int flags, buf_t **bpp)
{
buf_t *bp;
int error;
BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
/* Get buffer for block. */
bp = *bpp = bio_doread(vp, blkno, size, 0);
if (bp == NULL)
return ENOMEM;
/* Wait for the read to complete, and return result. */
error = biowait(bp);
if (error == 0 && (flags & B_MODIFY) != 0)
error = fscow_run(bp, true);
if (error) {
brelse(bp, 0);
*bpp = NULL;
}
return error;
}
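/*
 * Illustrative sketch, compiled out and not part of the original source:
 * the usual consumer pattern for bread().  The vnode, block number and
 * size are assumed to come from the caller.
 */
#if 0
static int
example_read_block(struct vnode *vp, daddr_t blkno, int size)
{
	buf_t *bp;
	int error;

	error = bread(vp, blkno, size, 0, &bp);
	if (error)
		return error;	/* on error bread() already released bp */

	/* ... consume the "size" valid bytes at bp->b_data ... */

	brelse(bp, 0);		/* hand the buffer back to the cache */
	return 0;
}
#endif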
/*
* Read-ahead multiple disk blocks. The first is sync, the rest async.
* Trivial modification to the breada algorithm presented in Bach (p.55).
*/
int
breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
int *rasizes, int nrablks, int flags, buf_t **bpp)
{
buf_t *bp;
int error, i;
BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
bp = *bpp = bio_doread(vp, blkno, size, 0);
if (bp == NULL)
return ENOMEM;
/*
* For each of the read-ahead blocks, start a read, if necessary.
*/
mutex_enter(&bufcache_lock);
for (i = 0; i < nrablks; i++) {
/* If it's in the cache, just go on to next one. */
if (incore(vp, rablks[i]))
continue;
/* Get a buffer for the read-ahead block */
mutex_exit(&bufcache_lock);
(void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
mutex_enter(&bufcache_lock);
}
mutex_exit(&bufcache_lock);
/* Otherwise, we had to start a read for it; wait until it's valid. */
error = biowait(bp);
if (error == 0 && (flags & B_MODIFY) != 0)
error = fscow_run(bp, true);
if (error) {
brelse(bp, 0);
*bpp = NULL;
}
return error;
}
/*
* Block write. Described in Bach (p.56)
*/
int
bwrite(buf_t *bp)
{
int rv, sync, wasdelayed;
struct vnode *vp;
struct mount *mp;
BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
(uintptr_t)bp, 0, 0, 0);
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(!cv_has_waiters(&bp->b_done));
vp = bp->b_vp;
/*
* dholland 20160728 AFAICT vp==NULL must be impossible as it
* will crash upon reaching VOP_STRATEGY below... see further
* analysis on tech-kern.
*/
KASSERTMSG(vp != NULL, "bwrite given buffer with null vnode");
if (vp != NULL) {
KASSERT(bp->b_objlock == vp->v_interlock);
if (vp->v_type == VBLK)
mp = spec_node_getmountedfs(vp);
else
mp = vp->v_mount;
} else {
mp = NULL;
}
if (mp && mp->mnt_wapbl) {
if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
bdwrite(bp);
return 0;
}
}
/*
* Remember buffer type, to switch on it later. If the write was
* synchronous, but the file system was mounted with MNT_ASYNC,
* convert it to a delayed write.
* XXX note that this relies on delayed tape writes being converted
* to async, not sync writes (which is safe, but ugly).
*/
sync = !ISSET(bp->b_flags, B_ASYNC);
if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
bdwrite(bp);
return (0);
}
/*
* Collect statistics on synchronous and asynchronous writes.
* Writes to block devices are charged to their associated
* filesystem (if any).
*/
if (mp != NULL) {
if (sync)
mp->mnt_stat.f_syncwrites++;
else
mp->mnt_stat.f_asyncwrites++;
}
/*
* Pay for the I/O operation and make sure the buf is on the correct
* vnode queue.
*/
bp->b_error = 0;
wasdelayed = ISSET(bp->b_oflags, BO_DELWRI);
CLR(bp->b_flags, B_READ);
if (wasdelayed) {
mutex_enter(&bufcache_lock);
mutex_enter(bp->b_objlock);
CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
reassignbuf(bp, bp->b_vp);
/* Wake anyone trying to busy the buffer via vnode's lists. */
cv_broadcast(&bp->b_busy);
mutex_exit(&bufcache_lock);
} else {
curlwp->l_ru.ru_oublock++;
mutex_enter(bp->b_objlock);
CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
}
if (vp != NULL)
vp->v_numoutput++;
mutex_exit(bp->b_objlock);
/* Initiate disk write. */
if (sync)
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
else
BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
VOP_STRATEGY(vp, bp);
if (sync) {
/* If I/O was synchronous, wait for it to complete. */
rv = biowait(bp);
/* Release the buffer. */
brelse(bp, 0);
return (rv);
} else {
return (0);
}
}
int
vn_bwrite(void *v)
{
struct vop_bwrite_args *ap = v;
return (bwrite(ap->a_bp));
}
/*
* Delayed write.
*
* The buffer is marked dirty, but is not queued for I/O.
* This routine should be used when the buffer is expected
* to be modified again soon, typically a small write that
* partially fills a buffer.
*
* NB: magnetic tapes cannot be delayed; they must be
* written in the order that the writes are requested.
*
* Described in Leffler, et al. (pp. 208-213).
*/
void
bdwrite(buf_t *bp)
{
BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
(uintptr_t)bp, 0, 0, 0);
KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS ||
bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE));
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(!cv_has_waiters(&bp->b_done));
/* If this is a tape block, write the block now. */
if (bdev_type(bp->b_dev) == D_TAPE) {
bawrite(bp);
return;
}
if (wapbl_vphaswapbl(bp->b_vp)) {
struct mount *mp = wapbl_vptomp(bp->b_vp);
if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
WAPBL_ADD_BUF(mp, bp);
}
}
/*
* If the block hasn't been seen before:
* (1) Mark it as having been seen,
* (2) Charge for the write,
* (3) Make sure it's on its vnode's correct block list.
*/
KASSERT(bp->b_vp == NULL || bp->b_objlock == bp->b_vp->v_interlock);
if (!ISSET(bp->b_oflags, BO_DELWRI)) {
mutex_enter(&bufcache_lock);
mutex_enter(bp->b_objlock);
SET(bp->b_oflags, BO_DELWRI);
curlwp->l_ru.ru_oublock++;
reassignbuf(bp, bp->b_vp);
/* Wake anyone trying to busy the buffer via vnode's lists. */
cv_broadcast(&bp->b_busy);
mutex_exit(&bufcache_lock);
} else {
mutex_enter(bp->b_objlock);
}
/* Otherwise, the "write" is done, so mark and release the buffer. */
CLR(bp->b_oflags, BO_DONE);
mutex_exit(bp->b_objlock);
brelse(bp, 0);
}
/*
* Asynchronous block write; just an asynchronous bwrite().
*/
void
bawrite(buf_t *bp)
{
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(bp->b_vp != NULL);
SET(bp->b_flags, B_ASYNC);
VOP_BWRITE(bp->b_vp, bp);
}
/*
* Release a buffer on to the free lists.
* Described in Bach (p. 46).
*/
void
brelsel(buf_t *bp, int set)
{
struct bqueue *bufq;
struct vnode *vp;
SDT_PROBE2(io, kernel, , brelse, bp, set);
KASSERT(bp != NULL);
KASSERT(mutex_owned(&bufcache_lock));
KASSERT(!cv_has_waiters(&bp->b_done));
SET(bp->b_cflags, set);
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(bp->b_iodone == NULL);
/* Wake up any processes waiting for any buffer to become free. */
cv_signal(&needbuffer_cv);
/* Wake up any processes waiting for _this_ buffer to become free. */
if (ISSET(bp->b_cflags, BC_WANTED))
CLR(bp->b_cflags, BC_WANTED|BC_AGE);
/* If it's clean clear the copy-on-write flag. */
if (ISSET(bp->b_flags, B_COWDONE)) {
mutex_enter(bp->b_objlock);
if (!ISSET(bp->b_oflags, BO_DELWRI))
CLR(bp->b_flags, B_COWDONE);
mutex_exit(bp->b_objlock);
}
/*
* Determine which queue the buffer should be on, then put it there.
*/
/* If it's locked, don't report an error; try again later. */
if (ISSET(bp->b_flags, B_LOCKED))
bp->b_error = 0;
/* If it's not cacheable, or an error, mark it invalid. */
if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0)
SET(bp->b_cflags, BC_INVAL);
if (ISSET(bp->b_cflags, BC_VFLUSH)) {
/*
* This is a delayed write buffer that was just flushed to
* disk. It is still on the LRU queue. If it's become
* invalid, then we need to move it to a different queue;
* otherwise leave it in its current position.
*/
CLR(bp->b_cflags, BC_VFLUSH);
if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) &&
!ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) {
KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1));
goto already_queued;
} else {
bremfree(bp);
}
}
KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0));
KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0));
KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0));
if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) {
/*
* If it's invalid or empty, dissociate it from its vnode
* and put on the head of the appropriate queue.
*/
if (ISSET(bp->b_flags, B_LOCKED)) {
if (wapbl_vphaswapbl(vp = bp->b_vp)) {
struct mount *mp = wapbl_vptomp(vp);
KASSERT(bp->b_iodone
!= mp->mnt_wapbl_op->wo_wapbl_biodone);
WAPBL_REMOVE_BUF(mp, bp);
}
}
mutex_enter(bp->b_objlock);
CLR(bp->b_oflags, BO_DONE|BO_DELWRI);
if ((vp = bp->b_vp) != NULL) {
KASSERT(bp->b_objlock == vp->v_interlock);
reassignbuf(bp, bp->b_vp);
brelvp(bp);
mutex_exit(vp->v_interlock);
} else {
KASSERT(bp->b_objlock == &buffer_lock);
mutex_exit(bp->b_objlock);
}
/* We want to dispose of the buffer, so wake everybody. */
cv_broadcast(&bp->b_busy);
if (bp->b_bufsize <= 0)
/* no data */
goto already_queued;
else
/* invalid data */
bufq = &bufqueues[BQ_AGE];
binsheadfree(bp, bufq);
} else {
/*
* It has valid data. Put it on the end of the appropriate
* queue, so that it'll stick around for as long as possible.
* If the buf is AGE but has dependencies, it must be put on the last
* bufqueue to be scanned, i.e. LRU. This protects against the
* livelock where BQ_AGE only has buffers with dependencies,
* and we thus never get to the dependent buffers in BQ_LRU.
*/
if (ISSET(bp->b_flags, B_LOCKED)) {
/* locked in core */
bufq = &bufqueues[BQ_LOCKED];
} else if (!ISSET(bp->b_cflags, BC_AGE)) {
/* valid data */
bufq = &bufqueues[BQ_LRU];
} else {
/* stale but valid data */
bufq = &bufqueues[BQ_AGE];
}
binstailfree(bp, bufq);
}
already_queued:
/* Unlock the buffer. */
CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE);
CLR(bp->b_flags, B_ASYNC);
/*
* Wake only the highest priority waiter on the lock, in order to
* prevent a thundering herd: many LWPs simultaneously awakening and
* competing for the buffer's lock. Testing in 2019 revealed this
* to reduce contention on bufcache_lock tenfold during a kernel
* compile. Here and elsewhere, when the buffer is changing
* identity, being disposed of, or moving from one list to another,
* we wake all lock requestors.
*/
if (bp->b_bufsize <= 0) {
cv_broadcast(&bp->b_busy);
buf_destroy(bp);
#ifdef DEBUG
memset((char *)bp, 0, sizeof(*bp));
#endif
pool_cache_put(buf_cache, bp);
} else
cv_signal(&bp->b_busy);
}
void
brelse(buf_t *bp, int set)
{
mutex_enter(&bufcache_lock);
brelsel(bp, set);
mutex_exit(&bufcache_lock);
}
/*
* Determine if a block is in the cache.
* Just look on what would be its hash chain. If it's there, return
* a pointer to it, unless it's marked invalid. If it's marked invalid,
* we normally don't return the buffer, unless the caller explicitly
* wants us to.
*/
buf_t *
incore(struct vnode *vp, daddr_t blkno)
{
buf_t *bp;
KASSERT(mutex_owned(&bufcache_lock));
/* Search hash chain */
LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
if (bp->b_lblkno == blkno && bp->b_vp == vp &&
!ISSET(bp->b_cflags, BC_INVAL)) {
KASSERT(bp->b_objlock == vp->v_interlock);
return (bp);
}
}
return (NULL);
}
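/*
 * Illustrative sketch (not part of the original source): a caller that
 * only wants to peek at the cache might use incore() like this; the
 * helper name example_cached() is a hypothetical name used for
 * illustration only.
 *
 *	static bool
 *	example_cached(struct vnode *vp, daddr_t blkno)
 *	{
 *		buf_t *bp;
 *
 *		mutex_enter(&bufcache_lock);
 *		bp = incore(vp, blkno);
 *		mutex_exit(&bufcache_lock);
 *		return bp != NULL;
 *	}
 */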
/*
* Get a block of requested size that is associated with
* a given vnode and block offset. If it is found in the
* block cache, mark it as having been found, make it busy
* and return it. Otherwise, return an empty block of the
* correct size. It is up to the caller to ensure that the
* cached blocks are of the correct size.
*/
buf_t *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
int err, preserve;
buf_t *bp;
mutex_enter(&bufcache_lock);
SDT_PROBE3(io, kernel, , getblk__start, vp, blkno, size);
loop:
bp = incore(vp, blkno);
if (bp != NULL) {
err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL);
if (err != 0) {
if (err == EPASSTHROUGH)
goto loop;
mutex_exit(&bufcache_lock);
SDT_PROBE4(io, kernel, , getblk__done,
vp, blkno, size, NULL);
return (NULL);
}
KASSERT(!cv_has_waiters(&bp->b_done));
#ifdef DIAGNOSTIC
if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) &&
bp->b_bcount < size && vp->v_type != VBLK)
panic("getblk: block size invariant failed");
#endif
bremfree(bp);
preserve = 1;
} else {
if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL)
goto loop;
if (incore(vp, blkno) != NULL) {
/* The block has come into memory in the meantime. */
brelsel(bp, 0);
goto loop;
}
LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash);
bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
mutex_enter(vp->v_interlock);
bgetvp(vp, bp);
mutex_exit(vp->v_interlock);
preserve = 0;
}
mutex_exit(&bufcache_lock);
/*
* LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
* if we re-size buffers here.
*/
if (ISSET(bp->b_flags, B_LOCKED)) {
KASSERT(bp->b_bufsize >= size);
} else {
if (allocbuf(bp, size, preserve)) {
mutex_enter(&bufcache_lock);
LIST_REMOVE(bp, b_hash);
brelsel(bp, BC_INVAL);
mutex_exit(&bufcache_lock);
SDT_PROBE4(io, kernel, , getblk__done,
vp, blkno, size, NULL);
return NULL;
}
}
BIO_SETPRIO(bp, BPRIO_DEFAULT);
SDT_PROBE4(io, kernel, , getblk__done, vp, blkno, size, bp);
return (bp);
}
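/*
 * Illustrative sketch (assumption, not part of the original source):
 * a minimal consumer of getblk()/bwrite() that obtains a buffer for a
 * block it intends to overwrite completely; example_overwrite() is a
 * hypothetical name and the error handling is simplified.
 *
 *	static int
 *	example_overwrite(struct vnode *vp, daddr_t blkno, int size,
 *	    const void *src)
 *	{
 *		buf_t *bp;
 *
 *		bp = getblk(vp, blkno, size, 0, 0);
 *		if (bp == NULL)
 *			return ENOMEM;
 *		memcpy(bp->b_data, src, size);
 *		return bwrite(bp);
 *	}
 */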
/*
* Get an empty, disassociated buffer of given size.
*/
buf_t *
geteblk(int size)
{
buf_t *bp;
int error __diagused;
mutex_enter(&bufcache_lock);
while ((bp = getnewbuf(0, 0, 0)) == NULL)
;
SET(bp->b_cflags, BC_INVAL);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
mutex_exit(&bufcache_lock);
BIO_SETPRIO(bp, BPRIO_DEFAULT);
error = allocbuf(bp, size, 0);
KASSERT(error == 0);
return (bp);
}
/*
* Expand or contract the actual memory allocated to a buffer.
*
* If the buffer shrinks, data is lost, so it's up to the
* caller to have written it out *first*; this routine will not
* start a write. If the buffer grows, it's the caller's
* responsibility to fill out the buffer's additional contents.
*/
int
allocbuf(buf_t *bp, int size, int preserve)
{
void *addr;
vsize_t oldsize, desired_size;
int oldcount;
int delta;
desired_size = buf_roundsize(size);
if (desired_size > MAXBSIZE)
printf("allocbuf: buffer larger than MAXBSIZE requested");
oldcount = bp->b_bcount;
bp->b_bcount = size;
oldsize = bp->b_bufsize;
if (oldsize == desired_size) {
/*
* Do not short cut the WAPBL resize, as the buffer length
* could still have changed and this would corrupt the
* tracking of the transaction length.
*/
goto out;
}
/*
* If we want a buffer of a different size, re-allocate the
* buffer's memory; copy old content only if needed.
*/
addr = buf_alloc(desired_size);
if (addr == NULL)
return ENOMEM;
if (preserve)
memcpy(addr, bp->b_data, MIN(oldsize, desired_size));
if (bp->b_data != NULL)
buf_mrelease(bp->b_data, oldsize);
bp->b_data = addr;
bp->b_bufsize = desired_size;
/*
* Update overall buffer memory counter (protected by bufcache_lock)
*/
delta = (long)desired_size - (long)oldsize;
mutex_enter(&bufcache_lock);
if ((bufmem += delta) > bufmem_hiwater) {
/*
* Need to trim overall memory usage.
*/
while (buf_canrelease()) {
if (preempt_needed()) {
mutex_exit(&bufcache_lock);
preempt();
mutex_enter(&bufcache_lock);
}
if (buf_trim() == 0)
break;
}
}
mutex_exit(&bufcache_lock);
out:
if (wapbl_vphaswapbl(bp->b_vp))
WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount);
return 0;
}
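/*
 * Illustrative sketch (assumption): growing a cached buffer in place
 * with allocbuf() while preserving its existing contents; the caller
 * is assumed to already hold the buffer busy (e.g. via getblk()).
 *
 *	error = allocbuf(bp, newsize, 1);	// preserve = copy old data
 *	if (error != 0) {
 *		brelse(bp, BC_INVAL);		// give up on the buffer
 *		return error;
 *	}
 */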
/*
* Find a buffer which is available for use.
* Select something from a free list.
* Preference is to AGE list, then LRU list.
*
* Called with the buffer queues locked.
* Return buffer locked.
*/
static buf_t *
getnewbuf(int slpflag, int slptimeo, int from_bufq)
{
buf_t *bp;
struct vnode *vp;
struct mount *transmp = NULL;
SDT_PROBE0(io, kernel, , getnewbuf__start);
start:
KASSERT(mutex_owned(&bufcache_lock));
/*
* Get a new buffer from the pool.
*/
if (!from_bufq && buf_lotsfree()) {
mutex_exit(&bufcache_lock);
bp = pool_cache_get(buf_cache, PR_NOWAIT);
if (bp != NULL) {
memset((char *)bp, 0, sizeof(*bp));
buf_init(bp);
SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */
mutex_enter(&bufcache_lock);
#if defined(DIAGNOSTIC)
bp->b_freelistindex = -1;
#endif /* defined(DIAGNOSTIC) */
SDT_PROBE1(io, kernel, , getnewbuf__done, bp);
return (bp);
}
mutex_enter(&bufcache_lock);
}
KASSERT(mutex_owned(&bufcache_lock));
if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL) {
KASSERT(!ISSET(bp->b_oflags, BO_DELWRI));
} else {
TAILQ_FOREACH(bp, &bufqueues[BQ_LRU].bq_queue, b_freelist) {
if (ISSET(bp->b_cflags, BC_VFLUSH) ||
!ISSET(bp->b_oflags, BO_DELWRI))
break;
if (fstrans_start_nowait(bp->b_vp->v_mount) == 0) {
KASSERT(transmp == NULL);
transmp = bp->b_vp->v_mount;
break;
}
}
}
if (bp != NULL) {
KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH));
bremfree(bp);
/* Buffer is no longer on free lists. */
SET(bp->b_cflags, BC_BUSY);
/* Wake anyone trying to lock the old identity. */
cv_broadcast(&bp->b_busy);
} else {
/*
* XXX: !from_bufq should be removed.
*/
if (!from_bufq || curlwp != uvm.pagedaemon_lwp) {
/* wait for a free buffer of any kind */
if ((slpflag & PCATCH) != 0)
(void)cv_timedwait_sig(&needbuffer_cv,
&bufcache_lock, slptimeo);
else
(void)cv_timedwait(&needbuffer_cv,
&bufcache_lock, slptimeo);
}
SDT_PROBE1(io, kernel, , getnewbuf__done, NULL);
return (NULL);
}
#ifdef DIAGNOSTIC
if (bp->b_bufsize <= 0)
panic("buffer %p: on queue but empty", bp);
#endif
if (ISSET(bp->b_cflags, BC_VFLUSH)) {
/*
* This is a delayed write buffer being flushed to disk. Make
* sure it gets aged out of the queue when it's finished, and
* leave it off the LRU queue.
*/
CLR(bp->b_cflags, BC_VFLUSH);
SET(bp->b_cflags, BC_AGE);
goto start;
}
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(!cv_has_waiters(&bp->b_done));
/*
* If buffer was a delayed write, start it and return NULL
* (since we might sleep while starting the write).
*/
if (ISSET(bp->b_oflags, BO_DELWRI)) {
/*
* This buffer has gone through the LRU, so make sure it gets
* reused ASAP.
*/
SET(bp->b_cflags, BC_AGE);
mutex_exit(&bufcache_lock);
bawrite(bp);
KASSERT(transmp != NULL);
fstrans_done(transmp);
mutex_enter(&bufcache_lock);
SDT_PROBE1(io, kernel, , getnewbuf__done, NULL);
return (NULL);
}
KASSERT(transmp == NULL);
vp = bp->b_vp;
/* clear out various other fields */
bp->b_cflags = BC_BUSY;
bp->b_oflags = 0;
bp->b_flags = 0;
bp->b_dev = NODEV;
bp->b_blkno = 0;
bp->b_lblkno = 0;
bp->b_rawblkno = 0;
bp->b_iodone = 0;
bp->b_error = 0;
bp->b_resid = 0;
bp->b_bcount = 0;
LIST_REMOVE(bp, b_hash);
/* Disassociate us from our vnode, if we had one... */
if (vp != NULL) {
mutex_enter(vp->v_interlock);
brelvp(bp);
mutex_exit(vp->v_interlock);
}
SDT_PROBE1(io, kernel, , getnewbuf__done, bp);
return (bp);
}
/*
* Invalidate the specified buffer if it exists.
*/
void
binvalbuf(struct vnode *vp, daddr_t blkno)
{
buf_t *bp;
int err;
mutex_enter(&bufcache_lock);
loop:
bp = incore(vp, blkno);
if (bp != NULL) {
err = bbusy(bp, 0, 0, NULL);
if (err == EPASSTHROUGH)
goto loop;
bremfree(bp);
if (ISSET(bp->b_oflags, BO_DELWRI)) {
SET(bp->b_cflags, BC_NOCACHE);
mutex_exit(&bufcache_lock);
bwrite(bp);
} else {
brelsel(bp, BC_INVAL);
mutex_exit(&bufcache_lock);
}
} else
mutex_exit(&bufcache_lock);
}
/*
* Attempt to free an aged buffer off the queues.
* Called with queue lock held.
* Returns the amount of buffer memory freed.
*/
static int
buf_trim(void)
{
buf_t *bp;
long size;
KASSERT(mutex_owned(&bufcache_lock));
/* Instruct getnewbuf() to get buffers off the queues */
if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL)
return 0;
KASSERT((bp->b_cflags & BC_WANTED) == 0);
size = bp->b_bufsize;
bufmem -= size;
if (size > 0) {
buf_mrelease(bp->b_data, size);
bp->b_bcount = bp->b_bufsize = 0;
}
/* brelse() will return the buffer to the global buffer pool */
brelsel(bp, 0);
return size;
}
int
buf_drain(int n)
{
int size = 0, sz;
KASSERT(mutex_owned(&bufcache_lock));
while (size < n && bufmem > bufmem_lowater) {
sz = buf_trim();
if (sz <= 0)
break;
size += sz;
}
return size;
}
/*
* Wait for operations on the buffer to complete.
* When they do, extract and return the I/O's error value.
*/
int
biowait(buf_t *bp)
{
BIOHIST_FUNC(__func__);
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
SDT_PROBE1(io, kernel, , wait__start, bp);
mutex_enter(bp->b_objlock);
BIOHIST_CALLARGS(biohist, "bp=%#jx, oflags=0x%jx, ret_addr=%#jx",
(uintptr_t)bp, bp->b_oflags,
(uintptr_t)__builtin_return_address(0), 0);
while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI)) {
BIOHIST_LOG(biohist, "waiting bp=%#jx", (uintptr_t)bp, 0, 0, 0);
cv_wait(&bp->b_done, bp->b_objlock);
}
mutex_exit(bp->b_objlock);
SDT_PROBE1(io, kernel, , wait__done, bp);
BIOHIST_LOG(biohist, "return %jd", bp->b_error, 0, 0, 0);
return bp->b_error;
}
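/*
 * Illustrative sketch (assumption): the common synchronous read shape,
 * starting I/O with VOP_STRATEGY() and collecting the result with
 * biowait(); error handling and buffer setup are simplified here.
 *
 *	bp->b_flags |= B_READ;
 *	VOP_STRATEGY(vp, bp);
 *	error = biowait(bp);
 */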
/*
* Mark I/O complete on a buffer.
*
* If a callback has been requested, e.g. the pageout
* daemon, do so. Otherwise, awaken waiting processes.
*
* [ Leffler, et al., says on p.247:
* "This routine wakes up the blocked process, frees the buffer
* for an asynchronous write, or, for a request by the pagedaemon
* process, invokes a procedure specified in the buffer structure" ]
*
* In real life, the pagedaemon (or other system processes) wants
* to do async stuff too, and doesn't want the buffer brelse()'d.
* (for swap pager, that puts swap buffers on the free lists (!!!),
* for the vn device, that puts allocated buffers on the free lists!)
*/
void
biodone(buf_t *bp)
{
int s;
BIOHIST_FUNC(__func__);
KASSERT(!ISSET(bp->b_oflags, BO_DONE));
if (cpu_intr_p()) {
/* From interrupt mode: defer to a soft interrupt. */
s = splvm();
TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq);
BIOHIST_CALLARGS(biohist, "bp=%#jx, softint scheduled",
(uintptr_t)bp, 0, 0, 0);
softint_schedule(biodone_sih);
splx(s);
} else {
/* Process now - the buffer may be freed soon. */
biodone2(bp);
}
}
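/*
 * Illustrative sketch (assumption): a driver completing a transfer
 * fills in b_resid and b_error before handing the buffer back through
 * biodone(); the waiter in biowait() (or the b_iodone callback) then
 * observes the result.
 *
 *	bp->b_resid = 0;		// everything transferred
 *	bp->b_error = 0;
 *	biodone(bp);
 */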
SDT_PROBE_DEFINE1(io, kernel, , done, "struct buf *"/*bp*/);
static void
biodone2(buf_t *bp)
{
void (*callout)(buf_t *);
SDT_PROBE1(io, kernel, ,done, bp);
BIOHIST_FUNC(__func__);
BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0);
mutex_enter(bp->b_objlock);
/* Note that the transfer is done. */
if (ISSET(bp->b_oflags, BO_DONE))
panic("biodone2 already");
CLR(bp->b_flags, B_COWDONE);
SET(bp->b_oflags, BO_DONE);
BIO_SETPRIO(bp, BPRIO_DEFAULT);
/* Wake up waiting writers. */
if (!ISSET(bp->b_flags, B_READ))
vwakeup(bp);
if ((callout = bp->b_iodone) != NULL) {
BIOHIST_LOG(biohist, "callout %#jx", (uintptr_t)callout,
0, 0, 0);
/* Note callout done, then call out. */
KASSERT(!cv_has_waiters(&bp->b_done));
bp->b_iodone = NULL;
mutex_exit(bp->b_objlock);
(*callout)(bp);
} else if (ISSET(bp->b_flags, B_ASYNC)) {
/* If async, release. */
BIOHIST_LOG(biohist, "async", 0, 0, 0, 0);
KASSERT(!cv_has_waiters(&bp->b_done));
mutex_exit(bp->b_objlock);
brelse(bp, 0);
} else {
/* Otherwise just wake up waiters in biowait(). */
BIOHIST_LOG(biohist, "wake-up", 0, 0, 0, 0);
cv_broadcast(&bp->b_done);
mutex_exit(bp->b_objlock);
}
}
static void
biointr(void *cookie)
{
struct cpu_info *ci;
buf_t *bp;
int s;
BIOHIST_FUNC(__func__);
BIOHIST_CALLED(biohist);
ci = curcpu();
s = splvm();
while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) {
KASSERT(curcpu() == ci);
bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone);
TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq);
splx(s);
BIOHIST_LOG(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0);
biodone2(bp);
s = splvm();
}
splx(s);
}
static void
sysctl_fillbuf(const buf_t *i, struct buf_sysctl *o)
{
const bool allowaddr = get_expose_address(curproc);
memset(o, 0, sizeof(*o));
o->b_flags = i->b_flags | i->b_cflags | i->b_oflags;
o->b_error = i->b_error;
o->b_prio = i->b_prio;
o->b_dev = i->b_dev;
o->b_bufsize = i->b_bufsize;
o->b_bcount = i->b_bcount;
o->b_resid = i->b_resid;
COND_SET_VALUE(o->b_addr, PTRTOUINT64(i->b_data), allowaddr);
o->b_blkno = i->b_blkno;
o->b_rawblkno = i->b_rawblkno;
COND_SET_VALUE(o->b_iodone, PTRTOUINT64(i->b_iodone), allowaddr);
COND_SET_VALUE(o->b_proc, PTRTOUINT64(i->b_proc), allowaddr);
COND_SET_VALUE(o->b_vp, PTRTOUINT64(i->b_vp), allowaddr);
COND_SET_VALUE(o->b_saveaddr, PTRTOUINT64(i->b_saveaddr), allowaddr);
o->b_lblkno = i->b_lblkno;
}
static int
sysctl_dobuf(SYSCTLFN_ARGS)
{
buf_t *bp;
struct buf_sysctl bs;
struct bqueue *bq;
char *dp;
u_int i, op, arg;
size_t len, needed, elem_size, out_size;
int error, elem_count, retries;
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
if (namelen != 4)
return (EINVAL);
retries = 100;
retry:
dp = oldp;
len = (oldp != NULL) ? *oldlenp : 0;
op = name[0];
arg = name[1];
elem_size = name[2];
elem_count = name[3];
out_size = MIN(sizeof(bs), elem_size);
/*
* at the moment, these are just "placeholders" to make the
* API for retrieving kern.buf data more extensible in the
* future.
*
* XXX kern.buf currently has "netbsd32" issues. hopefully
* these will be resolved at a later point.
*/
if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL ||
elem_size < 1 || elem_count < 0)
return (EINVAL);
if (oldp == NULL) {
/* count only, don't run through the buffer queues */
needed = pool_cache_nget(buf_cache) - pool_cache_nput(buf_cache);
*oldlenp = (needed + KERN_BUFSLOP) * elem_size;
return 0;
}
error = 0;
needed = 0;
sysctl_unlock();
mutex_enter(&bufcache_lock);
for (i = 0; i < BQUEUES; i++) {
bq = &bufqueues[i];
TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) {
bq->bq_marker = bp;
if (len >= elem_size && elem_count > 0) {
sysctl_fillbuf(bp, &bs);
mutex_exit(&bufcache_lock);
error = copyout(&bs, dp, out_size);
mutex_enter(&bufcache_lock);
if (error)
break;
if (bq->bq_marker != bp) {
/*
* This sysctl node is only for
* statistics. Retry; if the
* queue keeps changing, then
* bail out.
*/
if (retries-- == 0) {
error = EAGAIN;
break;
}
mutex_exit(&bufcache_lock);
sysctl_relock();
goto retry;
}
dp += elem_size;
len -= elem_size;
}
needed += elem_size;
if (elem_count > 0 && elem_count != INT_MAX)
elem_count--;
}
if (error != 0)
break;
}
mutex_exit(&bufcache_lock);
sysctl_relock();
*oldlenp = needed;
return (error);
}
static int
sysctl_bufvm_update(SYSCTLFN_ARGS)
{
int error, rv;
struct sysctlnode node;
unsigned int temp_bufcache;
unsigned long temp_water;
/* Take a copy of the supplied node and its data */
node = *rnode;
if (node.sysctl_data == &bufcache) {
node.sysctl_data = &temp_bufcache;
temp_bufcache = *(unsigned int *)rnode->sysctl_data;
} else {
node.sysctl_data = &temp_water;
temp_water = *(unsigned long *)rnode->sysctl_data;
}
/* Update the copy */
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return (error);
if (rnode->sysctl_data == &bufcache) {
if (temp_bufcache > 100)
return (EINVAL);
bufcache = temp_bufcache;
buf_setwm();
} else if (rnode->sysctl_data == &bufmem_lowater) {
if (bufmem_hiwater - temp_water < 16)
return (EINVAL);
bufmem_lowater = temp_water;
} else if (rnode->sysctl_data == &bufmem_hiwater) {
if (temp_water - bufmem_lowater < 16)
return (EINVAL);
bufmem_hiwater = temp_water;
} else
return (EINVAL);
/* Drain until below new high water mark */
sysctl_unlock();
mutex_enter(&bufcache_lock);
while (bufmem > bufmem_hiwater) {
rv = buf_drain((bufmem - bufmem_hiwater) / (2 * 1024));
if (rv <= 0)
break;
}
mutex_exit(&bufcache_lock);
sysctl_relock();
return 0;
}
static struct sysctllog *vfsbio_sysctllog;
static void
sysctl_kern_buf_setup(void)
{
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "buf",
SYSCTL_DESCR("Kernel buffer cache information"),
sysctl_dobuf, 0, NULL, 0,
CTL_KERN, KERN_BUF, CTL_EOL);
}
static void
sysctl_vm_buf_setup(void)
{
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "bufcache",
SYSCTL_DESCR("Percentage of physical memory to use for "
"buffer cache"),
sysctl_bufvm_update, 0, &bufcache, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_LONG, "bufmem",
SYSCTL_DESCR("Amount of kernel memory used by buffer "
"cache"),
NULL, 0, &bufmem, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "bufmem_lowater",
SYSCTL_DESCR("Minimum amount of kernel memory to "
"reserve for buffer cache"),
sysctl_bufvm_update, 0, &bufmem_lowater, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "bufmem_hiwater",
SYSCTL_DESCR("Maximum amount of kernel memory to use "
"for buffer cache"),
sysctl_bufvm_update, 0, &bufmem_hiwater, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
}
static int
bufhash_stats(struct hashstat_sysctl *hs, bool fill)
{
buf_t *bp;
uint64_t chain;
strlcpy(hs->hash_name, "bufhash", sizeof(hs->hash_name));
strlcpy(hs->hash_desc, "buffer hash", sizeof(hs->hash_desc));
if (!fill)
return 0;
hs->hash_size = bufhash + 1;
for (size_t i = 0; i < hs->hash_size; i++) {
chain = 0;
mutex_enter(&bufcache_lock);
LIST_FOREACH(bp, &bufhashtbl[i], b_hash) {
chain++;
}
mutex_exit(&bufcache_lock);
if (chain > 0) {
hs->hash_used++;
hs->hash_items += chain;
if (chain > hs->hash_maxchain)
hs->hash_maxchain = chain;
}
preempt_point();
}
return 0;
}
#ifdef DEBUG
/*
* Print out statistics on the current allocation of the buffer pool.
* Can be enabled to print out on every ``sync'' by setting "syncprt"
* in vfs_syscalls.c using sysctl.
*/
void
vfs_bufstats(void)
{
int i, j, count;
buf_t *bp;
struct bqueue *dp;
int counts[MAXBSIZE / MIN_PAGE_SIZE + 1];
static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" };
for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
count = 0;
memset(counts, 0, sizeof(counts));
TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) {
counts[bp->b_bufsize / PAGE_SIZE]++;
count++;
}
printf("%s: total-%d", bname[i], count);
for (j = 0; j <= MAXBSIZE / PAGE_SIZE; j++)
if (counts[j] != 0)
printf(", %d-%d", j * PAGE_SIZE, counts[j]);
printf("\n");
}
}
#endif /* DEBUG */
/* ------------------------------ */
buf_t *
getiobuf(struct vnode *vp, bool waitok)
{
buf_t *bp;
bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
if (bp == NULL)
return bp;
buf_init(bp);
if ((bp->b_vp = vp) != NULL) {
bp->b_objlock = vp->v_interlock;
} else {
KASSERT(bp->b_objlock == &buffer_lock);
}
return bp;
}
void
putiobuf(buf_t *bp)
{
buf_destroy(bp);
pool_cache_put(bufio_cache, bp);
}
/*
* nestiobuf_iodone: b_iodone callback for nested buffers.
*/
void
nestiobuf_iodone(buf_t *bp)
{
buf_t *mbp = bp->b_private;
int error;
int donebytes;
KASSERT(bp->b_bcount <= bp->b_bufsize);
KASSERT(mbp != bp);
error = bp->b_error;
if (bp->b_error == 0 &&
(bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) {
/*
* Not all got transferred, raise an error. We have no way to
* propagate these conditions to mbp.
*/
error = EIO;
}
donebytes = bp->b_bufsize;
putiobuf(bp);
nestiobuf_done(mbp, donebytes, error);
}
/*
* nestiobuf_setup: setup a "nested" buffer.
*
* => 'mbp' is a "master" buffer which is being divided into sub pieces.
* => 'bp' should be a buffer allocated by getiobuf.
* => 'offset' is a byte offset in the master buffer.
* => 'size' is a size in bytes of this nested buffer.
*/
void
nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
{
const int b_pass = mbp->b_flags & (B_READ|B_PHYS|B_RAW|B_MEDIA_FLAGS);
struct vnode *vp = mbp->b_vp;
KASSERT(mbp->b_bcount >= offset + size);
bp->b_vp = vp;
bp->b_dev = mbp->b_dev;
bp->b_objlock = mbp->b_objlock;
bp->b_cflags = BC_BUSY;
bp->b_flags = B_ASYNC | b_pass;
bp->b_iodone = nestiobuf_iodone;
bp->b_data = (char *)mbp->b_data + offset;
bp->b_resid = bp->b_bcount = size;
bp->b_bufsize = bp->b_bcount;
bp->b_private = mbp;
BIO_COPYPRIO(bp, mbp);
if (BUF_ISWRITE(bp) && vp != NULL) {
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
}
}
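/*
 * Illustrative sketch (assumption): splitting a master buffer into two
 * halves with getiobuf()/nestiobuf_setup(); each nested buffer is then
 * issued separately, and completion is propagated back to the master
 * by nestiobuf_iodone().
 *
 *	half = mbp->b_bcount / 2;
 *	bp1 = getiobuf(mbp->b_vp, true);
 *	bp2 = getiobuf(mbp->b_vp, true);
 *	nestiobuf_setup(mbp, bp1, 0, half);
 *	nestiobuf_setup(mbp, bp2, half, mbp->b_bcount - half);
 */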
/*
* nestiobuf_done: propagate completion to the master buffer.
*
* => 'donebytes' specifies how many bytes of 'mbp' have been completed.
* => 'error' is the errno(2) value with which those bytes completed.
*/
void
nestiobuf_done(buf_t *mbp, int donebytes, int error)
{
if (donebytes == 0) {
return;
}
mutex_enter(mbp->b_objlock);
KASSERT(mbp->b_resid >= donebytes);
mbp->b_resid -= donebytes;
if (error)
mbp->b_error = error;
if (mbp->b_resid == 0) {
if (mbp->b_error)
mbp->b_resid = mbp->b_bcount;
mutex_exit(mbp->b_objlock);
biodone(mbp);
} else
mutex_exit(mbp->b_objlock);
}
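/*
 * Illustrative sketch (assumption): a caller that decides not to issue
 * part of a master buffer can still account for it by calling
 * nestiobuf_done() directly with the skipped byte count and an error,
 * so that the master completes once every piece is accounted for.
 *
 *	nestiobuf_done(mbp, skipped_bytes, EIO);
 */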
void
buf_init(buf_t *bp)
{
cv_init(&bp->b_busy, "biolock");
cv_init(&bp->b_done, "biowait");
bp->b_dev = NODEV;
bp->b_error = 0;
bp->b_flags = 0;
bp->b_cflags = 0;
bp->b_oflags = 0;
bp->b_objlock = &buffer_lock;
bp->b_iodone = NULL;
bp->b_dev = NODEV;
bp->b_vnbufs.le_next = NOLIST;
BIO_SETPRIO(bp, BPRIO_DEFAULT);
}
void
buf_destroy(buf_t *bp)
{
cv_destroy(&bp->b_done);
cv_destroy(&bp->b_busy);
}
int
bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock)
{
int error;
KASSERT(mutex_owned(&bufcache_lock));
SDT_PROBE4(io, kernel, , bbusy__start, bp, intr, timo, interlock);
if ((bp->b_cflags & BC_BUSY) != 0) {
if (curlwp == uvm.pagedaemon_lwp) {
error = EDEADLK;
goto out;
}
bp->b_cflags |= BC_WANTED;
if (interlock != NULL)
mutex_exit(interlock);
if (intr) {
error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock,
timo);
} else {
error = cv_timedwait(&bp->b_busy, &bufcache_lock,
timo);
}
/*
* At this point the buffer may be gone: don't touch it
* again. The caller needs to find it again and retry.
*/
if (interlock != NULL)
mutex_enter(interlock);
if (error == 0)
error = EPASSTHROUGH;
} else {
bp->b_cflags |= BC_BUSY;
error = 0;
}
out:
SDT_PROBE5(io, kernel, , bbusy__done,
bp, intr, timo, interlock, error);
return error;
}
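/*
 * Illustrative sketch (assumption): the usual bbusy() retry pattern, as
 * used by getblk() and binvalbuf() above; EPASSTHROUGH means the buffer
 * may have changed identity while we slept, so it must be looked up
 * again under bufcache_lock.
 *
 *	mutex_enter(&bufcache_lock);
 * loop:
 *	bp = incore(vp, blkno);
 *	if (bp != NULL) {
 *		error = bbusy(bp, false, 0, NULL);
 *		if (error == EPASSTHROUGH)
 *			goto loop;
 *	}
 */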
/*
* Nothing outside this file should really need to know about nbuf,
* but a few things still want to read it, so give them a way to do that.
*/
u_int
buf_nbuf(void)
{
return nbuf;
}
/* $NetBSD: subr_localcount.c,v 1.7 2017/11/17 09:26:36 ozaki-r Exp $ */
/*-
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* CPU-local reference counts
*
* localcount(9) is a reference-counting scheme that involves no
* interprocessor synchronization most of the time, at the cost of
* eight bytes of memory per CPU per object and at the cost of
* expensive interprocessor synchronization to drain references.
*
* localcount(9) references may be held across sleeps, may be
* transferred from CPU to CPU or thread to thread: they behave
* semantically like typical reference counts, with different
* pragmatic performance characteristics.
*/
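/*
 * Illustrative sketch (assumption, not part of the original source):
 * the typical lifecycle of a localcount-protected object; the names
 * example_obj, example_lock and example_cv are hypothetical.
 *
 *	localcount_init(&example_obj->lc);
 *	...publish example_obj...
 *
 *	// readers, possibly under pserialize(9):
 *	localcount_acquire(&example_obj->lc);
 *	...use example_obj, may sleep...
 *	localcount_release(&example_obj->lc, &example_cv, &example_lock);
 *
 *	// teardown, after unpublishing example_obj:
 *	mutex_enter(&example_lock);
 *	localcount_drain(&example_obj->lc, &example_cv, &example_lock);
 *	mutex_exit(&example_lock);
 *	localcount_fini(&example_obj->lc);
 */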
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_localcount.c,v 1.7 2017/11/17 09:26:36 ozaki-r Exp $");
#include <sys/param.h>
#include <sys/localcount.h>
#include <sys/types.h>
#include <sys/condvar.h>
#include <sys/errno.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/xcall.h>
#if defined(DEBUG) && defined(LOCKDEBUG)
#include <sys/atomic.h>
#endif
static void localcount_xc(void *, void *);
/*
* localcount_init(lc)
*
* Initialize a localcount object; the per-CPU counters are
* allocated with percpu(9).
*
* The caller must call localcount_drain and then localcount_fini
* when done with lc.
*/
void
localcount_init(struct localcount *lc)
{
lc->lc_totalp = NULL;
lc->lc_percpu = percpu_alloc(sizeof(int64_t));
}
/*
* localcount_drain(lc, cv, interlock)
*
* Wait for all acquired references to lc to drain. Caller must
* hold interlock; localcount_drain releases it during cross-calls
* and waits on cv. The cv and interlock passed here must be the
* same as are passed to localcount_release for this lc.
*
* Caller must guarantee that no new references can be acquired
* with localcount_acquire before calling localcount_drain. For
* example, any object that may be found in a list and acquired
* must be removed from the list before localcount_drain.
*
* The localcount object lc may be used only with localcount_fini
* after this, unless reinitialized after localcount_fini with
* localcount_init.
*/
void
localcount_drain(struct localcount *lc, kcondvar_t *cv, kmutex_t *interlock)
{
int64_t total = 0;
KASSERT(mutex_owned(interlock));
KASSERT(lc->lc_totalp == NULL);
/* Mark it draining. */
lc->lc_totalp = &total;
/*
* Count up all references on all CPUs.
*
* This serves as a global memory barrier: after xc_wait, all
* CPUs will have witnessed the nonnull value of lc->lc_totalp,
* so that it is safe to wait on the cv for them.
*/
mutex_exit(interlock);
xc_wait(xc_broadcast(0, &localcount_xc, lc, interlock));
mutex_enter(interlock);
/* Wait for remaining references to drain. */
while (total != 0) {
/*
* At this point, now that we have added up all
* references on all CPUs, the total had better be
* nonnegative.
*/
KASSERTMSG((0 < total),
"negatively referenced localcount: %p, %"PRId64,
lc, total);
cv_wait(cv, interlock);
}
/* Paranoia: Cause any further use of lc->lc_totalp to crash. */
lc->lc_totalp = (void *)(uintptr_t)1;
}
/*
* localcount_fini(lc)
*
* Finalize a localcount object, releasing any memory allocated
* for it. The localcount object must already have been drained.
*/
void
localcount_fini(struct localcount *lc)
{
KASSERT(lc->lc_totalp == (void *)(uintptr_t)1);
percpu_free(lc->lc_percpu, sizeof(int64_t));
}
/*
* localcount_xc(cookie0, cookie1)
*
* Accumulate and transfer the per-CPU reference counts to a
* global total, resetting the per-CPU counter to zero. Once
* localcount_drain() has started, we only maintain the total
* count in localcount_release().
*/
static void
localcount_xc(void *cookie0, void *cookie1)
{
struct localcount *lc = cookie0;
kmutex_t *interlock = cookie1;
int64_t *localp;
mutex_enter(interlock);
localp = percpu_getref(lc->lc_percpu);
*lc->lc_totalp += *localp;
*localp -= *localp; /* ie, *localp = 0; */
percpu_putref(lc->lc_percpu);
mutex_exit(interlock);
}
/*
* localcount_adjust(lc, delta)
*
* Add delta -- positive or negative -- to the local CPU's count
* for lc.
*/
static void
localcount_adjust(struct localcount *lc, int delta)
{
int64_t *localp;
localp = percpu_getref(lc->lc_percpu);
*localp += delta;
percpu_putref(lc->lc_percpu);
}
/*
* localcount_acquire(lc)
*
* Acquire a reference to lc.
*
* The reference may be held across sleeps and may be migrated
* from CPU to CPU, or even thread to thread -- it is only
* counted, not associated with a particular concrete owner.
*
* Involves no interprocessor synchronization. May be used in any
* context: while a lock is held, within a pserialize(9) read
* section, in hard interrupt context (provided other users block
* hard interrupts), in soft interrupt context, in thread context,
* &c.
*
* Caller must guarantee that there is no concurrent
* localcount_drain. For example, any object that may be found in
* a list and acquired must be removed from the list before
* localcount_drain.
*/
void
localcount_acquire(struct localcount *lc)
{
KASSERT(lc->lc_totalp == NULL);
localcount_adjust(lc, +1);
#if defined(DEBUG) && defined(LOCKDEBUG)
if (atomic_inc_32_nv(&lc->lc_refcnt) == 0)
panic("counter overflow");
#endif
}
/*
* localcount_release(lc, cv, interlock)
*
* Release a reference to lc. If there is a concurrent
* localcount_drain and this may be the last reference, notify
* localcount_drain by acquiring interlock, waking cv, and
* releasing interlock. The cv and interlock passed here must be
* the same as are passed to localcount_drain for this lc.
*
* Involves no interprocessor synchronization unless there is a
* concurrent localcount_drain in progress.
*/
void
localcount_release(struct localcount *lc, kcondvar_t *cv, kmutex_t *interlock)
{
/*
* Block xcall so that if someone begins draining after we see
* lc->lc_totalp as null, then they won't start cv_wait until
* after they have counted this CPU's contributions.
*
* Otherwise, localcount_drain may notice an extant reference
* from this CPU and cv_wait for it, but having seen
* lc->lc_totalp as null, this CPU will not wake
* localcount_drain.
*/
kpreempt_disable();
KDASSERT(mutex_ownable(interlock));
if (__predict_false(lc->lc_totalp != NULL)) {
/*
* Slow path -- wake localcount_drain in case this is
* the last reference.
*/
mutex_enter(interlock);
if (--*lc->lc_totalp == 0)
cv_broadcast(cv);
mutex_exit(interlock);
goto out;
}
localcount_adjust(lc, -1);
#if defined(DEBUG) && defined(LOCKDEBUG)
if (atomic_dec_32_nv(&lc->lc_refcnt) == UINT_MAX)
panic("counter underflow");
#endif
out:
kpreempt_enable();
}
/*
* localcount_debug_refcnt(lc)
*
* Return the total reference count of lc. The value is meaningful
* only if DEBUG and LOCKDEBUG are enabled; otherwise it is always 0.
*/
uint32_t
localcount_debug_refcnt(const struct localcount *lc)
{
#if defined(DEBUG) && defined(LOCKDEBUG)
return lc->lc_refcnt;
#else
return 0;
#endif
}
/* $NetBSD: kern_exit.c,v 1.298 2023/10/08 12:38:58 ad Exp $ */
/*-
* Copyright (c) 1998, 1999, 2006, 2007, 2008, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_exit.c 8.10 (Berkeley) 2/23/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_exit.c,v 1.298 2023/10/08 12:38:58 ad Exp $");
#include "opt_ktrace.h"
#include "opt_dtrace.h"
#include "opt_sysv.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/wait.h>
#include <sys/file.h>
#include <sys/fstrans.h>
#include <sys/vnode.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/uidinfo.h>
#include <sys/ptrace.h>
#include <sys/acct.h>
#include <sys/filedesc.h>
#include <sys/ras.h>
#include <sys/signalvar.h>
#include <sys/sched.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/sleepq.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/ktrace.h>
#include <sys/cpu.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/sdt.h>
#include <sys/psref.h>
#include <uvm/uvm_extern.h>
#ifdef DEBUG_EXIT
int debug_exit = 0;
#define DPRINTF(x) if (debug_exit) printf x
#else
#define DPRINTF(x)
#endif
static int find_stopped_child(struct proc *, idtype_t, id_t, int,
struct proc **, struct wrusage *, siginfo_t *);
static void proc_free(struct proc *, struct wrusage *);
/*
* DTrace SDT provider definitions
*/
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE1(proc, kernel, , exit, "int");
/*
* Fill in the appropriate signal information, and signal the parent.
*/
/* XXX noclone works around a gcc 4.5 bug on arm */
static void __noclone
exit_psignal(struct proc *p, struct proc *pp, ksiginfo_t *ksi)
{
KSI_INIT(ksi);
if ((ksi->ksi_signo = P_EXITSIG(p)) == SIGCHLD) {
if (p->p_xsig) {
if (p->p_sflag & PS_COREDUMP)
ksi->ksi_code = CLD_DUMPED;
else
ksi->ksi_code = CLD_KILLED;
ksi->ksi_status = p->p_xsig;
} else {
ksi->ksi_code = CLD_EXITED;
ksi->ksi_status = p->p_xexit;
}
} else {
ksi->ksi_code = SI_USER;
ksi->ksi_status = p->p_xsig;
}
/*
* We fill those in, even for non-SIGCHLD.
* It's safe to access p->p_cred unlocked here.
*/
ksi->ksi_pid = p->p_pid;
ksi->ksi_uid = kauth_cred_geteuid(p->p_cred);
/* XXX: is this still valid? */
ksi->ksi_utime = p->p_stats->p_ru.ru_utime.tv_sec;
ksi->ksi_stime = p->p_stats->p_ru.ru_stime.tv_sec;
}
/*
* exit --
* Death of process.
*/
int
sys_exit(struct lwp *l, const struct sys_exit_args *uap, register_t *retval)
{
/* {
syscallarg(int) rval;
} */
struct proc *p = l->l_proc;
/* Don't call exit1() multiple times in the same process. */
mutex_enter(p->p_lock);
if (p->p_sflag & PS_WEXIT) {
mutex_exit(p->p_lock);
lwp_exit(l);
}
/* exit1() will release the mutex. */
exit1(l, SCARG(uap, rval), 0);
/* NOTREACHED */
return (0);
}
/*
* Exit: deallocate address space and other resources, change proc state
* to zombie, and unlink proc from allproc and parent's lists. Save exit
* status and rusage for wait(). Check for child processes and orphan them.
*
* Must be called with p->p_lock held. Does not return.
*/
void
exit1(struct lwp *l, int exitcode, int signo)
{
struct proc *p, *child, *next_child, *old_parent, *new_parent;
struct pgrp *pgrp;
ksiginfo_t ksi;
ksiginfoq_t kq;
int wakeinit;
p = l->l_proc;
/* Verify that we hold no locks other than p->p_lock. */
LOCKDEBUG_BARRIER(p->p_lock, 0);
/* XXX Temporary: something is leaking kernel_lock. */
KERNEL_UNLOCK_ALL(l, NULL);
KASSERT(mutex_owned(p->p_lock));
KASSERT(p->p_vmspace != NULL);
if (__predict_false(p == initproc)) {
panic("init died (signal %d, exit %d)", signo, exitcode);
}
p->p_sflag |= PS_WEXIT;
/*
* Force all other LWPs to exit before we do. Only then can we
* begin to tear down the rest of the process state.
*/
if (p->p_nlwps > 1) {
exit_lwps(l);
}
ksiginfo_queue_init(&kq);
/*
* If we have been asked to stop on exit, do so now.
*/
if (__predict_false(p->p_sflag & PS_STOPEXIT)) {
KASSERT(l->l_blcnt == 0);
sigclearall(p, &contsigmask, &kq);
if (!mutex_tryenter(&proc_lock)) {
mutex_exit(p->p_lock);
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
}
p->p_waited = 0;
p->p_pptr->p_nstopchild++;
p->p_stat = SSTOP;
mutex_exit(&proc_lock);
lwp_lock(l);
p->p_nrlwps--;
l->l_stat = LSSTOP;
lwp_unlock(l);
mutex_exit(p->p_lock);
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
mutex_enter(p->p_lock);
}
/*
* Bin any remaining signals and mark the process as dying so it will
* not be found for, e.g. signals.
*/
sigfillset(&p->p_sigctx.ps_sigignore);
sigclearall(p, NULL, &kq);
p->p_stat = SDYING;
/*
* Perform any required thread cleanup. Do this early so
* anyone wanting to look us up by our global thread ID
* will fail to find us.
*
* N.B. this will unlock p->p_lock on our behalf.
*/
lwp_thread_cleanup(l);
ksiginfo_queue_drain(&kq);
/* Destroy any lwpctl info. */
if (p->p_lwpctl != NULL)
lwp_ctl_exit();
/*
* Drain all remaining references that procfs, ptrace and others may
* have on the process.
*/
rw_enter(&p->p_reflock, RW_WRITER);
DPRINTF(("%s: %d.%d exiting.\n", __func__, p->p_pid, l->l_lid));
ptimers_free(p, TIMERS_ALL);
#if defined(__HAVE_RAS)
ras_purgeall();
#endif
/*
* Close open files, release open-file table and free signal
* actions. This may block!
*/
fd_free();
cwdfree(p->p_cwdi);
p->p_cwdi = NULL;
doexithooks(p);
sigactsfree(p->p_sigacts);
/*
* Write out accounting data.
*/
(void)acct_process(l);
#ifdef KTRACE
/*
* Release trace file.
*/
if (p->p_tracep != NULL) {
mutex_enter(&ktrace_lock);
ktrderef(p);
mutex_exit(&ktrace_lock);
}
#endif
p->p_xexit = exitcode;
p->p_xsig = signo;
/*
* If emulation has process exit hook, call it now.
* Set the exit status now so that the exit hook has
* an opportunity to tweak it (COMPAT_LINUX requires
* this for thread group emulation)
*/
if (p->p_emul->e_proc_exit)
(*p->p_emul->e_proc_exit)(p);
/*
* Free the VM resources we're still holding on to.
* We must do this from a valid thread because doing
* so may block. This frees vmspace, which we don't
* need anymore. The only remaining lwp is the one
* we run at this moment, nothing runs in userland
* anymore.
*/
ruspace(p); /* Update our vm resource use */
uvm_proc_exit(p);
/*
* Stop profiling.
*/
if (__predict_false((p->p_stflag & PST_PROFIL) != 0)) {
mutex_spin_enter(&p->p_stmutex);
stopprofclock(p);
mutex_spin_exit(&p->p_stmutex);
}
/*
* If parent is waiting for us to exit or exec, PL_PPWAIT is set; we
* wake up the parent early to avoid deadlock. We can do this once
* the VM resources are released.
*/
mutex_enter(&proc_lock);
if (p->p_lflag & PL_PPWAIT) {
lwp_t *lp;
l->l_lwpctl = NULL; /* was on loan from blocked parent */
p->p_lflag &= ~PL_PPWAIT;
lp = p->p_vforklwp;
p->p_vforklwp = NULL;
lp->l_vforkwaiting = false;
cv_broadcast(&lp->l_waitcv);
}
if (SESS_LEADER(p)) {
struct vnode *vprele = NULL, *vprevoke = NULL;
struct session *sp = p->p_session;
struct tty *tp;
if (sp->s_ttyvp) {
/*
* Controlling process.
* Signal foreground pgrp,
* drain controlling terminal
* and revoke access to controlling terminal.
*/
tp = sp->s_ttyp;
mutex_spin_enter(&tty_lock);
if (tp->t_session == sp) {
/* we can't guarantee the revoke will do this */
pgrp = tp->t_pgrp;
tp->t_pgrp = NULL;
tp->t_session = NULL;
mutex_spin_exit(&tty_lock);
if (pgrp != NULL) {
pgsignal(pgrp, SIGHUP, 1);
}
mutex_exit(&proc_lock);
(void) ttywait(tp);
mutex_enter(&proc_lock);
/* The tty could have been revoked. */
vprevoke = sp->s_ttyvp;
} else
mutex_spin_exit(&tty_lock);
vprele = sp->s_ttyvp;
sp->s_ttyvp = NULL;
/*
* s_ttyp is not zero'd; we use this to indicate
* that the session once had a controlling terminal.
* (for logging and informational purposes)
*/
}
sp->s_leader = NULL;
if (vprevoke != NULL || vprele != NULL) {
if (vprevoke != NULL) {
/* Releases proc_lock. */
proc_sessrele(sp);
VOP_REVOKE(vprevoke, REVOKEALL);
} else
mutex_exit(&proc_lock);
if (vprele != NULL)
vrele(vprele);
mutex_enter(&proc_lock);
}
}
fixjobc(p, p->p_pgrp, 0);
/* Release fstrans private data. */
fstrans_lwp_dtor(l);
/*
* Finalize the last LWP's specificdata, as well as the
* specificdata for the proc itself.
*/
lwp_finispecific(l);
proc_finispecific(p);
/*
* Reset p_opptr pointer of all former children which got
* traced by another process and were reparented. We reset
* it to NULL here; the trace detach code then reparents
* the child to initproc. We only check allproc list, since
* eventual former children on zombproc list won't reference
* p_opptr anymore.
*/
if (__predict_false(p->p_slflag & PSL_CHTRACED)) {
struct proc *q;
PROCLIST_FOREACH(q, &allproc) {
if (q->p_opptr == p)
q->p_opptr = NULL;
}
PROCLIST_FOREACH(q, &zombproc) {
if (q->p_opptr == p)
q->p_opptr = NULL;
}
}
/*
* Give orphaned children to init(8).
*/
child = LIST_FIRST(&p->p_children);
wakeinit = (child != NULL);
for (; child != NULL; child = next_child) {
next_child = LIST_NEXT(child, p_sibling);
/*
* Traced processes are killed since their existence
* means someone is screwing up. Since we reset the
* trace flags, the logic in sys_wait4() would not be
* triggered to reparent the process to its
* original parent, so we must do this here.
*/
if (__predict_false(child->p_slflag & PSL_TRACED)) {
mutex_enter(p->p_lock);
child->p_slflag &=
~(PSL_TRACED|PSL_SYSCALL);
mutex_exit(p->p_lock);
if (child->p_opptr != child->p_pptr) {
struct proc *t = child->p_opptr;
proc_reparent(child, t ? t : initproc);
child->p_opptr = NULL;
} else
proc_reparent(child, initproc);
killproc(child, "orphaned traced process");
} else
proc_reparent(child, initproc);
}
/*
* Move proc from allproc to zombproc, it's now nearly ready to be
* collected by parent.
*/
LIST_REMOVE(l, l_list);
LIST_REMOVE(p, p_list);
LIST_INSERT_HEAD(&zombproc, p, p_list);
/*
* Mark the process as dead. We must do this before we signal
* the parent.
*/
p->p_stat = SDEAD;
/*
* Let anyone watching this DTrace probe know what we're
* on our way out.
*/
SDT_PROBE(proc, kernel, , exit,
((p->p_sflag & PS_COREDUMP) ? CLD_DUMPED :
(p->p_xsig ? CLD_KILLED : CLD_EXITED)),
0,0,0,0);
/* Put in front of parent's sibling list for parent to collect it */
old_parent = p->p_pptr;
old_parent->p_nstopchild++;
if (LIST_FIRST(&old_parent->p_children) != p) {
/* Put child where it can be found quickly */
LIST_REMOVE(p, p_sibling);
LIST_INSERT_HEAD(&old_parent->p_children, p, p_sibling);
}
/*
* Notify parent that we're gone. If the parent has PK_NOCLDWAIT
* or PK_CLDSIGIGN set, notify init instead (and hope it will handle
* this situation).
*/
if (old_parent->p_flag & (PK_NOCLDWAIT|PK_CLDSIGIGN)) {
proc_reparent(p, initproc);
wakeinit = 1;
/*
* If this was the last child of our parent, notify
* parent, so in case he was wait(2)ing, he will
* continue.
*/
if (LIST_FIRST(&old_parent->p_children) == NULL)
cv_broadcast(&old_parent->p_waitcv);
}
/* Reload parent pointer, since p may have been reparented above */
new_parent = p->p_pptr;
if (__predict_false(p->p_exitsig != 0)) {
exit_psignal(p, new_parent, &ksi);
kpsignal(new_parent, &ksi, NULL);
}
/* Calculate the final rusage info. */
calcru(p, &p->p_stats->p_ru.ru_utime, &p->p_stats->p_ru.ru_stime,
NULL, NULL);
callout_destroy(&l->l_timeout_ch);
/*
* Release any PCU resources before becoming a zombie.
*/
pcu_discard_all(l);
/*
* Notify other processes tracking us with a knote that
* we're exiting.
*
* N.B. we do this here because the process is now SDEAD,
* and thus cannot have any more knotes attached. Also,
* knote_proc_exit() expects that p->p_lock is already
* held (and will assert so).
*/
mutex_enter(p->p_lock);
if (!SLIST_EMPTY(&p->p_klist)) {
knote_proc_exit(p);
}
/* Free the LWP ID */
proc_free_lwpid(p, l->l_lid);
lwp_drainrefs(l);
lwp_lock(l);
l->l_prflag &= ~LPR_DETACHED;
l->l_stat = LSZOMB;
lwp_unlock(l);
KASSERT(curlwp == l);
KASSERT(p->p_nrlwps == 1);
KASSERT(p->p_nlwps == 1);
p->p_stat = SZOMB;
p->p_nrlwps--;
p->p_nzlwps++;
p->p_ndlwps = 0;
mutex_exit(p->p_lock);
/*
* Signal the parent to collect us, and drop the proclist lock.
* Drop debugger/procfs lock; no new references can be gained.
*/
rw_exit(&p->p_reflock);
cv_broadcast(&p->p_pptr->p_waitcv);
mutex_exit(&proc_lock);
if (wakeinit)
cv_broadcast(&initproc->p_waitcv);
/*
* NOTE: WE ARE NO LONGER ALLOWED TO SLEEP!
*/
/*
* Give machine-dependent code a chance to free any MD LWP
* resources. This must be done before uvm_lwp_exit(), in
* case these resources are in the PCB.
*/
cpu_lwp_free(l, 1);
/* Switch away into oblivion. */
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
panic("exit1");
}
void
exit_lwps(struct lwp *l)
{
proc_t *p = l->l_proc;
lwp_t *l2;
retry:
KASSERT(mutex_owned(p->p_lock));
/*
* Interrupt LWPs in interruptible sleep, unsuspend suspended
* LWPs and then wait for everyone else to finish.
*/
LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
if (l2 == l)
continue;
lwp_lock(l2);
l2->l_flag |= LW_WEXIT;
lwp_need_userret(l2);
if ((l2->l_stat == LSSLEEP && (l2->l_flag & LW_SINTR)) ||
l2->l_stat == LSSUSPENDED || l2->l_stat == LSSTOP) {
l2->l_flag &= ~LW_DBGSUSPEND;
/* setrunnable() will release the lock. */
setrunnable(l2);
continue;
}
lwp_unlock(l2);
}
/*
* Wait for every LWP to exit. Note: LWPs can get suspended/slept
* behind us or there may even be new LWPs created. Therefore, a
* full retry is required on error.
*/
while (p->p_nlwps > 1) {
if (lwp_wait(l, 0, NULL, true)) {
goto retry;
}
}
KASSERT(p->p_nlwps == 1);
}
int
do_sys_waitid(idtype_t idtype, id_t id, int *pid, int *status, int options,
struct wrusage *wru, siginfo_t *si)
{
proc_t *child;
int error;
if (wru != NULL)
memset(wru, 0, sizeof(*wru));
if (si != NULL)
memset(si, 0, sizeof(*si));
mutex_enter(&proc_lock);
error = find_stopped_child(curproc, idtype, id, options, &child,
wru, si);
if (child == NULL) {
mutex_exit(&proc_lock);
*pid = 0;
*status = 0;
return error;
}
*pid = child->p_pid;
if (child->p_stat == SZOMB) {
/* Child is exiting */
*status = P_WAITSTATUS(child);
/* proc_free() will release the proc_lock. */
if (options & WNOWAIT) {
mutex_exit(&proc_lock);
} else {
proc_free(child, wru);
}
} else {
/* Don't mark SIGCONT if we are being stopped */
*status = (child->p_xsig == SIGCONT && child->p_stat != SSTOP) ?
W_CONTCODE() : W_STOPCODE(child->p_xsig);
mutex_exit(&proc_lock);
}
return 0;
}
int
do_sys_wait(int *pid, int *status, int options, struct rusage *ru)
{
idtype_t idtype;
id_t id;
int ret;
struct wrusage wru;
/*
* Translate the special pid values into the (idtype, pid)
* pair for wait6. The WAIT_MYPGRP case is handled by
* find_stopped_child() on its own.
*/
if (*pid == WAIT_ANY) {
idtype = P_ALL;
id = 0;
} else if (*pid < 0) {
idtype = P_PGID;
id = (id_t)-*pid;
} else {
idtype = P_PID;
id = (id_t)*pid;
}
options |= WEXITED | WTRAPPED;
ret = do_sys_waitid(idtype, id, pid, status, options, ru ? &wru : NULL,
NULL);
if (ru)
*ru = wru.wru_self;
return ret;
}
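/*
 * Illustrative examples (assumption) of the pid -> (idtype, id)
 * translation performed above:
 *
 *	waitpid(-1, &status, 0)		->	idtype = P_ALL,  id = 0
 *	waitpid(123, &status, 0)	->	idtype = P_PID,  id = 123
 *	waitpid(-123, &status, 0)	->	idtype = P_PGID, id = 123
 */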
int
sys___wait450(struct lwp *l, const struct sys___wait450_args *uap,
register_t *retval)
{
/* {
syscallarg(int) pid;
syscallarg(int *) status;
syscallarg(int) options;
syscallarg(struct rusage *) rusage;
} */
int error, status, pid = SCARG(uap, pid);
struct rusage ru;
error = do_sys_wait(&pid, &status, SCARG(uap, options),
SCARG(uap, rusage) != NULL ? &ru : NULL);
retval[0] = pid;
if (pid == 0) {
return error;
}
if (SCARG(uap, status)) {
error = copyout(&status, SCARG(uap, status), sizeof(status));
}
if (SCARG(uap, rusage) && error == 0) {
error = copyout(&ru, SCARG(uap, rusage), sizeof(ru));
}
return error;
}
int
sys_wait6(struct lwp *l, const struct sys_wait6_args *uap, register_t *retval)
{
/* {
syscallarg(idtype_t) idtype;
syscallarg(id_t) id;
syscallarg(int *) status;
syscallarg(int) options;
syscallarg(struct wrusage *) wru;
syscallarg(siginfo_t *) info;
} */
struct wrusage wru, *wrup;
siginfo_t si, *sip;
idtype_t idtype;
int pid;
id_t id;
int error, status;
idtype = SCARG(uap, idtype);
id = SCARG(uap, id);
if (SCARG(uap, wru) != NULL)
wrup = &wru;
else
wrup = NULL;
if (SCARG(uap, info) != NULL)
sip = &si;
else
sip = NULL;
/*
* We expect all callers of wait6() to know about WEXITED and
* WTRAPPED.
*/
error = do_sys_waitid(idtype, id, &pid, &status, SCARG(uap, options),
wrup, sip);
retval[0] = pid; /* tell userland who it was */
#if 0
/*
* should we copyout if there was no process, hence no useful data?
* We don't for an old style wait4() (etc) but I believe
* FreeBSD does for wait6(), so a tossup... Go with FreeBSD for now.
*/
if (pid == 0)
return error;
#endif
if (SCARG(uap, status) != NULL && error == 0)
error = copyout(&status, SCARG(uap, status), sizeof(status));
if (SCARG(uap, wru) != NULL && error == 0)
error = copyout(&wru, SCARG(uap, wru), sizeof(wru));
if (SCARG(uap, info) != NULL && error == 0)
error = copyout(&si, SCARG(uap, info), sizeof(si));
return error;
}
/*
* Find a process that matches the provided criteria, and fill siginfo
* and resources if found.
* Returns:
* -1: Not found, abort early
* 0: Not matched
* 1: Matched, there might be more matches
* 2: This is the only match
*/
static int
match_process(const struct proc *pp, struct proc **q, idtype_t idtype, id_t id,
int options, struct wrusage *wrusage, siginfo_t *siginfo)
{
struct rusage *rup;
struct proc *p = *q;
int rv = 1;
switch (idtype) {
case P_ALL:
mutex_enter(p->p_lock);
break;
case P_PID:
if (p->p_pid != (pid_t)id) {
p = *q = proc_find_raw((pid_t)id);
if (p == NULL || p->p_stat == SIDL || p->p_pptr != pp) {
*q = NULL;
return -1;
}
}
mutex_enter(p->p_lock);
rv++;
break;
case P_PGID:
if (p->p_pgid != (pid_t)id)
return 0;
mutex_enter(p->p_lock);
break;
case P_SID:
if (p->p_session->s_sid != (pid_t)id)
return 0;
mutex_enter(p->p_lock);
break;
case P_UID:
mutex_enter(p->p_lock);
if (kauth_cred_geteuid(p->p_cred) != (uid_t)id) {
mutex_exit(p->p_lock);
return 0;
}
break;
case P_GID:
mutex_enter(p->p_lock);
if (kauth_cred_getegid(p->p_cred) != (gid_t)id) {
mutex_exit(p->p_lock);
return 0;
}
break;
case P_CID:
case P_PSETID:
case P_CPUID:
/* XXX: Implement me */
default:
return 0;
}
if ((options & WEXITED) == 0 && p->p_stat == SZOMB) {
mutex_exit(p->p_lock);
return 0;
}
if (siginfo != NULL) {
siginfo->si_errno = 0;
/*
* SUSv4 requires that the si_signo value is always
* SIGCHLD. Obey it even though the rfork(2) interface
* allows requesting a different signal for child exit
* notification.
*/
siginfo->si_signo = SIGCHLD;
/*
* This is still a rough estimate. We will fix the
* cases TRAPPED, STOPPED, and CONTINUED later.
*/
if (p->p_sflag & PS_COREDUMP) {
siginfo->si_code = CLD_DUMPED;
siginfo->si_status = p->p_xsig;
} else if (p->p_xsig) {
siginfo->si_code = CLD_KILLED;
siginfo->si_status = p->p_xsig;
} else {
siginfo->si_code = CLD_EXITED;
siginfo->si_status = p->p_xexit;
}
siginfo->si_pid = p->p_pid;
siginfo->si_uid = kauth_cred_geteuid(p->p_cred);
siginfo->si_utime = p->p_stats->p_ru.ru_utime.tv_sec;
siginfo->si_stime = p->p_stats->p_ru.ru_stime.tv_sec;
}
/*
* There should be no reason to limit resources usage info to
* exited processes only. A snapshot about any resources used
* by a stopped process may be exactly what is needed.
*/
if (wrusage != NULL) {
rup = &wrusage->wru_self;
*rup = p->p_stats->p_ru;
calcru(p, &rup->ru_utime, &rup->ru_stime, NULL, NULL);
rup = &wrusage->wru_children;
*rup = p->p_stats->p_cru;
calcru(p, &rup->ru_utime, &rup->ru_stime, NULL, NULL);
}
mutex_exit(p->p_lock);
return rv;
}
/*
* Determine if there are existing processes being debugged
* that used to be (and sometime later will be again) children
* of a specific parent (while matching wait criteria)
*/
static bool
debugged_child_exists(idtype_t idtype, id_t id, int options, siginfo_t *si,
const struct proc *parent)
{
struct proc *pp;
/*
* If we are searching for a specific pid, we can optimise a little
*/
if (idtype == P_PID) {
/*
* Check the specific process to see if its real parent is us
*/
pp = proc_find_raw((pid_t)id);
if (pp != NULL && pp->p_stat != SIDL && pp->p_opptr == parent) {
/*
* using P_ALL here avoids match_process() doing the
* same work that we just did, but incorrectly for
* this scenario.
*/
if (match_process(parent, &pp, P_ALL, id, options,
NULL, si))
return true;
}
return false;
}
/*
* For the hard cases, just look everywhere to see if some
* stolen (reparented) process is really our lost child.
* Then check if that process could satisfy the wait conditions.
*/
/*
* XXX inefficient, but hopefully fairly rare.
* XXX should really use a list of reparented processes.
*/
PROCLIST_FOREACH(pp, &allproc) {
if (pp->p_stat == SIDL) /* XXX impossible ?? */
continue;
if (pp->p_opptr == parent &&
match_process(parent, &pp, idtype, id, options, NULL, si))
return true;
}
PROCLIST_FOREACH(pp, &zombproc) {
if (pp->p_stat == SIDL) /* XXX impossible ?? */
continue;
if (pp->p_opptr == parent &&
match_process(parent, &pp, idtype, id, options, NULL, si))
return true;
}
return false;
}
/*
* Scan list of child processes for a child process that has stopped or
* exited. Used by sys_wait4 and 'compat' equivalents.
*
* Must be called with the proc_lock held, and may release while waiting.
*/
static int
find_stopped_child(struct proc *parent, idtype_t idtype, id_t id, int options,
struct proc **child_p, struct wrusage *wru, siginfo_t *si)
{
struct proc *child, *dead;
int error;
KASSERT(mutex_owned(&proc_lock));
if (options & ~WALLOPTS) {
*child_p = NULL;
return EINVAL;
}
if ((options & WSELECTOPTS) == 0) {
/*
* We will be unable to find any matching processes,
* because there are no known events to look for.
* Prefer to return error instead of blocking
* indefinitely.
*/
*child_p = NULL;
return EINVAL;
}
if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) {
id = (id_t)parent->p_pgid;
idtype = P_PGID;
}
for (;;) {
error = ECHILD;
dead = NULL;
LIST_FOREACH(child, &parent->p_children, p_sibling) {
int rv = match_process(parent, &child, idtype, id,
options, wru, si);
if (rv == -1)
break;
if (rv == 0)
continue;
/*
* Wait for processes with p_exitsig != SIGCHLD
* processes only if WALTSIG is set; wait for
* processes with p_exitsig == SIGCHLD only
* if WALTSIG is clear.
*/
if (((options & WALLSIG) == 0) &&
(options & WALTSIG ? child->p_exitsig == SIGCHLD
: P_EXITSIG(child) != SIGCHLD)) {
if (rv == 2) {
child = NULL;
break;
}
continue;
}
error = 0;
if ((options & WNOZOMBIE) == 0) {
if (child->p_stat == SZOMB)
break;
if (child->p_stat == SDEAD) {
/*
* We may occasionally arrive here
* after receiving a signal, but
* immediately before the child
* process is zombified. The wait
* will be short, so avoid returning
* to userspace.
*/
dead = child;
}
}
if ((options & WCONTINUED) != 0 &&
child->p_xsig == SIGCONT &&
(child->p_sflag & PS_CONTINUED)) {
if ((options & WNOWAIT) == 0) {
child->p_sflag &= ~PS_CONTINUED;
child->p_waited = 1;
parent->p_nstopchild--;
}
if (si) {
si->si_status = child->p_xsig;
si->si_code = CLD_CONTINUED;
}
break;
}
if ((options & (WTRAPPED|WSTOPPED)) != 0 &&
child->p_stat == SSTOP && child->p_waited == 0 &&
((child->p_slflag & PSL_TRACED) ||
options & (WUNTRACED|WSTOPPED))) {
if ((options & WNOWAIT) == 0) {
child->p_waited = 1;
parent->p_nstopchild--;
}
if (si) {
si->si_status = child->p_xsig;
si->si_code =
(child->p_slflag & PSL_TRACED) ?
CLD_TRAPPED : CLD_STOPPED;
}
break;
}
if (parent->p_nstopchild == 0 || rv == 2) {
child = NULL;
break;
}
}
/*
* If we found nothing, but we are the bereaved parent
* of a stolen child, look and see if that child (or
* one of them) meets our search criteria. If so, then
* we cannot succeed, but we can hang (wait...),
* or if WNOHANG, return 0 instead of ECHILD
*/
if (child == NULL && error == ECHILD &&
(parent->p_slflag & PSL_CHTRACED) &&
debugged_child_exists(idtype, id, options, si, parent))
error = 0;
if (child != NULL || error != 0 ||
((options & WNOHANG) != 0 && dead == NULL)) {
*child_p = child;
return error;
}
/*
* Wait for another child process to stop.
*/
error = cv_wait_sig(&parent->p_waitcv, &proc_lock);
if (error != 0) {
*child_p = NULL;
return error;
}
}
}
/*
* Free a process after parent has taken all the state info. Must be called
* with the proclist lock held, and will release before returning.
*
* *ru is returned to the caller, and must be freed by the caller.
*/
static void
proc_free(struct proc *p, struct wrusage *wru)
{
struct proc *parent = p->p_pptr;
struct lwp *l;
ksiginfo_t ksi;
kauth_cred_t cred1, cred2;
uid_t uid;
KASSERT(mutex_owned(&proc_lock));
KASSERT(p->p_nlwps == 1);
KASSERT(p->p_nzlwps == 1);
KASSERT(p->p_nrlwps == 0);
KASSERT(p->p_stat == SZOMB);
/*
* If we got the child via ptrace(2) or procfs, and
* the parent is different (meaning the process was
* attached, rather than run as a child), then we need
* to give it back to the old parent, and send the
* parent the exit signal. The rest of the cleanup
* will be done when the old parent waits on the child.
*/
if ((p->p_slflag & PSL_TRACED) != 0 && p->p_opptr != parent) {
mutex_enter(p->p_lock);
p->p_slflag &= ~(PSL_TRACED|PSL_SYSCALL);
mutex_exit(p->p_lock);
parent = (p->p_opptr == NULL) ? initproc : p->p_opptr;
proc_reparent(p, parent);
p->p_opptr = NULL;
if (p->p_exitsig != 0) {
exit_psignal(p, parent, &ksi);
kpsignal(parent, &ksi, NULL);
}
cv_broadcast(&parent->p_waitcv);
mutex_exit(&proc_lock);
return;
}
sched_proc_exit(parent, p);
/*
* Add child times of exiting process onto its own times.
* This cannot be done any earlier else it might get done twice.
*/
l = LIST_FIRST(&p->p_lwps);
ruadd(&p->p_stats->p_ru, &l->l_ru);
ruadd(&p->p_stats->p_ru, &p->p_stats->p_cru);
ruadd(&parent->p_stats->p_cru, &p->p_stats->p_ru);
if (wru != NULL) {
wru->wru_self = p->p_stats->p_ru;
wru->wru_children = p->p_stats->p_cru;
}
p->p_xsig = 0;
p->p_xexit = 0;
/*
* At this point we are going to start freeing the final resources.
* If anyone tries to access the proc structure after here they will
* get a shock - bits are missing. Attempt to make it hard! We
* don't bother with any further locking past this point.
*/
p->p_stat = SIDL; /* not even a zombie any more */
LIST_REMOVE(p, p_list); /* off zombproc */
parent->p_nstopchild--;
LIST_REMOVE(p, p_sibling);
/*
* Let pid be reallocated.
*/
proc_free_pid(p->p_pid);
atomic_dec_uint(&nprocs);
/*
* Unlink process from its process group.
* Releases the proc_lock.
*/
proc_leavepgrp(p);
/*
* Delay release until after lwp_free.
*/
cred2 = l->l_cred;
/*
* Free the last LWP's resources.
*
* lwp_free ensures the LWP is no longer running on another CPU.
*/
lwp_free(l, false, true);
/*
* Now no one except us can reach the process p.
*/
/*
* Decrement the count of procs running with this uid.
*/
cred1 = p->p_cred;
uid = kauth_cred_getuid(cred1);
(void)chgproccnt(uid, -1);
/*
* Release substructures.
*/
lim_free(p->p_limit);
pstatsfree(p->p_stats);
kauth_cred_free(cred1);
kauth_cred_free(cred2);
/*
* Release reference to text vnode
*/
if (p->p_textvp)
vrele(p->p_textvp);
kmem_strfree(p->p_path);
mutex_destroy(&p->p_auxlock);
mutex_obj_free(p->p_lock);
mutex_destroy(&p->p_stmutex);
cv_destroy(&p->p_waitcv);
cv_destroy(&p->p_lwpcv);
rw_destroy(&p->p_reflock);
proc_free_mem(p);
}
/*
* Change the parent of a process for tracing purposes.
*/
void
proc_changeparent(struct proc *t, struct proc *p)
{
SET(t->p_slflag, PSL_TRACED);
t->p_opptr = t->p_pptr;
if (t->p_pptr == p)
return;
struct proc *parent = t->p_pptr;
if (parent->p_lock < t->p_lock) {
if (!mutex_tryenter(parent->p_lock)) {
mutex_exit(t->p_lock);
mutex_enter(parent->p_lock);
mutex_enter(t->p_lock);
}
} else if (parent->p_lock > t->p_lock) {
mutex_enter(parent->p_lock);
}
parent->p_slflag |= PSL_CHTRACED;
proc_reparent(t, p);
if (parent->p_lock != t->p_lock)
mutex_exit(parent->p_lock);
}
/*
* make process 'parent' the new parent of process 'child'.
*
* Must be called with proc_lock held.
*/
void
proc_reparent(struct proc *child, struct proc *parent)
{
KASSERT(mutex_owned(&proc_lock));
if (child->p_pptr == parent)
return;
if (child->p_stat == SZOMB || child->p_stat == SDEAD ||
(child->p_stat == SSTOP && !child->p_waited)) {
child->p_pptr->p_nstopchild--;
parent->p_nstopchild++;
}
if (parent == initproc) {
child->p_exitsig = SIGCHLD;
child->p_ppid = parent->p_pid;
}
LIST_REMOVE(child, p_sibling);
LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
child->p_pptr = parent;
}
/* $NetBSD: umap_vnops.c,v 1.62 2021/10/20 03:08:18 thorpej Exp $ */
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* the UCLA Ficus project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)umap_vnops.c 8.6 (Berkeley) 5/22/95
*/
/*
* Umap Layer
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: umap_vnops.c,v 1.62 2021/10/20 03:08:18 thorpej Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/buf.h>
#include <sys/kauth.h>
#include <miscfs/umapfs/umap.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/layer_extern.h>
/*
* Note: If the LAYERFS_MBYPASSDEBUG flag is set, it is possible
* that the debug printing will bomb out, because kauth routines
* do not handle NOCRED or FSCRED like other credentials and end
* up dereferencing an inappropriate pointer.
*
* That should be fixed in kauth rather than here.
*/
int umap_lookup(void *);
int umap_getattr(void *);
int umap_print(void *);
int umap_rename(void *);
/*
* Global vfs data structures
*/
/*
* XXX - strategy, bwrite are hand coded currently. They should
* go away with a merged buffer/block cache.
*
*/
int (**umap_vnodeop_p)(void *);
const struct vnodeopv_entry_desc umap_vnodeop_entries[] = {
{ &vop_default_desc, umap_bypass },
{ &vop_lookup_desc, umap_lookup },
{ &vop_getattr_desc, umap_getattr },
{ &vop_print_desc, umap_print },
{ &vop_rename_desc, umap_rename },
{ &vop_fsync_desc, layer_fsync },
{ &vop_inactive_desc, layer_inactive },
{ &vop_reclaim_desc, layer_reclaim },
{ &vop_open_desc, layer_open },
{ &vop_close_desc, layer_close },
{ &vop_setattr_desc, layer_setattr },
{ &vop_access_desc, layer_access },
{ &vop_accessx_desc, genfs_accessx },
{ &vop_remove_desc, layer_remove },
{ &vop_revoke_desc, layer_revoke },
{ &vop_rmdir_desc, layer_rmdir },
{ &vop_bmap_desc, layer_bmap },
{ &vop_getpages_desc, layer_getpages },
{ &vop_putpages_desc, layer_putpages },
{ NULL, NULL }
};
const struct vnodeopv_desc umapfs_vnodeop_opv_desc =
{ &umap_vnodeop_p, umap_vnodeop_entries };
/*
* This is the 08-June-1999 bypass routine.
* See layer_vnops.c:layer_bypass for more details.
*/
int
umap_bypass(void *v)
{
struct vop_generic_args /* {
struct vnodeop_desc *a_desc;
<other random data follows, presumably>
} */ *ap = v;
int (**our_vnodeop_p)(void *);
kauth_cred_t *credpp = NULL, credp = 0;
kauth_cred_t savecredp = 0, savecompcredp = 0;
kauth_cred_t compcredp = 0;
struct vnode **this_vp_p;
int error;
struct vnode *old_vps[VDESC_MAX_VPS], *vp0;
struct vnode **vps_p[VDESC_MAX_VPS];
struct vnode ***vppp;
struct vnodeop_desc *descp = ap->a_desc;
int reles, i, flags;
struct componentname **compnamepp = 0;
#ifdef DIAGNOSTIC
/*
* We require at least one vp.
*/
if (descp->vdesc_vp_offsets == NULL ||
descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET)
panic("%s: no vp's in map.\n", __func__);
#endif
vps_p[0] =
VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap);
vp0 = *vps_p[0];
flags = MOUNTTOUMAPMOUNT(vp0->v_mount)->umapm_flags;
our_vnodeop_p = vp0->v_op;
if (flags & LAYERFS_MBYPASSDEBUG) printf("%s: %s\n", __func__, descp->vdesc_name);
/*
* Map the vnodes going in.
* Later, we'll invoke the operation based on
* the first mapped vnode's operation vector.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
vps_p[i] = this_vp_p =
VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[i],
ap);
/*
* We're not guaranteed that any but the first vnode
* are of our type. Check for and don't map any
* that aren't. (We must always map first vp or vclean fails.)
*/
if (i && (*this_vp_p == NULL ||
(*this_vp_p)->v_op != our_vnodeop_p)) {
old_vps[i] = NULL;
} else {
old_vps[i] = *this_vp_p;
*(vps_p[i]) = UMAPVPTOLOWERVP(*this_vp_p);
/*
* XXX - Several operations have the side effect
* of vrele'ing their vp's. We must account for
* that. (This should go away in the future.)
*/
if (reles & VDESC_VP0_WILLRELE)
vref(*this_vp_p);
}
}
/*
* Fix the credentials. (That's the purpose of this layer.)
*/
if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) {
credpp = VOPARG_OFFSETTO(kauth_cred_t*,
descp->vdesc_cred_offset, ap);
/* Save old values */
savecredp = *credpp;
if (savecredp != NOCRED && savecredp != FSCRED)
*credpp = kauth_cred_dup(savecredp);
credp = *credpp;
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(credp) != 0)
printf("umap_bypass: user was %d, group %d\n",
kauth_cred_geteuid(credp), kauth_cred_getegid(credp));
/* Map all ids in the credential structure. */
umap_mapids(vp0->v_mount, credp);
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(credp) != 0)
printf("umap_bypass: user now %d, group %d\n",
kauth_cred_geteuid(credp), kauth_cred_getegid(credp));
}
/* BSD often keeps a credential in the componentname structure
* for speed. If there is one, it better get mapped, too.
*/
if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) {
compnamepp = VOPARG_OFFSETTO(struct componentname**,
descp->vdesc_componentname_offset, ap);
savecompcredp = (*compnamepp)->cn_cred;
if (savecompcredp != NOCRED && savecompcredp != FSCRED)
(*compnamepp)->cn_cred = kauth_cred_dup(savecompcredp);
compcredp = (*compnamepp)->cn_cred;
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_bypass: component credit user was %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
/* Map all ids in the credential structure. */
umap_mapids(vp0->v_mount, compcredp);
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_bypass: component credit user now %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
}
/*
* Call the operation on the lower layer
* with the modified argument structure.
*/
error = VCALL(*vps_p[0], descp->vdesc_offset, ap);
/*
* Maintain the illusion of call-by-value
* by restoring vnodes in the argument structure
* to their original value.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
if (old_vps[i]) {
*(vps_p[i]) = old_vps[i];
if (reles & VDESC_VP0_WILLRELE)
vrele(*(vps_p[i]));
}
}
/*
* Map the possible out-going vpp
* (Assumes that the lower layer always returns
* a VREF'ed vpp unless it gets an error.)
*/
if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && !error) {
vppp = VOPARG_OFFSETTO(struct vnode***,
descp->vdesc_vpp_offset, ap);
/*
* Only vop_lookup, vop_create, vop_makedir, vop_mknod
* and vop_symlink return vpp's. vop_lookup doesn't call bypass
* as a lookup on "." would generate a locking error.
* So all the calls which get us here have an unlocked vpp. :-)
*/
error = layer_node_create(old_vps[0]->v_mount, **vppp, *vppp);
if (error) {
vrele(**vppp);
**vppp = NULL;
}
}
/*
* Free duplicate cred structure and restore old one.
*/
if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) {
if ((flags & LAYERFS_MBYPASSDEBUG) && credp &&
kauth_cred_geteuid(credp) != 0)
printf("umap_bypass: returning-user was %d\n",
kauth_cred_geteuid(credp));
if (savecredp != NOCRED && savecredp != FSCRED && credpp) {
kauth_cred_free(credp);
*credpp = savecredp;
if ((flags & LAYERFS_MBYPASSDEBUG) && credpp &&
kauth_cred_geteuid(*credpp) != 0)
printf("umap_bypass: returning-user now %d\n\n",
kauth_cred_geteuid(savecredp));
}
}
if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) {
if ((flags & LAYERFS_MBYPASSDEBUG) && compcredp &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_bypass: returning-component-user was %d\n",
kauth_cred_geteuid(compcredp));
if (savecompcredp != NOCRED && savecompcredp != FSCRED) {
kauth_cred_free(compcredp);
(*compnamepp)->cn_cred = savecompcredp;
if ((flags & LAYERFS_MBYPASSDEBUG) && savecompcredp &&
kauth_cred_geteuid(savecompcredp) != 0)
printf("umap_bypass: returning-component-user now %d\n",
kauth_cred_geteuid(savecompcredp));
}
}
return (error);
}
/*
* This is based on the 08-June-1999 bypass routine.
* See layer_vnops.c:layer_bypass for more details.
*/
int
umap_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnodeop_desc *a_desc;
struct vnode * a_dvp;
struct vnode ** a_vpp;
struct componentname * a_cnp;
} */ *ap = v;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t savecompcredp = NULL;
kauth_cred_t compcredp = NULL;
struct vnode *dvp, *vp, *ldvp;
struct mount *mp;
int error;
int flags, cnf = cnp->cn_flags;
dvp = ap->a_dvp;
mp = dvp->v_mount;
if ((cnf & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
return (EROFS);
flags = MOUNTTOUMAPMOUNT(mp)->umapm_flags;
ldvp = UMAPVPTOLOWERVP(dvp);
if (flags & LAYERFS_MBYPASSDEBUG) printf("umap_lookup\n");
/*
* Fix the credentials. (That's the purpose of this layer.)
*
* BSD often keeps a credential in the componentname structure
* for speed. If there is one, it better get mapped, too.
*/
if ((savecompcredp = cnp->cn_cred)) {
compcredp = kauth_cred_dup(savecompcredp);
cnp->cn_cred = compcredp;
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_lookup: component credit user was %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
/* Map all ids in the credential structure. */
umap_mapids(mp, compcredp);
}
if ((flags & LAYERFS_MBYPASSDEBUG) && compcredp &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_lookup: component credit user now %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
ap->a_dvp = ldvp;
error = VCALL(ldvp, ap->a_desc->vdesc_offset, ap);
vp = *ap->a_vpp;
*ap->a_vpp = NULL;
if (error == EJUSTRETURN && (cnf & ISLASTCN) &&
(dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
error = EROFS;
/* Do locking fixup as appropriate. See layer_lookup() for info */
if (ldvp == vp) {
*ap->a_vpp = dvp;
vref(dvp);
vrele(vp);
} else if (vp != NULL) {
error = layer_node_create(mp, vp, ap->a_vpp);
if (error) {
vrele(vp);
}
}
/*
* Free duplicate cred structure and restore old one.
*/
if ((flags & LAYERFS_MBYPASSDEBUG) && compcredp &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_lookup: returning-component-user was %d\n",
kauth_cred_geteuid(compcredp));
if (savecompcredp != NOCRED && savecompcredp != FSCRED) {
if (compcredp)
kauth_cred_free(compcredp);
cnp->cn_cred = savecompcredp;
if ((flags & LAYERFS_MBYPASSDEBUG) && savecompcredp &&
kauth_cred_geteuid(savecompcredp) != 0)
printf("umap_lookup: returning-component-user now %d\n",
kauth_cred_geteuid(savecompcredp));
}
return (error);
}
/*
* We handle getattr to change the fsid.
*/
int
umap_getattr(void *v)
{
struct vop_getattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
uid_t uid;
gid_t gid;
int error, tmpid, nentries, gnentries, flags;
u_long (*mapdata)[2];
u_long (*gmapdata)[2];
struct vnode **vp1p;
const struct vnodeop_desc *descp = ap->a_desc;
if ((error = umap_bypass(ap)) != 0)
return (error);
/* Requires that arguments be restored. */
ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0];
flags = MOUNTTOUMAPMOUNT(ap->a_vp->v_mount)->umapm_flags;
/*
* Umap needs to map the uid and gid returned by a stat
* into the proper values for this site. This involves
* finding the returned uid in the mapping information,
* translating it into the uid on the other end,
* and filling in the proper field in the vattr
* structure pointed to by ap->a_vap. The group
* is easier, since currently all groups will be
* translated to the NULLGROUP.
*/
/* Find entry in map */
uid = ap->a_vap->va_uid;
gid = ap->a_vap->va_gid;
if ((flags & LAYERFS_MBYPASSDEBUG)) printf("umap_getattr: mapped uid = %d, mapped gid = %d\n", uid,
gid);
vp1p = VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap);
nentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_nentries;
mapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_mapdata);
gnentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gnentries;
gmapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gmapdata);
/* Reverse map the uid for the vnode. Since it's a reverse
map, we can't use umap_mapids() to do it. */
tmpid = umap_reverse_findid(uid, mapdata, nentries);
if (tmpid != -1) {
ap->a_vap->va_uid = (uid_t) tmpid;
if ((flags & LAYERFS_MBYPASSDEBUG)) printf("umap_getattr: original uid = %d\n", uid);
} else
ap->a_vap->va_uid = (uid_t) NOBODY;
/* Reverse map the gid for the vnode. */
tmpid = umap_reverse_findid(gid, gmapdata, gnentries);
if (tmpid != -1) {
ap->a_vap->va_gid = (gid_t) tmpid;
if ((flags & LAYERFS_MBYPASSDEBUG)) printf("umap_getattr: original gid = %d\n", gid);
} else
ap->a_vap->va_gid = (gid_t) NULLGROUP;
return (0);
}
int
umap_print(void *v)
{
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
printf("\ttag VT_UMAPFS, vp=%p, lowervp=%p\n", vp,
UMAPVPTOLOWERVP(vp));
return (0);
}
int
umap_rename(void *v)
{
struct vop_rename_args /* {
struct vnode *a_fdvp;
struct vnode *a_fvp;
struct componentname *a_fcnp;
struct vnode *a_tdvp;
struct vnode *a_tvp;
struct componentname *a_tcnp;
} */ *ap = v;
int error, flags;
struct componentname *compnamep;
kauth_cred_t compcredp, savecompcredp;
struct vnode *vp;
struct vnode *tvp;
/*
* Rename is irregular, having two componentname structures.
* We need to map the cred in the second structure,
* and then bypass takes care of the rest.
*/
vp = ap->a_fdvp;
flags = MOUNTTOUMAPMOUNT(vp->v_mount)->umapm_flags;
compnamep = ap->a_tcnp;
compcredp = compnamep->cn_cred;
savecompcredp = compcredp;
compcredp = compnamep->cn_cred = kauth_cred_dup(savecompcredp);
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_rename: rename component credit user was %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
/* Map all ids in the credential structure. */
umap_mapids(vp->v_mount, compcredp);
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_rename: rename component credit user now %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
tvp = ap->a_tvp;
if (tvp) {
if (tvp->v_mount != vp->v_mount)
tvp = NULL;
else
vref(tvp);
}
error = umap_bypass(ap);
if (tvp) {
if (error == 0)
VTOLAYER(tvp)->layer_flags |= LAYERFS_REMOVED;
vrele(tvp);
}
/* Restore the additional mapped componentname cred structure. */
kauth_cred_free(compcredp);
compnamep->cn_cred = savecompcredp;
return error;
}
/* $NetBSD: vnode.h,v 1.304 2022/10/26 23:40:30 riastradh Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vnode.h 8.17 (Berkeley) 5/20/95
*/
#ifndef _SYS_VNODE_H_
#define _SYS_VNODE_H_
#include <sys/event.h>
#include <sys/queue.h>
#include <sys/condvar.h>
#include <sys/rwlock.h>
#include <sys/mutex.h>
#include <sys/time.h>
#include <sys/acl.h>
/* XXX: clean up includes later */
#include <uvm/uvm_param.h> /* XXX */
#if defined(_KERNEL) || defined(_KMEMUSER)
#include <uvm/uvm_pglist.h> /* XXX */
#include <uvm/uvm_object.h> /* XXX */
#include <uvm/uvm_extern.h> /* XXX */
struct uvm_ractx;
#endif
/*
* The vnode is the focus of all file activity in UNIX. There is a
* unique vnode allocated for each active file, each current directory,
* each mounted-on file, text file, and the root.
*/
/*
* Vnode types. VNON means no type.
*/
enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD };
#define VNODE_TYPES \
"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"
/*
* Vnode tag types.
* These are for the benefit of external programs only (e.g., pstat)
* and should NEVER be inspected by the kernel.
*/
enum vtagtype {
VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_MSDOSFS, VT_LFS, VT_LOFS,
VT_FDESC, VT_PORTAL, VT_NULL, VT_UMAP, VT_KERNFS, VT_PROCFS,
VT_AFS, VT_ISOFS, VT_UNION, VT_ADOSFS, VT_EXT2FS, VT_CODA,
VT_FILECORE, VT_NTFS, VT_VFS, VT_OVERLAY, VT_SMBFS, VT_PTYFS,
VT_TMPFS, VT_UDF, VT_SYSVBFS, VT_PUFFS, VT_HFS, VT_EFS, VT_ZFS,
VT_RUMP, VT_NILFS, VT_V7FS, VT_CHFS, VT_AUTOFS
};
#define VNODE_TAGS \
"VT_NON", "VT_UFS", "VT_NFS", "VT_MFS", "VT_MSDOSFS", "VT_LFS", "VT_LOFS", \
"VT_FDESC", "VT_PORTAL", "VT_NULL", "VT_UMAP", "VT_KERNFS", "VT_PROCFS", \
"VT_AFS", "VT_ISOFS", "VT_UNION", "VT_ADOSFS", "VT_EXT2FS", "VT_CODA", \
"VT_FILECORE", "VT_NTFS", "VT_VFS", "VT_OVERLAY", "VT_SMBFS", "VT_PTYFS", \
"VT_TMPFS", "VT_UDF", "VT_SYSVBFS", "VT_PUFFS", "VT_HFS", "VT_EFS", \
"VT_ZFS", "VT_RUMP", "VT_NILFS", "VT_V7FS", "VT_CHFS", "VT_AUTOFS"
#if defined(_KERNEL) || defined(_KMEMUSER)
struct vnode;
struct buf;
LIST_HEAD(buflists, buf);
/*
* Reading or writing any of these items requires holding the appropriate
* lock. Field markings and the corresponding locks:
*
* - stable, reference to the vnode is required
* b bufcache_lock
* e exec_lock
* f vnode_free_list_lock, or vrele_lock for vrele_list
* i v_interlock
* i+b v_interlock + bufcache_lock to modify, either to inspect
* i+u v_interlock + v_uobj.vmobjlock to modify, either to inspect
* k locked by underlying filesystem (maybe kernel_lock)
* u v_uobj.vmobjlock
* v vnode lock
*
* Each underlying filesystem allocates its own private area and hangs
* it from v_data.
*/
struct vnode {
/*
* VM system related items.
*/
struct uvm_object v_uobj; /* u the VM object */
voff_t v_size; /* i+u size of file */
voff_t v_writesize; /* i+u new size after write */
/*
* Unstable items get their own cache line.
* On _LP64 this fills the space nicely.
*/
kcondvar_t v_cv /* i synchronization */
__aligned(COHERENCY_UNIT);
int v_iflag; /* i+u VI_* flags */
int v_uflag; /* k VU_* flags */
int v_usecount; /* i reference count */
int v_numoutput; /* i # of pending writes */
int v_writecount; /* i ref count of writers */
int v_holdcnt; /* i page & buffer refs */
struct buflists v_cleanblkhd; /* i+b clean blocklist head */
struct buflists v_dirtyblkhd; /* i+b dirty blocklist head */
/*
* The remaining items are largely stable.
*/
int v_vflag /* v VV_* flags */
__aligned(COHERENCY_UNIT);
kmutex_t *v_interlock; /* - vnode interlock */
struct mount *v_mount; /* v ptr to vfs we are in */
int (**v_op)(void *); /* : vnode operations vector */
union {
struct mount *vu_mountedhere;/* v ptr to vfs (VDIR) */
struct socket *vu_socket; /* v unix ipc (VSOCK) */
struct specnode *vu_specnode; /* v device (VCHR, VBLK) */
struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */
struct uvm_ractx *vu_ractx; /* u read-ahead ctx (VREG) */
} v_un;
enum vtype v_type; /* - vnode type */
enum vtagtype v_tag; /* - type of underlying data */
void *v_data; /* - private data for fs */
struct vnode_klist *v_klist; /* i kevent / knote info */
void *v_segvguard; /* e for PAX_SEGVGUARD */
};
#define v_mountedhere v_un.vu_mountedhere
#define v_socket v_un.vu_socket
#define v_specnode v_un.vu_specnode
#define v_fifoinfo v_un.vu_fifoinfo
#define v_ractx v_un.vu_ractx
typedef struct vnode vnode_t;
/*
* Structure that encompasses the kevent state for a vnode. This is
* carved out as a separate structure because some vnodes may share
* this state with one another.
*
* N.B. if two vnodes share a vnode_klist, then they must also share
* v_interlock.
*/
struct vnode_klist {
struct klist vk_klist; /* i notes attached to vnode */
long vk_interest; /* i what the notes are interested in */
};
#endif
/*
* Vnode flags. The first set are locked by vnode lock or are stable.
* VSYSTEM is only used to skip vflush()ing quota files. VISTTY is used
* when reading dead vnodes.
*/
#define VV_ROOT 0x00000001 /* root of its file system */
#define VV_SYSTEM 0x00000002 /* vnode being used by kernel */
#define VV_ISTTY 0x00000004 /* vnode represents a tty */
#define VV_MAPPED 0x00000008 /* vnode might have user mappings */
#define VV_MPSAFE 0x00000010 /* file system code is MP safe */
/*
* The second set are locked by vp->v_interlock. VI_TEXT and VI_EXECMAP are
* typically updated with vp->v_uobj.vmobjlock also held as the VM system
* uses them for accounting purposes.
*/
#define VI_TEXT 0x00000100 /* vnode is a pure text prototype */
#define VI_EXECMAP 0x00000200 /* might have PROT_EXEC mappings */
#define VI_WRMAP 0x00000400 /* might have PROT_WRITE u. mappings */
#define VI_PAGES 0x00000800 /* UVM object has >0 pages */
#define VI_ONWORKLST 0x00004000 /* On syncer work-list */
#define VI_DEADCHECK 0x00008000 /* UVM: need to call vdead_check() */
/*
* The third set are locked by the underlying file system.
*/
#define VU_DIROP 0x01000000 /* LFS: involved in a directory op */
#define VNODE_FLAGBITS \
"\20\1ROOT\2SYSTEM\3ISTTY\4MAPPED\5MPSAFE\11TEXT\12EXECMAP" \
"\13WRMAP\14PAGES\17ONWORKLST\20DEADCHECK\31DIROP"
#define VSIZENOTSET ((voff_t)-1)
/*
* vnode lock flags
*/
#define LK_NONE 0x00000000 /* no lock - for VOP_ISLOCKED() */
#define LK_SHARED 0x00000001 /* shared lock */
#define LK_EXCLUSIVE 0x00000002 /* exclusive lock */
#define LK_UPGRADE 0x00000010 /* upgrade shared -> exclusive */
#define LK_DOWNGRADE 0x00000020 /* downgrade exclusive -> shared */
#define LK_NOWAIT 0x00000100 /* do not sleep to await lock */
#define LK_RETRY 0x00000200 /* vn_lock: retry until locked */
/*
* Vnode attributes. A field value of VNOVAL represents a field whose value
* is unavailable (getattr) or which is not to be changed (setattr).
*/
struct vattr {
enum vtype va_type; /* vnode type (for create) */
mode_t va_mode; /* file's access mode and type */
nlink_t va_nlink; /* number of references to file */
uid_t va_uid; /* owner user id */
gid_t va_gid; /* owner group id */
dev_t va_fsid; /* file system id (dev for now) */
ino_t va_fileid; /* file id */
u_quad_t va_size; /* file size in bytes */
long va_blocksize; /* blocksize preferred for i/o */
struct timespec va_atime; /* time of last access */
struct timespec va_mtime; /* time of last modification */
struct timespec va_ctime; /* time file changed */
struct timespec va_birthtime; /* time file created */
u_long va_gen; /* generation number of file */
u_long va_flags; /* flags defined for file */
dev_t va_rdev; /* device the special file represents */
u_quad_t va_bytes; /* bytes of disk space held by file */
u_quad_t va_filerev; /* file modification number */
unsigned int va_vaflags; /* operations flags, see below */
long va_spare; /* remain quad aligned */
};
/*
* Flags for va_vaflags.
*/
#define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */
#define VA_EXCLUSIVE 0x02 /* exclusive create request */
#ifdef _KERNEL
/*
* Flags for ioflag.
*/
#define IO_UNIT 0x00010 /* do I/O as atomic unit */
#define IO_APPEND 0x00020 /* append write to end */
#define IO_SYNC (0x40|IO_DSYNC) /* sync I/O file integrity completion */
#define IO_NODELOCKED 0x00080 /* underlying node already locked */
#define IO_NDELAY 0x00100 /* FNDELAY flag set in file table */
#define IO_DSYNC 0x00200 /* sync I/O data integrity completion */
#define IO_ALTSEMANTICS 0x00400 /* use alternate i/o semantics */
#define IO_NORMAL 0x00800 /* operate on regular data */
#define IO_EXT 0x01000 /* operate on extended attributes */
#define IO_DIRECT 0x02000 /* direct I/O hint */
#define IO_JOURNALLOCKED 0x04000 /* journal is already locked */
#define IO_ADV_MASK 0x00003 /* access pattern hint */
#define IO_ADV_SHIFT 0
#define IO_ADV_ENCODE(adv) (((adv) << IO_ADV_SHIFT) & IO_ADV_MASK)
#define IO_ADV_DECODE(ioflag) (((ioflag) & IO_ADV_MASK) >> IO_ADV_SHIFT)
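/*
 * Illustrative example (not part of the original header): an access
 * pattern hint (e.g. a POSIX_FADV_*-style advice constant, assumed to
 * be defined elsewhere) is packed into the low bits of the ioflag word
 * and recovered again with
 *
 *	ioflag = IO_UNIT | IO_ADV_ENCODE(advice);
 *	advice = IO_ADV_DECODE(ioflag);
 */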
/*
* Flags for accmode_t.
*/
#define VEXEC 000000000100 /* execute/search permission */
#define VWRITE 000000000200 /* write permission */
#define VREAD 000000000400 /* read permission */
#define VADMIN 000000010000 /* being the file owner */
#define VAPPEND 000000040000 /* permission to write/append */
/*
* VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only
* if permission was denied explicitly, by a "deny" rule in NFSv4 ACL,
* and 0 otherwise. This never happens with ordinary unix access rights
* or POSIX.1e ACLs. Obviously, VEXPLICIT_DENY must be OR-ed with
* some other V* constant.
*/
#define VEXPLICIT_DENY 000000100000
#define VREAD_NAMED_ATTRS 000000200000 /* not used */
#define VWRITE_NAMED_ATTRS 000000400000 /* not used */
#define VDELETE_CHILD 000001000000
#define VREAD_ATTRIBUTES 000002000000 /* permission to stat(2) */
#define VWRITE_ATTRIBUTES 000004000000 /* change {m,c,a}time */
#define VDELETE 000010000000
#define VREAD_ACL 000020000000 /* read ACL and file mode */
#define VWRITE_ACL 000040000000 /* change ACL and/or file mode */
#define VWRITE_OWNER 000100000000 /* change file owner */
#define VSYNCHRONIZE 000200000000 /* not used */
#define VCREAT 000400000000 /* creating new file */
#define VVERIFY 001000000000 /* verification required */
#define __VNODE_PERM_BITS \
"\10" \
"\07VEXEC" \
"\10VWRITE" \
"\11VREAD" \
"\15VADMIN" \
"\17VAPPEND" \
"\20VEXPLICIT_DENY" \
"\21VREAD_NAMED_ATTRS" \
"\22VWRITE_NAMED_ATTRS" \
"\23VDELETE_CHILD" \
"\24VREAD_ATTRIBUTES" \
"\25VWRITE_ATTRIBUTES" \
"\26VDELETE" \
"\27VREAD_ACL" \
"\30VWRITE_ACL" \
"\31VWRITE_OWNER" \
"\32VSYNCHRONIZE" \
"\33VCREAT" \
"\34VVERIFY"
/*
* Permissions that were traditionally granted only to the file owner.
*/
#define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \
VWRITE_OWNER)
/*
* Permissions that were traditionally granted to everyone.
*/
#define VSTAT_PERMS (VREAD_ATTRIBUTES | VREAD_ACL)
/*
* Permissions that allow to change the state of the file in any way.
*/
#define VMODIFY_PERMS (VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \
VDELETE)
/*
* Token indicating no attribute value yet assigned.
*/
#define VNOVAL (-1)
#define VNOVALSIZE ((u_quad_t)-1)
#define VNOVALFLAGS ((u_long)-1)
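/*
 * Illustrative sketch (not part of the original header): a typical
 * setattr-style caller first marks every attribute "unchanged" and then
 * fills in only the fields it wants to modify, e.g.
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);			(sets every field to VNOVAL)
 *	va.va_mode = 0644;			(change only the mode)
 *	error = VOP_SETATTR(vp, &va, cred);	(vp locked by the caller)
 */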
/*
* Convert between vnode types and inode formats (since POSIX.1
* defines mode word of stat structure in terms of inode formats).
*/
extern const enum vtype iftovt_tab[];
extern const int vttoif_tab[];
#define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12])
#define VTTOIF(indx) (vttoif_tab[(int)(indx)])
#define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode))
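/*
 * Illustrative examples (not part of the original header), assuming the
 * usual <sys/stat.h> S_IF* constants:
 *
 *	IFTOVT(S_IFDIR) == VDIR
 *	VTTOIF(VREG) == S_IFREG
 *	MAKEIMODE(VREG, 0644) == (S_IFREG | 0644)
 */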
/*
* Flags to various vnode functions.
*/
#define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */
#define FORCECLOSE 0x0002 /* vflush: force file closure */
#define WRITECLOSE 0x0004 /* vflush: only close writable files */
#define V_SAVE 0x0001 /* vinvalbuf: sync file first */
/*
* Flags to various vnode operations.
*/
#define REVOKEALL 0x0001 /* revoke: revoke all aliases */
#define FSYNC_WAIT 0x0001 /* fsync: wait for completion */
#define FSYNC_DATAONLY 0x0002 /* fsync: hint: sync file data only */
#define FSYNC_RECLAIM 0x0004 /* fsync: hint: vnode is being reclaimed */
#define FSYNC_LAZY 0x0008 /* fsync: lazy sync (trickle) */
#define FSYNC_NOLOG 0x0010 /* fsync: do not flush the log */
#define FSYNC_CACHE 0x0100 /* fsync: flush disk caches too */
#define UPDATE_WAIT 0x0001 /* update: wait for completion */
#define UPDATE_DIROP 0x0002 /* update: hint to fs to wait or not */
#define UPDATE_CLOSE 0x0004 /* update: clean up on close */
#define VDEAD_NOWAIT 0x0001 /* vdead_check: do not sleep */
void holdrelel(struct vnode *);
void holdrele(struct vnode *);
void vholdl(struct vnode *);
void vhold(struct vnode *);
void vref(struct vnode *);
#define NULLVP ((struct vnode *)NULL)
/*
* Macro to determine kevent interest on a vnode.
*/
#define _VN_KEVENT_INTEREST(vp, n) \
(((vp)->v_klist->vk_interest & (n)) != 0)
static inline bool
VN_KEVENT_INTEREST(struct vnode *vp, long hint)
{
mutex_enter(vp->v_interlock);
bool rv = _VN_KEVENT_INTEREST(vp, hint);
mutex_exit(vp->v_interlock);
return rv;
}
static inline void
VN_KNOTE(struct vnode *vp, long hint)
{
mutex_enter(vp->v_interlock);
if (__predict_false(_VN_KEVENT_INTEREST(vp, hint))) {
knote(&vp->v_klist->vk_klist, hint);
}
mutex_exit(vp->v_interlock);
}
void vn_knote_attach(struct vnode *, struct knote *);
void vn_knote_detach(struct vnode *, struct knote *);
/*
* Global vnode data.
*/
extern struct vnode *rootvnode; /* root (i.e. "/") vnode */
extern int desiredvnodes; /* number of vnodes desired */
extern unsigned int numvnodes; /* current number of vnodes */
#endif /* _KERNEL */
/*
* Mods for extensibility.
*/
/*
* Flags for vdesc_flags:
*/
#define VDESC_MAX_VPS 8
/* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
#define VDESC_VP0_WILLRELE 0x00000001
#define VDESC_VP1_WILLRELE 0x00000002
#define VDESC_VP2_WILLRELE 0x00000004
#define VDESC_VP3_WILLRELE 0x00000008
#define VDESC_VP0_WILLPUT 0x00000101
#define VDESC_VP1_WILLPUT 0x00000202
#define VDESC_VP2_WILLPUT 0x00000404
#define VDESC_VP3_WILLPUT 0x00000808
/*
* VDESC_NO_OFFSET is used to identify the end of the offset list
* and in places where no such field exists.
*/
#define VDESC_NO_OFFSET -1
/*
* This structure describes the vnode operation taking place.
*/
struct vnodeop_desc {
int vdesc_offset; /* offset in vector--first for speed */
const char *vdesc_name; /* a readable name for debugging */
int vdesc_flags; /* VDESC_* flags */
/*
* These ops are used by bypass routines to map and locate arguments.
* Creds and procs are not needed in bypass routines, but sometimes
* they are useful to (for example) transport layers.
* Nameidata is useful because it has a cred in it.
*/
const int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */
int vdesc_vpp_offset; /* return vpp location */
int vdesc_cred_offset; /* cred location, if any */
int vdesc_componentname_offset; /* if any */
};
#ifdef _KERNEL
extern const struct vnodeop_desc * const vfs_op_descs[];
/*
* Union filesystem hook for vn_readdir().
*/
extern int (*vn_union_readdir_hook) (struct vnode **, struct file *, struct lwp *);
/*
* Macros for offsets in the vdesc struct.
*/
#define VOPARG_OFFSETOF(type, member) offsetof(type, member)
#define VOPARG_OFFSETTO(type,offset,sp) ((type)(((char *)(sp)) + (offset)))
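/*
 * Illustrative sketch (not part of the original header): a bypass
 * routine can locate the first vnode argument of a generic argument
 * structure through its descriptor, e.g.
 *
 *	struct vnode **vpp = VOPARG_OFFSETTO(struct vnode **,
 *	    descp->vdesc_vp_offsets[0], ap);
 *
 * i.e. the entries of vdesc_vp_offsets are byte offsets into *ap.
 */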
/*
* This structure is used to configure the new vnodeops vector.
*/
struct vnodeopv_entry_desc {
const struct vnodeop_desc *opve_op; /* which operation this is */
int (*opve_impl)(void *); /* code implementing this operation */
};
struct vnodeopv_desc {
/* ptr to the ptr to the vector where op should go */
int (***opv_desc_vector_p)(void *);
const struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */
};
/*
* A default routine which just returns an error.
*/
int vn_default_error(void *);
/*
* A generic structure.
* This can be used by bypass routines to identify generic arguments.
*/
struct vop_generic_args {
struct vnodeop_desc *a_desc;
/* other random data follows, presumably */
};
/*
* VOCALL calls an op given an ops vector. We break it out because BSD's
* vclean changes the ops vector and then wants to call ops with the old
* vector.
*/
/*
* actually, vclean doesn't use it anymore, but nfs does,
* for device specials and fifos.
*/
#define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP))
/*
* This call works for vnodes in the kernel.
*/
#define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP))
#define VDESC(OP) (& __CONCAT(OP,_desc))
#define VOFFSET(OP) (VDESC(OP)->vdesc_offset)
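/*
 * Illustrative sketch (not part of the original header): a layered file
 * system's bypass routine typically forwards an operation to the lower
 * vnode with the caller's own argument structure, e.g.
 *
 *	error = VCALL(lowervp, ap->a_desc->vdesc_offset, ap);
 *
 * which looks up slot vdesc_offset in lowervp->v_op and invokes it with
 * ap (see umap_bypass() above for a full example).
 */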
/* XXX This include should go away */
#include <sys/mount.h>
/*
* Finally, include the default set of vnode operations.
*/
#include <sys/vnode_if.h>
/*
* Public vnode manipulation functions.
*/
struct file;
struct filedesc;
struct nameidata;
struct pathbuf;
struct proc;
struct stat;
struct uio;
struct vattr;
struct vnode;
/* see vnode(9) */
void vfs_vnode_sysinit(void);
int bdevvp(dev_t, struct vnode **);
int cdevvp(dev_t, struct vnode **);
void vattr_null(struct vattr *);
void vdevgone(int, int, int, enum vtype);
int vfinddev(dev_t, enum vtype, struct vnode **);
int vflush(struct mount *, struct vnode *, int);
int vflushbuf(struct vnode *, int);
void vgone(struct vnode *);
int vinvalbuf(struct vnode *, int, kauth_cred_t, struct lwp *, bool, int);
void vprint(const char *, struct vnode *);
void vput(struct vnode *);
bool vrecycle(struct vnode *);
void vrele(struct vnode *);
void vrele_async(struct vnode *);
void vrele_flush(struct mount *);
int vtruncbuf(struct vnode *, daddr_t, bool, int);
void vwakeup(struct buf *);
int vdead_check(struct vnode *, int);
void vrevoke(struct vnode *);
void vremfree(struct vnode *);
void vshareilock(struct vnode *, struct vnode *);
void vshareklist(struct vnode *, struct vnode *);
int vrefcnt(struct vnode *);
int vcache_get(struct mount *, const void *, size_t, struct vnode **);
int vcache_new(struct mount *, struct vnode *,
struct vattr *, kauth_cred_t, void *, struct vnode **);
int vcache_rekey_enter(struct mount *, struct vnode *,
const void *, size_t, const void *, size_t);
void vcache_rekey_exit(struct mount *, struct vnode *,
const void *, size_t, const void *, size_t);
/* see vnsubr(9) */
int vn_bwrite(void *);
int vn_close(struct vnode *, int, kauth_cred_t);
int vn_isunder(struct vnode *, struct vnode *, struct lwp *);
int vn_lock(struct vnode *, int);
void vn_markexec(struct vnode *);
int vn_marktext(struct vnode *);
int vn_open(struct vnode *, struct pathbuf *, int, int, int,
struct vnode **, bool *, int *);
int vn_rdwr(enum uio_rw, struct vnode *, void *, int, off_t, enum uio_seg,
int, kauth_cred_t, size_t *, struct lwp *);
int vn_readdir(struct file *, char *, int, unsigned int, int *,
struct lwp *, off_t **, int *);
int vn_stat(struct vnode *, struct stat *);
int vn_kqfilter(struct file *, struct knote *);
int vn_writechk(struct vnode *);
int vn_openchk(struct vnode *, kauth_cred_t, int);
int vn_extattr_get(struct vnode *, int, int, const char *, size_t *,
void *, struct lwp *);
int vn_extattr_set(struct vnode *, int, int, const char *, size_t,
const void *, struct lwp *);
int vn_extattr_rm(struct vnode *, int, int, const char *, struct lwp *);
int vn_fifo_bypass(void *);
int vn_bdev_open(dev_t, struct vnode **, struct lwp *);
int vn_bdev_openpath(struct pathbuf *pb, struct vnode **, struct lwp *);
/* initialise global vnode management */
void vntblinit(void);
/* misc stuff */
void sched_sync(void *);
void vn_syncer_add_to_worklist(struct vnode *, int);
void vn_syncer_remove_from_worklist(struct vnode *);
int dorevoke(struct vnode *, kauth_cred_t);
int rawdev_mounted(struct vnode *, struct vnode **);
uint8_t vtype2dt(enum vtype);
/* see vfssubr(9) */
int vfs_unixify_accmode(accmode_t *);
void vfs_getnewfsid(struct mount *);
void vfs_timestamp(struct timespec *);
#if defined(DDB) || defined(DEBUGPRINT)
void vfs_vnode_print(struct vnode *, int, void (*)(const char *, ...)
__printflike(1, 2));
void vfs_vnode_lock_print(void *, int, void (*)(const char *, ...)
__printflike(1, 2));
void vfs_mount_print(struct mount *, int, void (*)(const char *, ...)
__printflike(1, 2));
void vfs_mount_print_all(int, void (*)(const char *, ...)
__printflike(1, 2));
#endif /* DDB */
#endif /* _KERNEL */
#endif /* !_SYS_VNODE_H_ */
/* $NetBSD: if.h,v 1.305 2023/10/09 11:55:34 riastradh Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by William Studenmund and Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if.h 8.3 (Berkeley) 2/9/95
*/
#ifndef _NET_IF_H_
#define _NET_IF_H_
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <stdbool.h>
#endif
#include <sys/featuretest.h>
/*
* Length of interface external name, including terminating '\0'.
* Note: this is the same size as a generic device's external name.
*/
#define IF_NAMESIZE 16
/*
* Length of interface description, including terminating '\0'.
*/
#define IFDESCRSIZE 64
#if defined(_NETBSD_SOURCE)
#include <sys/socket.h>
#include <sys/queue.h>
#include <sys/mutex.h>
#include <sys/hook.h>
#include <net/dlt.h>
#include <net/pfil.h>
#ifdef _KERNEL
#include <net/pktqueue.h>
#include <sys/pslist.h>
#include <sys/pserialize.h>
#include <sys/psref.h>
#include <sys/module_hook.h>
#endif
/*
* Always include ALTQ glue here -- we use the ALTQ interface queue
* structure even when ALTQ is not configured into the kernel so that
* the size of struct ifnet does not change based on the option. The
* ALTQ queue structure is API-compatible with the legacy ifqueue.
*/
#include <altq/if_altq.h>
/*
* Structures defining a network interface, providing a packet
* transport mechanism (ala level 0 of the PUP protocols).
*
* Each interface accepts output datagrams of a specified maximum
* length, and provides higher level routines with input datagrams
* received from its medium.
*
* Output occurs when the routine if_output is called, with four parameters:
* (*ifp->if_output)(ifp, m, dst, rt)
* Here m is the mbuf chain to be sent and dst is the destination address.
* The output routine encapsulates the supplied datagram if necessary,
* and then transmits it on its medium.
*
* On input, each interface unwraps the data received by it, and either
* places it on the input queue of an internetwork datagram routine
* and posts the associated software interrupt, or passes the datagram to a raw
* packet input routine.
*
* Routines exist for locating interfaces by their addresses
* or for locating an interface on a certain network, as well as more general
* routing and gateway routines maintaining information used to locate
* interfaces. These routines live in the files if.c and route.c
*/
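/*
 * Illustrative sketch (not part of the original header): a protocol
 * layer hands a datagram to an interface roughly as
 *
 *	error = (*ifp->if_output)(ifp, m, dst, rt);
 *
 * where m is the mbuf chain to send, dst the destination address and
 * rt the (possibly NULL) route used to reach it.
 */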
#include <sys/time.h>
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#include "opt_gateway.h"
#endif
struct mbuf;
struct proc;
struct rtentry;
struct socket;
struct ether_header;
struct ifaddr;
struct ifnet;
struct rt_addrinfo;
#define IFNAMSIZ IF_NAMESIZE
/*
* Structure describing a `cloning' interface.
*/
struct if_clone {
LIST_ENTRY(if_clone) ifc_list; /* on list of cloners */
const char *ifc_name; /* name of device, e.g. `gif' */
size_t ifc_namelen; /* length of name */
int (*ifc_create)(struct if_clone *, int);
int (*ifc_destroy)(struct ifnet *);
};
#define IF_CLONE_INITIALIZER(name, create, destroy) \
{ { NULL, NULL }, name, sizeof(name) - 1, create, destroy }
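/*
 * Illustrative sketch (not part of the original header; the "foo" names
 * are hypothetical): a cloning driver typically declares
 *
 *	static struct if_clone foo_cloner =
 *	    IF_CLONE_INITIALIZER("foo", foo_clone_create, foo_clone_destroy);
 *
 * and registers it with if_clone_attach(&foo_cloner) at attach time.
 */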
/*
* Structure used to query names of interface cloners.
*/
struct if_clonereq {
int ifcr_total; /* total cloners (out) */
int ifcr_count; /* room for this many in user buffer */
char *ifcr_buffer; /* buffer for cloner names */
};
/*
* Structure defining statistics and other data kept regarding a network
* interface.
*
* Only used for exporting data from the interface.
*/
struct if_data {
/* generic interface information */
u_char ifi_type; /* ethernet, tokenring, etc. */
u_char ifi_addrlen; /* media address length */
u_char ifi_hdrlen; /* media header length */
int ifi_link_state; /* current link state */
uint64_t ifi_mtu; /* maximum transmission unit */
uint64_t ifi_metric; /* routing metric (external only) */
uint64_t ifi_baudrate; /* linespeed */
/* volatile statistics */
uint64_t ifi_ipackets; /* packets received on interface */
uint64_t ifi_ierrors; /* input errors on interface */
uint64_t ifi_opackets; /* packets sent on interface */
uint64_t ifi_oerrors; /* output errors on interface */
uint64_t ifi_collisions; /* collisions on csma interfaces */
uint64_t ifi_ibytes; /* total number of octets received */
uint64_t ifi_obytes; /* total number of octets sent */
uint64_t ifi_imcasts; /* packets received via multicast */
uint64_t ifi_omcasts; /* packets sent via multicast */
uint64_t ifi_iqdrops; /* dropped on input, this interface */
uint64_t ifi_noproto; /* destined for unsupported protocol */
struct timespec ifi_lastchange;/* last operational state change */
};
/*
* Values for if_link_state.
*/
#define LINK_STATE_UNKNOWN 0 /* link invalid/unknown */
#define LINK_STATE_DOWN 1 /* link is down */
#define LINK_STATE_UP 2 /* link is up */
/*
* Status bit descriptions for the various interface types.
*/
struct if_status_description {
unsigned char ifs_type;
unsigned char ifs_state;
const char *ifs_string;
};
#define LINK_STATE_DESC_MATCH(_ifs, _t, _s) \
(((_ifs)->ifs_type == (_t) || (_ifs)->ifs_type == 0) && \
(_ifs)->ifs_state == (_s))
#define LINK_STATE_DESCRIPTIONS { \
{ IFT_ETHER, LINK_STATE_DOWN, "no carrier" }, \
{ IFT_IEEE80211, LINK_STATE_DOWN, "no network" }, \
{ IFT_PPP, LINK_STATE_DOWN, "no carrier" }, \
{ IFT_CARP, LINK_STATE_DOWN, "backup" }, \
{ IFT_CARP, LINK_STATE_UP, "master" }, \
{ 0, LINK_STATE_UP, "active" }, \
{ 0, LINK_STATE_UNKNOWN, "unknown" }, \
{ 0, LINK_STATE_DOWN, "down" }, \
{ 0, 0, NULL } \
}
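/*
 * Illustrative sketch (not part of the original header): to translate
 * an (interface type, link state) pair into a human-readable string,
 * scan the table until LINK_STATE_DESC_MATCH() succeeds, e.g.
 *
 *	const struct if_status_description descrs[] =
 *	    LINK_STATE_DESCRIPTIONS;
 *	const struct if_status_description *p;
 *
 *	for (p = descrs; p->ifs_string != NULL; p++)
 *		if (LINK_STATE_DESC_MATCH(p, ift, state))
 *			break;
 *
 * where p->ifs_string then names the state ("down", "no carrier", ...)
 * or is NULL if nothing matched.
 */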
/*
* Structure defining a queue for a network interface.
*/
struct ifqueue {
struct mbuf *ifq_head;
struct mbuf *ifq_tail;
int ifq_len;
int ifq_maxlen;
uint64_t ifq_drops;
kmutex_t *ifq_lock;
};
#ifdef _KERNEL
#include <sys/percpu.h>
#include <sys/callout.h>
#include <sys/rwlock.h>
#include <sys/workqueue.h>
#endif /* _KERNEL */
/*
* Structure defining a queue for a network interface.
*
* (Would like to call this struct ``if'', but C isn't PL/1.)
*/
TAILQ_HEAD(ifnet_head, ifnet); /* the actual queue head */
struct bridge_softc;
struct bridge_iflist;
struct callout;
struct krwlock;
struct if_percpuq;
struct if_deferred_start;
struct in6_multi;
typedef unsigned short if_index_t;
/*
* Interface. Field markings and the corresponding locks:
*
* i: IFNET_LOCK (a.k.a., if_ioctl_lock)
* q: ifq_lock (struct ifaltq)
* a: if_afdata_lock
* 6: in6_multilock (global lock)
* :: unlocked, stable
* ?: unknown, maybe unsafe
*
* Lock order: IFNET_LOCK => in6_multilock => if_afdata_lock => ifq_lock
* Note that currently if_afdata_lock and ifq_lock aren't held
* at the same time, but define the order anyway.
*
* Lock order of IFNET_LOCK with other locks:
* softnet_lock => solock => IFNET_LOCK => ND6_LOCK, in_multilock
*/
typedef struct ifnet {
void *if_softc; /* :: lower-level data for this if */
/* DEPRECATED. Keep it to avoid breaking kvm(3) users */
TAILQ_ENTRY(ifnet)
if_list; /* i: all struct ifnets are chained */
TAILQ_HEAD(, ifaddr)
if_addrlist; /* i: linked list of addresses per if */
char if_xname[IFNAMSIZ];
/* :: external name (name + unit) */
int if_pcount; /* i: number of promiscuous listeners */
struct bpf_if *if_bpf; /* :: packet filter structure */
if_index_t if_index; /* :: numeric abbreviation for this if */
short if_timer; /* ?: time 'til if_slowtimo called */
unsigned short if_flags; /* i: up/down, broadcast, etc. */
short if_extflags; /* :: if_output MP-safe, etc. */
u_char if_type; /* :: ethernet, tokenring, etc. */
u_char if_addrlen; /* :: media address length */
u_char if_hdrlen; /* :: media header length */
/* XXX audit :? fields here. */
int if_link_state; /* :? current link state */
uint64_t if_mtu; /* :? maximum transmission unit */
uint64_t if_metric; /* :? routing metric (external only) */
uint64_t if_baudrate; /* :? linespeed */
struct timespec if_lastchange; /* :? last operational state change */
#ifdef _KERNEL
percpu_t *if_stats; /* :: statistics */
#else
void *if_stats; /* opaque to user-space */
#endif /* _KERNEL */
/*
* Procedure handles. If you add more of these, don't forget the
* corresponding NULL stub in if.c.
*/
int (*if_output) /* :: output routine (enqueue) */
(struct ifnet *, struct mbuf *, const struct sockaddr *,
const struct rtentry *);
void (*_if_input) /* :: input routine (from h/w driver) */
(struct ifnet *, struct mbuf *);
void (*if_start) /* :: initiate output routine */
(struct ifnet *);
int (*if_transmit) /* :: output routine, must be MP-safe */
(struct ifnet *, struct mbuf *);
int (*if_ioctl) /* :: ioctl routine */
(struct ifnet *, u_long, void *);
int (*if_init) /* :: init routine */
(struct ifnet *);
void (*if_stop) /* :: stop routine */
(struct ifnet *, int);
void (*if_slowtimo) /* :: timer routine */
(struct ifnet *);
#define if_watchdog if_slowtimo
void (*if_drain) /* :: routine to release resources */
(struct ifnet *);
void (*if_bpf_mtap) /* :: bpf routine */
(struct bpf_if *, struct mbuf *, u_int);
struct ifaltq if_snd; /* q: output queue (includes altq) */
struct ifaddr *if_dl; /* i: identity of this interface. */
const struct sockaddr_dl
*if_sadl; /* i: pointer to sockaddr_dl of if_dl */
/*
* May be NULL. If not NULL, it is the address assigned
* to the interface by the manufacturer, so it is very likely
* to be unique. It MUST NOT be deleted. It is highly
* suitable for deriving the EUI64 for the interface.
*/
struct ifaddr *if_hwdl; /* i: h/w identity */
const uint8_t *if_broadcastaddr;
/* :: linklevel broadcast bytestring */
struct bridge_softc
*if_bridge; /* i: bridge glue */
struct bridge_iflist
*if_bridgeif; /* i: shortcut to interface list entry */
int if_dlt; /* :: data link type (<net/dlt.h>) */
pfil_head_t * if_pfil; /* :: filtering point */
uint64_t if_capabilities;
/* i: interface capabilities */
uint64_t if_capenable; /* i: capabilities enabled */
union {
void * carp_s; /* carp structure (used by !carp ifs) */
struct ifnet *carp_d;/* ptr to carpdev (used by carp ifs) */
} if_carp_ptr; /* ?: */
#define if_carp if_carp_ptr.carp_s
#define if_carpdev if_carp_ptr.carp_d
/*
* These are pre-computed based on an interface's enabled
* capabilities, for speed elsewhere.
*/
int if_csum_flags_tx;
/* i: M_CSUM_* flags for Tx */
int if_csum_flags_rx;
/* i: M_CSUM_* flags for Rx */
void *if_afdata[AF_MAX];
/* a: */
struct mowner *if_mowner; /* ?: who owns mbufs for this interface */
void *if_lagg; /* :: lagg or agr structure */
void *if_npf_private;/* ?: associated NPF context */
/*
* pf specific data, used only when #if NPF > 0.
*/
void *if_pf_kif; /* ?: pf interface abstraction */
void *if_pf_groups; /* ?: pf interface groups */
/*
* During an ifnet's lifetime, it has only one if_index, but
* an if_index is not sufficient to identify an ifnet
* because during the lifetime of the system, many ifnets may occupy a
* given if_index. Let us tell different ifnets at the same
* if_index apart by their if_index_gen, a unique number that each ifnet
* is assigned when it if_attach()s. Now, the kernel can use the
* pair (if_index, if_index_gen) as a weak reference to an ifnet.
*/
uint64_t if_index_gen; /* :: generation number for the ifnet
* at if_index: if two ifnets' index
* and generation number are both the
* same, they are the same ifnet.
*/
struct sysctllog
*if_sysctl_log; /* :: */
int (*if_initaddr) /* :: */
(struct ifnet *, struct ifaddr *, bool);
int (*if_setflags) /* :: */
(struct ifnet *, const u_short);
kmutex_t *if_ioctl_lock; /* :: */
char *if_description; /* i: interface description */
#ifdef _KERNEL /* XXX kvm(3) */
struct if_slowtimo_data *if_slowtimo_data; /* :: */
struct krwlock *if_afdata_lock;/* :: */
struct if_percpuq
*if_percpuq; /* :: we should remove it in the future */
struct work if_link_work; /* q: linkage on link state work queue */
uint16_t if_link_queue; /* q: masked link state change queue */
/* q: is link state work scheduled? */
bool if_link_scheduled;
struct pslist_entry
if_pslist_entry;/* i: */
struct psref_target
if_psref; /* :: */
struct pslist_head
if_addr_pslist; /* i: */
struct if_deferred_start
*if_deferred_start;
/* :: */
/* XXX should be protocol independent */
LIST_HEAD(, in6_multi)
if_multiaddrs; /* 6: */
khook_list_t *if_linkstate_hooks; /* :: */
#endif
} ifnet_t;
#include <net/if_stats.h>
#define if_name(ifp) ((ifp)->if_xname)
#define IFF_UP 0x0001 /* interface is up */
#define IFF_BROADCAST 0x0002 /* broadcast address valid */
#define IFF_DEBUG 0x0004 /* turn on debugging */
#define IFF_LOOPBACK 0x0008 /* is a loopback net */
#define IFF_POINTOPOINT 0x0010 /* interface is point-to-point link */
#if 0
/* 0x0020 was IFF_NOTRAILERS */
#else
/*
* sys/compat/svr4 was removed on 19 Dec 2018.
* IFF_NOTRAILERS itself was then removed by if.h:r1.268 on 5 Feb 2019.
*/
#define IFF_UNNUMBERED 0x0020 /* explicit unnumbered */
#endif
#define IFF_RUNNING 0x0040 /* resources allocated */
#define IFF_NOARP 0x0080 /* no address resolution protocol */
#define IFF_PROMISC 0x0100 /* receive all packets */
#define IFF_ALLMULTI 0x0200 /* OBSOLETE -- DO NOT USE */
/*
* IFF_ALLMULTI obsoleted on 2019-05-15 -- existing non-MP-safe drivers
* can use it for themselves under IFNET_LOCK, but they should be
* converted to use ETHER_F_ALLMULTI under ETHER_LOCK instead. For
* compatibility with existing drivers, if_ethersubr and if_arcsubr
* will set IFF_ALLMULTI according to other flags, but you should not
* rely on this.
*/
#define IFF_OACTIVE 0x0400 /* transmission in progress */
#define IFF_SIMPLEX 0x0800 /* can't hear own transmissions */
#define IFF_LINK0 0x1000 /* per link layer defined bit */
#define IFF_LINK1 0x2000 /* per link layer defined bit */
#define IFF_LINK2 0x4000 /* per link layer defined bit */
#define IFF_MULTICAST 0x8000 /* supports multicast */
#define IFEF_MPSAFE __BIT(0) /* handlers can run in parallel (see below) */
/*
* The guidelines for converting an interface to IFEF_MPSAFE are as follows:
*
* Enabling IFEF_MPSAFE on an interface suppresses taking KERNEL_LOCK when
* calling the following handlers:
* - if_start
* - Note that if_transmit is always called without KERNEL_LOCK
* - if_output
* - if_ioctl
* - if_init
* - if_stop
*
* This means that an interface with IFEF_MPSAFE must make the above handlers
* MP-safe or take KERNEL_LOCK by itself inside handlers that aren't MP-safe
* yet.
*
* There are some additional restrictions to access member variables of struct
* ifnet:
* - if_flags
* - Must be updated while holding IFNET_LOCK
* - You cannot use the flag in Tx/Rx paths anymore because there is no
* synchronization on the flag except for IFNET_LOCK
* - Note that IFNET_LOCK can't be taken in softint because it's known
* that it causes a deadlock
* - Some synchronization mechanisms such as pserialize_perform are called
* with IFNET_LOCK held and also require context switches on every CPU,
* which implies that all softints have finished; trying to take
* IFNET_LOCK in softint may therefore block on IFNET_LOCK and prevent
* such synchronization mechanisms from completing
* - Currently the deadlock occurs only if NET_MPSAFE is enabled; however,
* we should honor the restriction because NET_MPSAFE will be enabled
* by default in the future
* - if_watchdog and if_timer
* - The watchdog framework works only for non-IFEF_MPSAFE interfaces
* that rely on KERNEL_LOCK
* - Interfaces with IFEF_MPSAFE have to provide their own watchdog mechanism
* if needed
* - Keep if_watchdog NULL when calling if_attach
*/
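/*
 * Illustrative sketch only (not taken from any particular driver): a
 * hypothetical foo(4) attach routine opting in to IFEF_MPSAFE.  Once the
 * flag is set, the handlers listed above are called without KERNEL_LOCK,
 * so they must be MP-safe or take KERNEL_LOCK themselves.
 *
 *	ifp->if_softc = sc;
 *	ifp->if_extflags = IFEF_MPSAFE;
 *	ifp->if_ioctl = foo_ioctl;	(must now be MP-safe)
 *	ifp->if_start = foo_start;	(must now be MP-safe)
 *	ifp->if_watchdog = NULL;	(provide a private watchdog if needed)
 *	if_initialize(ifp);
 *	...
 *	if_register(ifp);
 */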
#ifdef _KERNEL
static __inline bool
if_is_mpsafe(struct ifnet *ifp)
{
return ((ifp->if_extflags & IFEF_MPSAFE) != 0);
}
static __inline int
if_output_lock(struct ifnet *cifp, struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *dst, const struct rtentry *rt)
{
if (if_is_mpsafe(cifp)) {
return (*cifp->if_output)(ifp, m, dst, rt);
} else {
int ret;
KERNEL_LOCK(1, NULL);
ret = (*cifp->if_output)(ifp, m, dst, rt);
KERNEL_UNLOCK_ONE(NULL);
return ret;
}
}
static __inline void
if_start_lock(struct ifnet *ifp)
{
if (if_is_mpsafe(ifp)) {
(*ifp->if_start)(ifp);
} else {
KERNEL_LOCK(1, NULL);
(*ifp->if_start)(ifp);
KERNEL_UNLOCK_ONE(NULL);
}
}
#define KERNEL_LOCK_IF_IFP_MPSAFE(ifp) \
do { if (if_is_mpsafe(ifp)) { KERNEL_LOCK(1, NULL); } } while (0)
#define KERNEL_UNLOCK_IF_IFP_MPSAFE(ifp) \
do { if (if_is_mpsafe(ifp)) { KERNEL_UNLOCK_ONE(NULL); } } while (0)
#define KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp) \
do { if (!if_is_mpsafe(ifp)) { KERNEL_LOCK(1, NULL); } } while (0)
#define KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp) \
do { if (!if_is_mpsafe(ifp)) { KERNEL_UNLOCK_ONE(NULL); } } while (0)
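/*
 * Minimal usage sketch (illustrative, assuming "ifp" is a valid ifnet):
 * framework code that must call a possibly non-MP-safe handler brackets
 * the call the same way the inline helpers above do, e.g.:
 *
 *	KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp);
 *	error = (*ifp->if_ioctl)(ifp, cmd, data);
 *	KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp);
 */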
#ifdef _KERNEL_OPT
#include "opt_net_mpsafe.h"
#endif
/* XXX explore a better place to define */
#ifdef NET_MPSAFE
#define KERNEL_LOCK_UNLESS_NET_MPSAFE() do { } while (0)
#define KERNEL_UNLOCK_UNLESS_NET_MPSAFE() do { } while (0)
#define SOFTNET_LOCK_UNLESS_NET_MPSAFE() do { } while (0)
#define SOFTNET_UNLOCK_UNLESS_NET_MPSAFE() do { } while (0)
#define SOFTNET_LOCK_IF_NET_MPSAFE() \
do { mutex_enter(softnet_lock); } while (0)
#define SOFTNET_UNLOCK_IF_NET_MPSAFE() \
do { mutex_exit(softnet_lock); } while (0)
#else /* NET_MPSAFE */
#define KERNEL_LOCK_UNLESS_NET_MPSAFE() \
do { KERNEL_LOCK(1, NULL); } while (0)
#define KERNEL_UNLOCK_UNLESS_NET_MPSAFE() \
do { KERNEL_UNLOCK_ONE(NULL); } while (0)
#define SOFTNET_LOCK_UNLESS_NET_MPSAFE() \
do { mutex_enter(softnet_lock); } while (0)
#define SOFTNET_UNLOCK_UNLESS_NET_MPSAFE() \
do { mutex_exit(softnet_lock); } while (0)
#define SOFTNET_LOCK_IF_NET_MPSAFE() do { } while (0)
#define SOFTNET_UNLOCK_IF_NET_MPSAFE() do { } while (0)
#endif /* NET_MPSAFE */
#define SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE() \
do { \
SOFTNET_LOCK_UNLESS_NET_MPSAFE(); \
KERNEL_LOCK_UNLESS_NET_MPSAFE(); \
} while (0)
#define SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE() \
do { \
KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); \
SOFTNET_UNLOCK_UNLESS_NET_MPSAFE(); \
} while (0)
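/*
 * Typical use of the combined macros (sketch only): a protocol entry point
 * that is not yet MP-safe takes both softnet_lock and KERNEL_LOCK unless the
 * kernel is built with NET_MPSAFE:
 *
 *	SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE();
 *	... protocol processing ...
 *	SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
 */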
#endif /* _KERNEL */
#define IFFBITS \
"\020\1UP\2BROADCAST\3DEBUG\4LOOPBACK\5POINTOPOINT\6UNNUMBERED" \
"\7RUNNING\10NOARP\11PROMISC\12ALLMULTI\13OACTIVE\14SIMPLEX" \
"\15LINK0\16LINK1\17LINK2\20MULTICAST"
/* flags set internally only: */
#define IFF_CANTCHANGE \
(IFF_BROADCAST|IFF_POINTOPOINT|IFF_RUNNING|IFF_OACTIVE|\
IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_PROMISC)
/*
* Some convenience macros used for setting ifi_baudrate.
*/
#define IF_Kbps(x) ((x) * 1000ULL) /* kilobits/sec. */
#define IF_Mbps(x) (IF_Kbps((x) * 1000ULL)) /* megabits/sec. */
#define IF_Gbps(x) (IF_Mbps((x) * 1000ULL)) /* gigabits/sec. */
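/*
 * Example (illustrative): a driver for 1 Gb/s hardware would typically set
 *
 *	ifp->if_baudrate = IF_Gbps(1);
 *
 * in its attach routine.
 */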
/* Capabilities that interfaces can advertise. */
/* 0x01 .. 0x40 were previously used */
#define IFCAP_TSOv4 0x00080 /* can do TCPv4 segmentation offload */
#define IFCAP_CSUM_IPv4_Rx 0x00100 /* can do IPv4 header checksums (Rx) */
#define IFCAP_CSUM_IPv4_Tx 0x00200 /* can do IPv4 header checksums (Tx) */
#define IFCAP_CSUM_TCPv4_Rx 0x00400 /* can do IPv4/TCP checksums (Rx) */
#define IFCAP_CSUM_TCPv4_Tx 0x00800 /* can do IPv4/TCP checksums (Tx) */
#define IFCAP_CSUM_UDPv4_Rx 0x01000 /* can do IPv4/UDP checksums (Rx) */
#define IFCAP_CSUM_UDPv4_Tx 0x02000 /* can do IPv4/UDP checksums (Tx) */
#define IFCAP_CSUM_TCPv6_Rx 0x04000 /* can do IPv6/TCP checksums (Rx) */
#define IFCAP_CSUM_TCPv6_Tx 0x08000 /* can do IPv6/TCP checksums (Tx) */
#define IFCAP_CSUM_UDPv6_Rx 0x10000 /* can do IPv6/UDP checksums (Rx) */
#define IFCAP_CSUM_UDPv6_Tx 0x20000 /* can do IPv6/UDP checksums (Tx) */
#define IFCAP_TSOv6 0x40000 /* can do TCPv6 segmentation offload */
#define IFCAP_LRO 0x80000 /* can do Large Receive Offload */
#define IFCAP_MASK 0xfff80 /* currently valid capabilities */
#define IFCAPBITS \
"\020" \
"\10TSO4" \
"\11IP4CSUM_Rx" \
"\12IP4CSUM_Tx" \
"\13TCP4CSUM_Rx" \
"\14TCP4CSUM_Tx" \
"\15UDP4CSUM_Rx" \
"\16UDP4CSUM_Tx" \
"\17TCP6CSUM_Rx" \
"\20TCP6CSUM_Tx" \
"\21UDP6CSUM_Rx" \
"\22UDP6CSUM_Tx" \
"\23TSO6" \
"\24LRO" \
#define IF_AFDATA_LOCK_INIT(ifp) \
do {(ifp)->if_afdata_lock = rw_obj_alloc();} while (0)
#define IF_AFDATA_LOCK_DESTROY(ifp) rw_obj_free((ifp)->if_afdata_lock)
#define IF_AFDATA_WLOCK(ifp) rw_enter((ifp)->if_afdata_lock, RW_WRITER)
#define IF_AFDATA_RLOCK(ifp) rw_enter((ifp)->if_afdata_lock, RW_READER)
#define IF_AFDATA_WUNLOCK(ifp) rw_exit((ifp)->if_afdata_lock)
#define IF_AFDATA_RUNLOCK(ifp) rw_exit((ifp)->if_afdata_lock)
#define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp)
#define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp)
#define IF_AFDATA_TRYLOCK(ifp) rw_tryenter((ifp)->if_afdata_lock, RW_WRITER)
#define IF_AFDATA_LOCK_ASSERT(ifp) \
KASSERT(rw_lock_held((ifp)->if_afdata_lock))
#define IF_AFDATA_RLOCK_ASSERT(ifp) \
KASSERT(rw_read_held((ifp)->if_afdata_lock))
#define IF_AFDATA_WLOCK_ASSERT(ifp) \
KASSERT(rw_write_held((ifp)->if_afdata_lock))
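/*
 * Usage sketch (illustrative): per-AF data hanging off if_afdata[] is read
 * under the reader lock and installed or replaced under the writer lock:
 *
 *	IF_AFDATA_RLOCK(ifp);
 *	data = ifp->if_afdata[AF_INET6];	(read-only use of data)
 *	IF_AFDATA_RUNLOCK(ifp);
 */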
/*
* Output queues (ifp->if_snd) and internetwork datagram level (pup level 1)
* input routines have queues of messages stored on ifqueue structures
* (defined above). Entries are added to and deleted from these structures
* by these macros, which should be called with ipl raised to splnet().
*/
#define IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen)
#define IF_DROP(ifq) ((ifq)->ifq_drops++)
#define IF_ENQUEUE(ifq, m) do { \
(m)->m_nextpkt = 0; \
if ((ifq)->ifq_tail == 0) \
(ifq)->ifq_head = m; \
else \
(ifq)->ifq_tail->m_nextpkt = m; \
(ifq)->ifq_tail = m; \
(ifq)->ifq_len++; \
} while (/*CONSTCOND*/0)
#define IF_PREPEND(ifq, m) do { \
(m)->m_nextpkt = (ifq)->ifq_head; \
if ((ifq)->ifq_tail == 0) \
(ifq)->ifq_tail = (m); \
(ifq)->ifq_head = (m); \
(ifq)->ifq_len++; \
} while (/*CONSTCOND*/0)
#define IF_DEQUEUE(ifq, m) do { \
(m) = (ifq)->ifq_head; \
if (m) { \
if (((ifq)->ifq_head = (m)->m_nextpkt) == 0) \
(ifq)->ifq_tail = 0; \
(m)->m_nextpkt = 0; \
(ifq)->ifq_len--; \
} \
} while (/*CONSTCOND*/0)
#define IF_POLL(ifq, m) ((m) = (ifq)->ifq_head)
#define IF_PURGE(ifq) \
do { \
struct mbuf *__m0; \
\
for (;;) { \
IF_DEQUEUE((ifq), __m0); \
if (__m0 == NULL) \
break; \
else \
m_freem(__m0); \
} \
} while (/*CONSTCOND*/ 0)
#define IF_IS_EMPTY(ifq) ((ifq)->ifq_len == 0)
#ifndef IFQ_MAXLEN
#define IFQ_MAXLEN 256
#endif
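/*
 * Classic producer-side use of the macros above (sketch only; "sc_intrq" is
 * a hypothetical software queue in a driver softc), with the ipl raised as
 * required by the comment above:
 *
 *	int s = splnet();
 *	if (IF_QFULL(&sc->sc_intrq)) {
 *		IF_DROP(&sc->sc_intrq);
 *		m_freem(m);
 *	} else
 *		IF_ENQUEUE(&sc->sc_intrq, m);
 *	splx(s);
 */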
#define IFNET_SLOWHZ 1 /* granularity is 1 second */
/*
* Structure defining statistics and other data kept regarding an address
* on a network interface.
*/
struct ifaddr_data {
int64_t ifad_inbytes;
int64_t ifad_outbytes;
};
/*
* The ifaddr structure contains information about one address
* of an interface. They are maintained by the different address families,
* are allocated and attached when an address is set, and are linked
* together so all addresses for an interface can be located.
*/
struct ifaddr {
struct sockaddr *ifa_addr; /* address of interface */
struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */
#define ifa_broadaddr ifa_dstaddr /* broadcast address interface */
struct sockaddr *ifa_netmask; /* used to determine subnet */
struct ifnet *ifa_ifp; /* back-pointer to interface */
TAILQ_ENTRY(ifaddr) ifa_list; /* list of addresses for interface */
struct ifaddr_data ifa_data; /* statistics on the address */
void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */
(int, struct rtentry *, const struct rt_addrinfo *);
u_int ifa_flags; /* mostly rt_flags for cloning */
int ifa_refcnt; /* count of references */
int ifa_metric; /* cost of going out this interface */
struct ifaddr *(*ifa_getifa)(struct ifaddr *,
const struct sockaddr *);
uint32_t *ifa_seqno;
int16_t ifa_preference; /* preference level for this address */
#ifdef _KERNEL
struct pslist_entry ifa_pslist_entry;
struct psref_target ifa_psref;
#endif
};
#define IFA_ROUTE RTF_UP /* (0x01) route installed */
#define IFA_DESTROYING 0x2
/*
* Message format for use in obtaining information about interfaces from
* sysctl and the routing socket. We need to force 64-bit alignment if we
* aren't using compatibility definitions.
*/
#if !defined(_KERNEL) || !defined(COMPAT_RTSOCK)
#define __align64 __aligned(sizeof(uint64_t))
#else
#define __align64
#endif
struct if_msghdr {
u_short ifm_msglen __align64;
/* to skip over non-understood messages */
u_char ifm_version; /* future binary compatibility */
u_char ifm_type; /* message type */
int ifm_addrs; /* like rtm_addrs */
int ifm_flags; /* value of if_flags */
u_short ifm_index; /* index for associated ifp */
struct if_data ifm_data __align64;
/* statistics and other data about if */
};
/*
* Message format for use in obtaining information about interface addresses
* from sysctl and the routing socket.
*/
struct ifa_msghdr {
u_short ifam_msglen __align64;
/* to skip over non-understood messages */
u_char ifam_version; /* future binary compatibility */
u_char ifam_type; /* message type */
u_short ifam_index; /* index for associated ifp */
int ifam_flags; /* value of ifa_flags */
int ifam_addrs; /* like rtm_addrs */
pid_t ifam_pid; /* identify sender */
int ifam_addrflags; /* family specific address flags */
int ifam_metric; /* value of ifa_metric */
};
/*
* Message format announcing the arrival or departure of a network interface.
*/
struct if_announcemsghdr {
u_short ifan_msglen __align64;
/* to skip over non-understood messages */
u_char ifan_version; /* future binary compatibility */
u_char ifan_type; /* message type */
u_short ifan_index; /* index for associated ifp */
char ifan_name[IFNAMSIZ]; /* if name, e.g. "en0" */
u_short ifan_what; /* what type of announcement */
};
#define IFAN_ARRIVAL 0 /* interface arrival */
#define IFAN_DEPARTURE 1 /* interface departure */
#undef __align64
/*
* Interface request structure used for socket
* ioctl's. All interface ioctl's must have parameter
* definitions which begin with ifr_name. The
* remainder may be interface specific.
*/
struct ifreq {
char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
union {
struct sockaddr ifru_addr;
struct sockaddr ifru_dstaddr;
struct sockaddr ifru_broadaddr;
struct sockaddr_storage ifru_space;
short ifru_flags;
int ifru_addrflags;
int ifru_metric;
int ifru_mtu;
int ifru_dlt;
u_int ifru_value;
void * ifru_data;
struct {
uint32_t b_buflen;
void *b_buf;
} ifru_b;
} ifr_ifru;
#define ifr_addr ifr_ifru.ifru_addr /* address */
#define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */
#define ifr_broadaddr ifr_ifru.ifru_broadaddr /* broadcast address */
#define ifr_space ifr_ifru.ifru_space /* sockaddr_storage */
#define ifr_flags ifr_ifru.ifru_flags /* flags */
#define ifr_addrflags ifr_ifru.ifru_addrflags /* addr flags */
#define ifr_metric ifr_ifru.ifru_metric /* metric */
#define ifr_mtu ifr_ifru.ifru_mtu /* mtu */
#define ifr_dlt ifr_ifru.ifru_dlt /* data link type (DLT_*) */
#define ifr_value ifr_ifru.ifru_value /* generic value */
#define ifr_media ifr_ifru.ifru_metric /* media options (overload) */
#define ifr_data ifr_ifru.ifru_data /* for use by interface
* XXX deprecated
*/
#define ifr_buf ifr_ifru.ifru_b.b_buf /* new interface ioctls */
#define ifr_buflen ifr_ifru.ifru_b.b_buflen
#define ifr_index ifr_ifru.ifru_value /* interface index, BSD */
#define ifr_ifindex ifr_index /* interface index, linux */
};
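/*
 * Userland sketch (illustrative): struct ifreq is the argument of most
 * per-interface ioctls, e.g. querying the flags of a hypothetical "wm0":
 *
 *	struct ifreq ifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strlcpy(ifr.ifr_name, "wm0", sizeof(ifr.ifr_name));
 *	if (ioctl(s, SIOCGIFFLAGS, &ifr) == 0 &&
 *	    (ifr.ifr_flags & IFF_UP) != 0)
 *		... the interface is administratively up ...
 */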
#ifdef _KERNEL
#define ifreq_setdstaddr ifreq_setaddr
#define ifreq_setbroadaddr ifreq_setaddr
#define ifreq_getdstaddr ifreq_getaddr
#define ifreq_getbroadaddr ifreq_getaddr
static __inline const struct sockaddr *
/*ARGSUSED*/
ifreq_getaddr(u_long cmd, const struct ifreq *ifr)
{
return &ifr->ifr_addr;
}
#endif /* _KERNEL */
struct ifcapreq {
char ifcr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
uint64_t ifcr_capabilities; /* supported capabilities */
uint64_t ifcr_capenable; /* capabilities enabled */
};
struct ifaliasreq {
char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */
struct sockaddr ifra_addr;
struct sockaddr ifra_dstaddr;
#define ifra_broadaddr ifra_dstaddr
struct sockaddr ifra_mask;
};
struct ifdatareq {
char ifdr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
struct if_data ifdr_data;
};
struct ifmediareq {
char ifm_name[IFNAMSIZ]; /* if name, e.g. "en0" */
int ifm_current; /* IFMWD: current media options */
int ifm_mask; /* IFMWD: don't care mask */
int ifm_status; /* media status */
int ifm_active; /* IFMWD: active options */
int ifm_count; /* # entries in ifm_ulist
array */
int *ifm_ulist; /* array of ifmedia word */
};
struct ifdrv {
char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */
unsigned long ifd_cmd;
size_t ifd_len;
void *ifd_data;
};
#define IFLINKSTR_QUERYLEN 0x01
#define IFLINKSTR_UNSET 0x02
/*
* Structure used in SIOCGIFCONF request.
* Used to retrieve interface configuration
* for machine (useful for programs which
* must know all networks accessible).
*/
struct ifconf {
int ifc_len; /* size of associated buffer */
union {
void * ifcu_buf;
struct ifreq *ifcu_req;
} ifc_ifcu;
#define ifc_buf ifc_ifcu.ifcu_buf /* buffer address */
#define ifc_req ifc_ifcu.ifcu_req /* array of structures returned */
};
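/*
 * Userland sketch (illustrative): SIOCGIFCONF fills the caller-supplied
 * buffer with variable-length struct ifreq records (variable because of the
 * embedded sockaddrs) and sets ifc_len to the number of bytes used:
 *
 *	char buf[8192];
 *	struct ifconf ifc;
 *
 *	ifc.ifc_len = sizeof(buf);
 *	ifc.ifc_buf = buf;
 *	if (ioctl(s, SIOCGIFCONF, &ifc) == 0)
 *		... walk ifc.ifc_len bytes of records starting at buf ...
 */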
/*
* Structure for SIOC[AGD]LIFADDR
*/
struct if_laddrreq {
char iflr_name[IFNAMSIZ];
unsigned int flags;
#define IFLR_PREFIX 0x8000 /* in: prefix given out: kernel fills id */
#define IFLR_ACTIVE 0x4000 /* in/out: link-layer address activation */
#define IFLR_FACTORY 0x2000 /* in/out: factory link-layer address */
unsigned int prefixlen; /* in/out */
struct sockaddr_storage addr; /* in/out */
struct sockaddr_storage dstaddr; /* out */
};
/*
* Structure for SIOC[SG]IFADDRPREF
*/
struct if_addrprefreq {
char ifap_name[IFNAMSIZ];
int16_t ifap_preference; /* in/out */
struct sockaddr_storage ifap_addr; /* in/out */
};
#include <net/if_arp.h>
#endif /* _NETBSD_SOURCE */
#ifdef _KERNEL
#ifdef ALTQ
#define IFQ_ENQUEUE(ifq, m, err) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_ENQUEUE((ifq), (m), (err)); \
else { \
if (IF_QFULL(ifq)) { \
m_freem(m); \
(err) = ENOBUFS; \
} else { \
IF_ENQUEUE((ifq), (m)); \
(err) = 0; \
} \
} \
if ((err)) \
(ifq)->ifq_drops++; \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_DEQUEUE(ifq, m) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (TBR_IS_ENABLED(ifq)) \
(m) = tbr_dequeue((ifq), ALTDQ_REMOVE); \
else if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_DEQUEUE((ifq), (m)); \
else \
IF_DEQUEUE((ifq), (m)); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_POLL(ifq, m) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (TBR_IS_ENABLED(ifq)) \
(m) = tbr_dequeue((ifq), ALTDQ_POLL); \
else if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_POLL((ifq), (m)); \
else \
IF_POLL((ifq), (m)); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_PURGE(ifq) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_PURGE(ifq); \
else \
IF_PURGE(ifq); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_SET_READY(ifq) \
do { \
(ifq)->altq_flags |= ALTQF_READY; \
} while (/*CONSTCOND*/ 0)
#define IFQ_CLASSIFY(ifq, m, af) \
do { \
KASSERT(((m)->m_flags & M_PKTHDR) != 0); \
mutex_enter((ifq)->ifq_lock); \
if (ALTQ_IS_ENABLED(ifq)) { \
if (ALTQ_NEEDS_CLASSIFY(ifq)) \
(m)->m_pkthdr.pattr_class = (*(ifq)->altq_classify) \
((ifq)->altq_clfier, (m), (af)); \
(m)->m_pkthdr.pattr_af = (af); \
(m)->m_pkthdr.pattr_hdr = mtod((m), void *); \
} \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#else /* ! ALTQ */
#define IFQ_ENQUEUE(ifq, m, err) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (IF_QFULL(ifq)) { \
m_freem(m); \
(err) = ENOBUFS; \
} else { \
IF_ENQUEUE((ifq), (m)); \
(err) = 0; \
} \
if (err) \
(ifq)->ifq_drops++; \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_DEQUEUE(ifq, m) \
do { \
mutex_enter((ifq)->ifq_lock); \
IF_DEQUEUE((ifq), (m)); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_POLL(ifq, m) \
do { \
mutex_enter((ifq)->ifq_lock); \
IF_POLL((ifq), (m)); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_PURGE(ifq) \
do { \
mutex_enter((ifq)->ifq_lock); \
IF_PURGE(ifq); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_SET_READY(ifq) /* nothing */
#define IFQ_CLASSIFY(ifq, m, af) /* nothing */
#endif /* ALTQ */
#define IFQ_LOCK_INIT(ifq) (ifq)->ifq_lock = \
mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET)
#define IFQ_LOCK_DESTROY(ifq) mutex_obj_free((ifq)->ifq_lock)
#define IFQ_LOCK(ifq) mutex_enter((ifq)->ifq_lock)
#define IFQ_UNLOCK(ifq) mutex_exit((ifq)->ifq_lock)
#define IFQ_IS_EMPTY(ifq) IF_IS_EMPTY(ifq)
#define IFQ_INC_LEN(ifq) ((ifq)->ifq_len++)
#define IFQ_DEC_LEN(ifq) (--(ifq)->ifq_len)
#define IFQ_INC_DROPS(ifq) ((ifq)->ifq_drops++)
#define IFQ_SET_MAXLEN(ifq, len) ((ifq)->ifq_maxlen = (len))
#include <sys/mallocvar.h>
MALLOC_DECLARE(M_IFADDR);
MALLOC_DECLARE(M_IFMADDR);
int ifreq_setaddr(u_long, struct ifreq *, const struct sockaddr *);
struct ifnet *if_alloc(u_char);
void if_free(struct ifnet *);
void if_initname(struct ifnet *, const char *, int);
struct ifaddr *if_dl_create(const struct ifnet *, const struct sockaddr_dl **);
void if_activate_sadl(struct ifnet *, struct ifaddr *,
const struct sockaddr_dl *);
void if_set_sadl(struct ifnet *, const void *, u_char, bool);
void if_alloc_sadl(struct ifnet *);
void if_free_sadl(struct ifnet *, int);
void if_initialize(struct ifnet *);
void if_register(struct ifnet *);
void if_attach(struct ifnet *); /* Deprecated. Use if_initialize and if_register */
void if_attachdomain(void);
void if_deactivate(struct ifnet *);
bool if_is_deactivated(const struct ifnet *);
void if_export_if_data(struct ifnet *, struct if_data *, bool);
void if_purgeaddrs(struct ifnet *, int, void (*)(struct ifaddr *));
void if_detach(struct ifnet *);
void if_down(struct ifnet *);
void if_down_locked(struct ifnet *);
void if_link_state_change(struct ifnet *, int);
void if_domain_link_state_change(struct ifnet *, int);
void if_up(struct ifnet *);
void ifinit(void);
void ifinit1(void);
void ifinit_post(void);
int ifaddrpref_ioctl(struct socket *, u_long, void *, struct ifnet *);
extern int (*ifioctl)(struct socket *, u_long, void *, struct lwp *);
int ifioctl_common(struct ifnet *, u_long, void *);
int ifpromisc(struct ifnet *, int);
int ifpromisc_locked(struct ifnet *, int);
int if_addr_init(ifnet_t *, struct ifaddr *, bool);
int if_do_dad(struct ifnet *);
int if_mcast_op(ifnet_t *, const unsigned long, const struct sockaddr *);
int if_flags_set(struct ifnet *, const u_short);
int if_clone_list(int, char *, int *);
int if_ioctl(struct ifnet *, u_long, void *);
int if_init(struct ifnet *);
void if_stop(struct ifnet *, int);
struct ifnet *ifunit(const char *);
struct ifnet *if_get(const char *, struct psref *);
ifnet_t *if_byindex(u_int);
ifnet_t *_if_byindex(u_int);
ifnet_t *if_get_byindex(u_int, struct psref *);
ifnet_t *if_get_bylla(const void *, unsigned char, struct psref *);
void if_put(const struct ifnet *, struct psref *);
void if_acquire(struct ifnet *, struct psref *);
#define if_release if_put
int if_tunnel_check_nesting(struct ifnet *, struct mbuf *, int);
percpu_t *if_tunnel_alloc_ro_percpu(void);
void if_tunnel_free_ro_percpu(percpu_t *);
void if_tunnel_ro_percpu_rtcache_free(percpu_t *);
struct tunnel_ro {
struct route *tr_ro;
kmutex_t *tr_lock;
};
static inline void
if_tunnel_get_ro(percpu_t *ro_percpu, struct route **ro, kmutex_t **lock)
{
struct tunnel_ro *tro;
tro = percpu_getref(ro_percpu);
*ro = tro->tr_ro;
*lock = tro->tr_lock;
mutex_enter(*lock);
}
static inline void
if_tunnel_put_ro(percpu_t *ro_percpu, kmutex_t *lock)
{
mutex_exit(lock);
percpu_putref(ro_percpu);
}
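/*
 * Usage sketch (illustrative; "sc_ro_percpu" is a hypothetical member of a
 * tunnel driver softc): the per-CPU cached route is only used under the lock
 * handed back by if_tunnel_get_ro() and released with if_tunnel_put_ro():
 *
 *	struct route *ro;
 *	kmutex_t *lock;
 *
 *	if_tunnel_get_ro(sc->sc_ro_percpu, &ro, &lock);
 *	rt = rtcache_lookup(ro, dst);
 *	... transmit via rt ...
 *	if_tunnel_put_ro(sc->sc_ro_percpu, lock);
 */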
static __inline if_index_t
if_get_index(const struct ifnet *ifp)
{
return ifp != NULL ? ifp->if_index : 0;
}
bool if_held(struct ifnet *);
void if_input(struct ifnet *, struct mbuf *);
struct if_percpuq *
if_percpuq_create(struct ifnet *);
void if_percpuq_destroy(struct if_percpuq *);
void
if_percpuq_enqueue(struct if_percpuq *, struct mbuf *);
void if_deferred_start_init(struct ifnet *, void (*)(struct ifnet *));
void if_schedule_deferred_start(struct ifnet *);
void ifa_insert(struct ifnet *, struct ifaddr *);
void ifa_remove(struct ifnet *, struct ifaddr *);
void ifa_psref_init(struct ifaddr *);
void ifa_acquire(struct ifaddr *, struct psref *);
void ifa_release(struct ifaddr *, struct psref *);
bool ifa_held(struct ifaddr *);
bool ifa_is_destroying(struct ifaddr *);
void ifaref(struct ifaddr *);
void ifafree(struct ifaddr *);
struct ifaddr *ifa_ifwithaddr(const struct sockaddr *);
struct ifaddr *ifa_ifwithaddr_psref(const struct sockaddr *, struct psref *);
struct ifaddr *ifa_ifwithaf(int);
struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *);
struct ifaddr *ifa_ifwithdstaddr_psref(const struct sockaddr *,
struct psref *);
struct ifaddr *ifa_ifwithnet(const struct sockaddr *);
struct ifaddr *ifa_ifwithnet_psref(const struct sockaddr *, struct psref *);
struct ifaddr *ifa_ifwithladdr(const struct sockaddr *);
struct ifaddr *ifa_ifwithladdr_psref(const struct sockaddr *, struct psref *);
struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *);
struct ifaddr *ifaof_ifpforaddr_psref(const struct sockaddr *, struct ifnet *,
struct psref *);
void link_rtrequest(int, struct rtentry *, const struct rt_addrinfo *);
void p2p_rtrequest(int, struct rtentry *, const struct rt_addrinfo *);
void if_clone_attach(struct if_clone *);
void if_clone_detach(struct if_clone *);
int if_transmit_lock(struct ifnet *, struct mbuf *);
int ifq_enqueue(struct ifnet *, struct mbuf *);
int ifq_enqueue2(struct ifnet *, struct ifqueue *, struct mbuf *);
int loioctl(struct ifnet *, u_long, void *);
void loopattach(int);
void loopinit(void);
int looutput(struct ifnet *,
struct mbuf *, const struct sockaddr *, const struct rtentry *);
void * if_linkstate_change_establish(struct ifnet *,
void (*)(void *), void *);
void if_linkstate_change_disestablish(struct ifnet *,
void *, kmutex_t *);
/*
* These are exported because they're an easy way to tell if
* an interface is going away without having to burn a flag.
*/
int if_nulloutput(struct ifnet *, struct mbuf *,
const struct sockaddr *, const struct rtentry *);
void if_nullinput(struct ifnet *, struct mbuf *);
void if_nullstart(struct ifnet *);
int if_nulltransmit(struct ifnet *, struct mbuf *);
int if_nullioctl(struct ifnet *, u_long, void *);
int if_nullinit(struct ifnet *);
void if_nullstop(struct ifnet *, int);
void if_nullslowtimo(struct ifnet *);
#define if_nullwatchdog if_nullslowtimo
void if_nulldrain(struct ifnet *);
#else
struct if_nameindex {
unsigned int if_index; /* 1, 2, ... */
char *if_name; /* null terminated name: "le0", ... */
};
#include <sys/cdefs.h>
__BEGIN_DECLS
unsigned int if_nametoindex(const char *);
char * if_indextoname(unsigned int, char *);
struct if_nameindex * if_nameindex(void);
void if_freenameindex(struct if_nameindex *);
__END_DECLS
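/*
 * Userland sketch (illustrative) of the name/index functions declared above:
 *
 *	unsigned int idx = if_nametoindex("wm0");
 *	if (idx == 0)
 *		err(EXIT_FAILURE, "if_nametoindex");
 */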
#endif /* _KERNEL */ /* XXX really ALTQ? */
#ifdef _KERNEL
#define IFADDR_FIRST(__ifp) TAILQ_FIRST(&(__ifp)->if_addrlist)
#define IFADDR_NEXT(__ifa) TAILQ_NEXT((__ifa), ifa_list)
#define IFADDR_FOREACH(__ifa, __ifp) TAILQ_FOREACH(__ifa, \
&(__ifp)->if_addrlist, ifa_list)
#define IFADDR_FOREACH_SAFE(__ifa, __ifp, __nifa) \
TAILQ_FOREACH_SAFE(__ifa, \
&(__ifp)->if_addrlist, ifa_list, __nifa)
#define IFADDR_EMPTY(__ifp) TAILQ_EMPTY(&(__ifp)->if_addrlist)
#define IFADDR_ENTRY_INIT(__ifa) \
PSLIST_ENTRY_INIT((__ifa), ifa_pslist_entry)
#define IFADDR_ENTRY_DESTROY(__ifa) \
PSLIST_ENTRY_DESTROY((__ifa), ifa_pslist_entry)
#define IFADDR_READER_EMPTY(__ifp) \
(PSLIST_READER_FIRST(&(__ifp)->if_addr_pslist, struct ifaddr, \
ifa_pslist_entry) == NULL)
#define IFADDR_READER_FIRST(__ifp) \
PSLIST_READER_FIRST(&(__ifp)->if_addr_pslist, struct ifaddr, \
ifa_pslist_entry)
#define IFADDR_READER_NEXT(__ifa) \
PSLIST_READER_NEXT((__ifa), struct ifaddr, ifa_pslist_entry)
#define IFADDR_READER_FOREACH(__ifa, __ifp) \
PSLIST_READER_FOREACH((__ifa), &(__ifp)->if_addr_pslist, struct ifaddr,\
ifa_pslist_entry)
#define IFADDR_WRITER_INSERT_HEAD(__ifp, __ifa) \
PSLIST_WRITER_INSERT_HEAD(&(__ifp)->if_addr_pslist, (__ifa), \
ifa_pslist_entry)
#define IFADDR_WRITER_REMOVE(__ifa) \
PSLIST_WRITER_REMOVE((__ifa), ifa_pslist_entry)
#define IFADDR_WRITER_FOREACH(__ifa, __ifp) \
PSLIST_WRITER_FOREACH((__ifa), &(__ifp)->if_addr_pslist, struct ifaddr,\
ifa_pslist_entry)
#define IFADDR_WRITER_NEXT(__ifp) \
PSLIST_WRITER_NEXT((__ifp), struct ifaddr, ifa_pslist_entry)
#define IFADDR_WRITER_INSERT_AFTER(__ifp, __new) \
PSLIST_WRITER_INSERT_AFTER((__ifp), (__new), ifa_pslist_entry)
#define IFADDR_WRITER_EMPTY(__ifp) \
(PSLIST_WRITER_FIRST(&(__ifp)->if_addr_pslist, struct ifaddr, \
ifa_pslist_entry) == NULL)
#define IFADDR_WRITER_INSERT_TAIL(__ifp, __new) \
do { \
if (IFADDR_WRITER_EMPTY(__ifp)) { \
IFADDR_WRITER_INSERT_HEAD((__ifp), (__new)); \
} else { \
struct ifaddr *__ifa; \
IFADDR_WRITER_FOREACH(__ifa, (__ifp)) { \
if (IFADDR_WRITER_NEXT(__ifa) == NULL) {\
IFADDR_WRITER_INSERT_AFTER(__ifa,\
(__new)); \
break; \
} \
} \
} \
} while (0)
#define IFNET_GLOBAL_LOCK() mutex_enter(&ifnet_mtx)
#define IFNET_GLOBAL_UNLOCK() mutex_exit(&ifnet_mtx)
#define IFNET_GLOBAL_LOCKED() mutex_owned(&ifnet_mtx)
#define IFNET_READER_EMPTY() \
(PSLIST_READER_FIRST(&ifnet_pslist, struct ifnet, if_pslist_entry) == NULL)
#define IFNET_READER_FIRST() \
PSLIST_READER_FIRST(&ifnet_pslist, struct ifnet, if_pslist_entry)
#define IFNET_READER_NEXT(__ifp) \
PSLIST_READER_NEXT((__ifp), struct ifnet, if_pslist_entry)
#define IFNET_READER_FOREACH(__ifp) \
PSLIST_READER_FOREACH((__ifp), &ifnet_pslist, struct ifnet, \
if_pslist_entry)
#define IFNET_WRITER_INSERT_HEAD(__ifp) \
PSLIST_WRITER_INSERT_HEAD(&ifnet_pslist, (__ifp), if_pslist_entry)
#define IFNET_WRITER_REMOVE(__ifp) \
PSLIST_WRITER_REMOVE((__ifp), if_pslist_entry)
#define IFNET_WRITER_FOREACH(__ifp) \
PSLIST_WRITER_FOREACH((__ifp), &ifnet_pslist, struct ifnet, \
if_pslist_entry)
#define IFNET_WRITER_NEXT(__ifp) \
PSLIST_WRITER_NEXT((__ifp), struct ifnet, if_pslist_entry)
#define IFNET_WRITER_INSERT_AFTER(__ifp, __new) \
PSLIST_WRITER_INSERT_AFTER((__ifp), (__new), if_pslist_entry)
#define IFNET_WRITER_EMPTY() \
(PSLIST_WRITER_FIRST(&ifnet_pslist, struct ifnet, if_pslist_entry) == NULL)
#define IFNET_WRITER_INSERT_TAIL(__new) \
do { \
if (IFNET_WRITER_EMPTY()) { \
IFNET_WRITER_INSERT_HEAD(__new); \
} else { \
struct ifnet *__ifp; \
IFNET_WRITER_FOREACH(__ifp) { \
if (IFNET_WRITER_NEXT(__ifp) == NULL) { \
IFNET_WRITER_INSERT_AFTER(__ifp,\
(__new)); \
break; \
} \
} \
} \
} while (0)
#define IFNET_LOCK(ifp) mutex_enter((ifp)->if_ioctl_lock)
#define IFNET_UNLOCK(ifp) mutex_exit((ifp)->if_ioctl_lock)
#define IFNET_LOCKED(ifp) mutex_owned((ifp)->if_ioctl_lock)
#define IFNET_ASSERT_UNLOCKED(ifp) \
KDASSERT(mutex_ownable((ifp)->if_ioctl_lock))
extern struct pslist_head ifnet_pslist;
extern kmutex_t ifnet_mtx;
extern struct ifnet *lo0ifp;
/*
* ifq sysctl support
*/
int sysctl_ifq(int *name, u_int namelen, void *oldp,
size_t *oldlenp, void *newp, size_t newlen,
struct ifqueue *ifq);
/* symbolic names for terminal (per-protocol) CTL_IFQ_ nodes */
#define IFQCTL_LEN 1
#define IFQCTL_MAXLEN 2
#define IFQCTL_PEAK 3
#define IFQCTL_DROPS 4
/*
* Hook for if_vlan - needed by if_agr
*/
MODULE_HOOK(if_vlan_vlan_input_hook,
struct mbuf *, (struct ifnet *, struct mbuf *));
#endif /* _KERNEL */
#endif /* !_NET_IF_H_ */
/* $NetBSD: uvm_fault.c,v 1.237 2024/03/15 07:09:37 andvar Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_fault.c,v 1.1.2.23 1998/02/06 05:29:05 chs Exp
*/
/*
* uvm_fault.c: fault handler
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_fault.c,v 1.237 2024/03/15 07:09:37 andvar Exp $");
#include "opt_uvmhist.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/mman.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_rndsource.h>
/*
*
* a word on page faults:
*
* types of page faults we handle:
*
* CASE 1: upper layer faults CASE 2: lower layer faults
*
* CASE 1A CASE 1B CASE 2A CASE 2B
* read/write1 write>1 read/write +-cow_write/zero
* | | | |
* +--|--+ +--|--+ +-----+ + | + | +-----+
* amap | V | | ---------> new | | | | ^ |
* +-----+ +-----+ +-----+ + | + | +--|--+
* | | |
* +-----+ +-----+ +--|--+ | +--|--+
* uobj | d/c | | d/c | | V | +----+ |
* +-----+ +-----+ +-----+ +-----+
*
* d/c = don't care
*
* case [0]: layerless fault
* no amap or uobj is present. this is an error.
*
* case [1]: upper layer fault [anon active]
* 1A: [read] or [write with anon->an_ref == 1]
* I/O takes place in upper level anon and uobj is not touched.
* 1B: [write with anon->an_ref > 1]
* new anon is alloc'd and data is copied off ["COW"]
*
* case [2]: lower layer fault [uobj]
* 2A: [read on non-NULL uobj] or [write to non-copy_on_write area]
* I/O takes place directly in object.
* 2B: [write to copy_on_write] or [read on NULL uobj]
* data is "promoted" from uobj to a new anon.
* if uobj is null, then we zero fill.
*
* we follow the standard UVM locking protocol ordering:
*
* MAPS => AMAP => UOBJ => ANON => PAGE QUEUES (PQ)
* we hold a PG_BUSY page if we unlock for I/O
*
*
* the code is structured as follows:
*
* - init the "IN" params in the ufi structure
* ReFault: (ERESTART returned to the loop in uvm_fault_internal)
* - do lookups [locks maps], check protection, handle needs_copy
* - check for case 0 fault (error)
* - establish "range" of fault
* - if we have an amap lock it and extract the anons
* - if sequential advice deactivate pages behind us
* - at the same time check pmap for unmapped areas and anon for pages
* that we could map in (and do map it if found)
* - check object for resident pages that we could map in
* - if (case 2) goto Case2
* - >>> handle case 1
* - ensure source anon is resident in RAM
* - if case 1B alloc new anon and copy from source
* - map the correct page in
* Case2:
* - >>> handle case 2
* - ensure source page is resident (if uobj)
* - if case 2B alloc new anon and copy from source (could be zero
* fill if uobj == NULL)
* - map the correct page in
* - done!
*
* note on paging:
* if we have to do I/O we place a PG_BUSY page in the correct object,
* unlock everything, and do the I/O. when I/O is done we must reverify
* the state of the world before assuming that our data structures are
* valid. [because mappings could change while the map is unlocked]
*
* alternative 1: unbusy the page in question and restart the page fault
* from the top (ReFault). this is easy but does not take advantage
* of the information that we already have from our previous lookup,
* although it is possible that the "hints" in the vm_map will help here.
*
* alternative 2: the system already keeps track of a "version" number of
* a map. [i.e. every time you write-lock a map (e.g. to change a
* mapping) you bump the version number up by one...] so, we can save
* the version number of the map before we release the lock and start I/O.
* then when I/O is done we can relock and check the version numbers
* to see if anything changed. this might save us something over
* alternative 1 because we don't have to unbusy the page and there
* may be fewer compares(?).
*
* alternative 3: put in backpointers or a way to "hold" part of a map
* in place while I/O is in progress. this could be complex to
* implement (especially with structures like amap that can be referenced
* by multiple map entries, and figuring out what should wait could be
* complex as well...).
*
* we use alternative 2. given that we are multi-threaded now we may want
* to reconsider the choice.
*/
/*
* local data structures
*/
struct uvm_advice {
int advice;
int nback;
int nforw;
};
/*
* page range array:
* note: index in array must match "advice" value
* XXX: borrowed numbers from freebsd. do they work well for us?
*/
static const struct uvm_advice uvmadvice[] = {
{ UVM_ADV_NORMAL, 3, 4 },
{ UVM_ADV_RANDOM, 0, 0 },
{ UVM_ADV_SEQUENTIAL, 8, 7},
};
#define UVM_MAXRANGE 16 /* must be MAX() of nback+nforw+1 */
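/*
 * Illustrative addition (not in the original source): a compile-time check
 * that UVM_MAXRANGE covers the largest nback + nforw + 1 in uvmadvice[],
 * i.e. 8 + 7 + 1 for UVM_ADV_SEQUENTIAL, could be written as
 *
 *	__CTASSERT(UVM_MAXRANGE >= 8 + 7 + 1);
 */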
/*
* private prototypes
*/
/*
* inline functions
*/
/*
* uvmfault_anonflush: try and deactivate pages in specified anons
*
* => does not have to deactivate page if it is busy
*/
static inline void
uvmfault_anonflush(struct vm_anon **anons, int n)
{
int lcv;
struct vm_page *pg;
for (lcv = 0; lcv < n; lcv++) {
if (anons[lcv] == NULL)
continue;
KASSERT(rw_lock_held(anons[lcv]->an_lock));
pg = anons[lcv]->an_page;
if (pg && (pg->flags & PG_BUSY) == 0) {
uvm_pagelock(pg);
uvm_pagedeactivate(pg);
uvm_pageunlock(pg);
}
}
}
/*
* normal functions
*/
/*
* uvmfault_amapcopy: clear "needs_copy" in a map.
*
* => called with VM data structures unlocked (usually, see below)
* => we get a write lock on the maps and clear needs_copy for a VA
* => if we are out of RAM we sleep (waiting for more)
*/
static void
uvmfault_amapcopy(struct uvm_faultinfo *ufi)
{
for (;;) {
/*
* no mapping? give up.
*/
if (uvmfault_lookup(ufi, true) == false)
return;
/*
* copy if needed.
*/
if (UVM_ET_ISNEEDSCOPY(ufi->entry))
amap_copy(ufi->map, ufi->entry, AMAP_COPY_NOWAIT,
ufi->orig_rvaddr, ufi->orig_rvaddr + 1);
/*
* didn't work? must be out of RAM. unlock and sleep.
*/
if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
uvmfault_unlockmaps(ufi, true);
uvm_wait("fltamapcopy");
continue;
}
/*
* got it! unlock and return.
*/
uvmfault_unlockmaps(ufi, true);
return;
}
/*NOTREACHED*/
}
/*
* uvmfault_anonget: get data in an anon into a non-busy, non-released
* page in that anon.
*
* => Map, amap and thus anon should be locked by caller.
* => If we fail, we unlock everything and error is returned.
* => If we are successful, return with everything still locked.
* => We do not move the page on the queues [gets moved later]. If we
* allocate a new page [we_own], it gets put on the queues. Either way,
* the result is that the page is on the queues at return time
* => For pages which are on loan from a uvm_object (and thus are not owned
* by the anon): if successful, return with the owning object locked.
* The caller must unlock this object when it unlocks everything else.
*/
int
uvmfault_anonget(struct uvm_faultinfo *ufi, struct vm_amap *amap,
struct vm_anon *anon)
{
struct vm_page *pg;
krw_t lock_type;
int error __unused; /* used for VMSWAP */
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(rw_lock_held(anon->an_lock));
KASSERT(anon->an_lock == amap->am_lock);
/* Increment the counters.*/
cpu_count(CPU_COUNT_FLTANGET, 1);
if (anon->an_page) {
curlwp->l_ru.ru_minflt++;
} else {
curlwp->l_ru.ru_majflt++;
}
error = 0;
/*
* Loop until we get the anon data, or fail.
*/
for (;;) {
bool we_own, locked;
/*
* Note: 'we_own' will become true if we set PG_BUSY on a page.
*/
we_own = false;
pg = anon->an_page;
/*
* If there is a resident page and it is loaned, then anon
* may not own it. Call out to uvm_anon_lockloanpg() to
* identify and lock the real owner of the page.
*/
if (pg && pg->loan_count)
pg = uvm_anon_lockloanpg(anon);
/*
* Is page resident? Make sure it is not busy/released.
*/
lock_type = rw_lock_op(anon->an_lock);
if (pg) {
/*
* at this point, if the page has a uobject [meaning
* we have it on loan], then that uobject is locked
* by us! if the page is busy, we drop all the
* locks (including uobject) and try again.
*/
if ((pg->flags & PG_BUSY) == 0) {
UVMHIST_LOG(maphist, "<- OK",0,0,0,0);
return 0;
}
cpu_count(CPU_COUNT_FLTPGWAIT, 1);
/*
* The last unlock must be an atomic unlock and wait
* on the owner of page.
*/
if (pg->uobject) {
/* Owner of page is UVM object. */
uvmfault_unlockall(ufi, amap, NULL);
UVMHIST_LOG(maphist, " unlock+wait on uobj",0,
0,0,0);
uvm_pagewait(pg, pg->uobject->vmobjlock, "anonget1");
} else {
/* Owner of page is anon. */
uvmfault_unlockall(ufi, NULL, NULL);
UVMHIST_LOG(maphist, " unlock+wait on anon",0,
0,0,0);
uvm_pagewait(pg, anon->an_lock, "anonget2");
}
} else {
#if defined(VMSWAP)
/*
* No page, therefore allocate one. A write lock is
* required for this. If the caller didn't supply
* one, fail now and have them retry.
*/
if (lock_type == RW_READER) {
return ENOLCK;
}
pg = uvm_pagealloc(NULL,
ufi != NULL ? ufi->orig_rvaddr : 0,
anon, ufi != NULL ? UVM_FLAG_COLORMATCH : 0);
if (pg == NULL) {
/* Out of memory. Wait a little. */
uvmfault_unlockall(ufi, amap, NULL);
cpu_count(CPU_COUNT_FLTNORAM, 1);
UVMHIST_LOG(maphist, " noram -- UVM_WAIT",0,
0,0,0);
if (!uvm_reclaimable()) {
return ENOMEM;
}
uvm_wait("flt_noram1");
} else {
/* PG_BUSY bit is set. */
we_own = true;
uvmfault_unlockall(ufi, amap, NULL);
/*
* Pass a PG_BUSY+PG_FAKE clean page into
* the uvm_swap_get() function with all data
* structures unlocked. Note that it is OK
* to read an_swslot here, because we hold
* PG_BUSY on the page.
*/
cpu_count(CPU_COUNT_PAGEINS, 1);
error = uvm_swap_get(pg, anon->an_swslot,
PGO_SYNCIO);
/*
* We clean up after the I/O below in the
* 'we_own' case.
*/
}
#else
panic("%s: no page", __func__);
#endif /* defined(VMSWAP) */
}
/*
* Re-lock the map and anon.
*/
locked = uvmfault_relock(ufi);
if (locked || we_own) {
rw_enter(anon->an_lock, lock_type);
}
/*
* If we own the page (i.e. we set PG_BUSY), then we need
* to clean up after the I/O. There are three cases to
* consider:
*
* 1) Page was released during I/O: free anon and ReFault.
* 2) I/O not OK. Free the page and cause the fault to fail.
* 3) I/O OK! Activate the page and sync with the non-we_own
* case (i.e. drop anon lock if not locked).
*/
if (we_own) {
KASSERT(lock_type == RW_WRITER);
#if defined(VMSWAP)
if (error) {
/*
* Remove the swap slot from the anon and
* mark the anon as having no real slot.
* Do not free the swap slot, thus preventing
* it from being used again.
*/
if (anon->an_swslot > 0) {
uvm_swap_markbad(anon->an_swslot, 1);
}
anon->an_swslot = SWSLOT_BAD;
if ((pg->flags & PG_RELEASED) != 0) {
goto released;
}
/*
* Note: page was never !PG_BUSY, so it
* cannot be mapped and thus no need to
* pmap_page_protect() it.
*/
uvm_pagefree(pg);
if (locked) {
uvmfault_unlockall(ufi, NULL, NULL);
}
rw_exit(anon->an_lock);
UVMHIST_LOG(maphist, "<- ERROR", 0,0,0,0);
return error;
}
if ((pg->flags & PG_RELEASED) != 0) {
released:
KASSERT(anon->an_ref == 0);
/*
* Released while we had unlocked amap.
*/
if (locked) {
uvmfault_unlockall(ufi, NULL, NULL);
}
uvm_anon_release(anon);
if (error) {
UVMHIST_LOG(maphist,
"<- ERROR/RELEASED", 0,0,0,0);
return error;
}
UVMHIST_LOG(maphist, "<- RELEASED", 0,0,0,0);
return ERESTART;
}
/*
* We have successfully read the page, activate it.
*/
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~(PG_BUSY|PG_FAKE);
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
UVM_PAGE_OWN(pg, NULL);
#else
panic("%s: we_own", __func__);
#endif /* defined(VMSWAP) */
}
/*
* We were not able to re-lock the map - restart the fault.
*/
if (!locked) {
if (we_own) {
rw_exit(anon->an_lock);
}
UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0);
return ERESTART;
}
/*
* Verify that no one has touched the amap and moved
* the anon on us.
*/
if (ufi != NULL && amap_lookup(&ufi->entry->aref,
ufi->orig_rvaddr - ufi->entry->start) != anon) {
uvmfault_unlockall(ufi, amap, NULL);
UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0);
return ERESTART;
}
/*
* Retry..
*/
cpu_count(CPU_COUNT_FLTANRETRY, 1);
continue;
}
/*NOTREACHED*/
}
/*
* uvmfault_promote: promote data to a new anon. used for 1B and 2B.
*
* 1. allocate an anon and a page.
* 2. fill its contents.
* 3. put it into amap.
*
* => if we fail (result != 0) we unlock everything.
* => on success, return a new locked anon via 'nanon'.
* (*nanon)->an_page will be a resident, locked, dirty page.
* => it is the caller's responsibility to put the promoted nanon->an_page on the
* page queue.
*/
static int
uvmfault_promote(struct uvm_faultinfo *ufi,
struct vm_anon *oanon,
struct vm_page *uobjpage,
struct vm_anon **nanon, /* OUT: allocated anon */
struct vm_anon **spare)
{
struct vm_amap *amap = ufi->entry->aref.ar_amap;
struct uvm_object *uobj;
struct vm_anon *anon;
struct vm_page *pg;
struct vm_page *opg;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (oanon) {
/* anon COW */
opg = oanon->an_page;
KASSERT(opg != NULL);
KASSERT(opg->uobject == NULL || opg->loan_count > 0);
} else if (uobjpage != PGO_DONTCARE) {
/* object-backed COW */
opg = uobjpage;
KASSERT(rw_lock_held(opg->uobject->vmobjlock));
} else {
/* ZFOD */
opg = NULL;
}
if (opg != NULL) {
uobj = opg->uobject;
} else {
uobj = NULL;
}
KASSERT(amap != NULL);
KASSERT(uobjpage != NULL);
KASSERT(rw_write_held(amap->am_lock));
KASSERT(oanon == NULL || amap->am_lock == oanon->an_lock);
KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
if (*spare != NULL) {
anon = *spare;
*spare = NULL;
} else {
anon = uvm_analloc();
}
if (anon) {
/*
* The new anon is locked.
*
* if opg == NULL, we want a zero'd, dirty page,
* so have uvm_pagealloc() do that for us.
*/
KASSERT(anon->an_lock == NULL);
anon->an_lock = amap->am_lock;
pg = uvm_pagealloc(NULL, ufi->orig_rvaddr, anon,
UVM_FLAG_COLORMATCH | (opg == NULL ? UVM_PGA_ZERO : 0));
if (pg == NULL) {
anon->an_lock = NULL;
}
} else {
pg = NULL;
}
/*
* out of memory resources?
*/
if (pg == NULL) {
/* save anon for the next try. */
if (anon != NULL) {
*spare = anon;
}
/* unlock and fail ... */
uvmfault_unlockall(ufi, amap, uobj);
if (!uvm_reclaimable()) {
UVMHIST_LOG(maphist, "out of VM", 0,0,0,0);
cpu_count(CPU_COUNT_FLTNOANON, 1);
error = ENOMEM;
goto done;
}
UVMHIST_LOG(maphist, "out of RAM, waiting for more", 0,0,0,0);
cpu_count(CPU_COUNT_FLTNORAM, 1);
uvm_wait("flt_noram5");
error = ERESTART;
goto done;
}
/*
* copy the page [pg now dirty]
*
* Remove the pmap entry now for the old page at this address
* so that no thread can modify the new page while any thread
* might still see the old page.
*/
if (opg) {
pmap_remove(vm_map_pmap(ufi->orig_map), ufi->orig_rvaddr,
ufi->orig_rvaddr + PAGE_SIZE);
pmap_update(vm_map_pmap(ufi->orig_map));
uvm_pagecopy(opg, pg);
}
KASSERT(uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_DIRTY);
amap_add(&ufi->entry->aref, ufi->orig_rvaddr - ufi->entry->start, anon,
oanon != NULL);
/*
* from this point on am_lock won't be dropped until the page is
* entered, so it's safe to unbusy the page up front.
*
* uvm_fault_{upper,lower}_done will activate or enqueue the page.
*/
pg = anon->an_page;
pg->flags &= ~(PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(pg, NULL);
*nanon = anon;
error = 0;
done:
return error;
}
/*
* Update statistics after fault resolution.
* - maxrss
*/
void
uvmfault_update_stats(struct uvm_faultinfo *ufi)
{
struct vm_map *map;
struct vmspace *vm;
struct proc *p;
vsize_t res;
map = ufi->orig_map;
p = curproc;
KASSERT(p != NULL);
vm = p->p_vmspace;
if (&vm->vm_map != map)
return;
res = pmap_resident_count(map->pmap);
if (vm->vm_rssmax < res)
vm->vm_rssmax = res;
}
/*
* F A U L T - m a i n e n t r y p o i n t
*/
/*
* uvm_fault: page fault handler
*
* => called from MD code to resolve a page fault
* => VM data structures usually should be unlocked. however, it is
* possible to call here with the main map locked if the caller
* gets a write lock, sets it recursive, and then calls us (c.f.
* uvm_map_pageable). this should be avoided because it keeps
* the map locked off during I/O.
* => MUST NEVER BE CALLED IN INTERRUPT CONTEXT
*/
#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
~VM_PROT_WRITE : VM_PROT_ALL)
/* fault_flag values passed from uvm_fault_wire to uvm_fault_internal */
#define UVM_FAULT_WIRE (1 << 0)
#define UVM_FAULT_MAXPROT (1 << 1)
struct uvm_faultctx {
/*
* the following members are set up by uvm_fault_check() and
* read-only after that.
*
* note that narrow is used by uvm_fault_check() to change
* the behaviour after ERESTART.
*
* most of them might change after ERESTART if the underlying
* map entry has been changed behind us. an exception is
* wire_paging, which never changes.
*/
vm_prot_t access_type;
vaddr_t startva;
int npages;
int centeridx;
bool narrow; /* work on a single requested page only */
bool wire_mapping; /* request a PMAP_WIRED mapping
(UVM_FAULT_WIRE or VM_MAPENT_ISWIRED) */
bool wire_paging; /* request uvm_pagewire
(true for UVM_FAULT_WIRE) */
bool cow_now; /* VM_PROT_WRITE is actually requested
(ie. should break COW and page loaning) */
/*
* enter_prot is set up by uvm_fault_check() and clamped
* (ie. drop the VM_PROT_WRITE bit) in various places in case
* of !cow_now.
*/
vm_prot_t enter_prot; /* prot at which we want to enter pages in */
/*
* the following member is for uvmfault_promote() and ERESTART.
*/
struct vm_anon *anon_spare;
/*
* the following is actually a uvm_fault_lower() internal.
* it's here merely for debugging.
* (or due to the mechanical separation of the function?)
*/
bool promote;
/*
* type of lock to acquire on objects in both layers.
*/
krw_t lower_lock_type;
krw_t upper_lock_type;
};
static inline int uvm_fault_check(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_anon ***, bool);
static int uvm_fault_upper(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_anon **);
static inline int uvm_fault_upper_lookup(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct vm_anon **, struct vm_page **);
static inline void uvm_fault_upper_neighbor(
struct uvm_faultinfo *, const struct uvm_faultctx *,
vaddr_t, struct vm_page *, bool);
static inline int uvm_fault_upper_loan(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_anon *, struct uvm_object **);
static inline int uvm_fault_upper_promote(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_anon *);
static inline int uvm_fault_upper_direct(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_anon *);
static int uvm_fault_upper_enter(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct uvm_object *, struct vm_anon *,
struct vm_page *, struct vm_anon *);
static inline void uvm_fault_upper_done(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct vm_anon *, struct vm_page *);
static int uvm_fault_lower(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_page **);
static inline void uvm_fault_lower_lookup(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct vm_page **);
static inline void uvm_fault_lower_neighbor(
struct uvm_faultinfo *, const struct uvm_faultctx *,
vaddr_t, struct vm_page *);
static inline int uvm_fault_lower_io(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object **, struct vm_page **);
static inline int uvm_fault_lower_direct(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_page *);
static inline int uvm_fault_lower_direct_loan(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_page **,
struct vm_page **);
static inline int uvm_fault_lower_promote(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_page *);
static int uvm_fault_lower_enter(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct uvm_object *,
struct vm_anon *, struct vm_page *);
static inline void uvm_fault_lower_done(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct uvm_object *, struct vm_page *);
int
uvm_fault_internal(struct vm_map *orig_map, vaddr_t vaddr,
vm_prot_t access_type, int fault_flag)
{
struct uvm_faultinfo ufi;
struct uvm_faultctx flt = {
.access_type = access_type,
/* don't look for neighborhood pages on "wire" fault */
.narrow = (fault_flag & UVM_FAULT_WIRE) != 0,
/* "wire" fault causes wiring of both mapping and paging */
.wire_mapping = (fault_flag & UVM_FAULT_WIRE) != 0,
.wire_paging = (fault_flag & UVM_FAULT_WIRE) != 0,
/*
* default lock type to acquire on upper & lower layer
* objects: reader. this can be upgraded at any point
* during the fault from read -> write and uvm_faultctx
* changed to match, but is never downgraded write -> read.
*/
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
.upper_lock_type = RW_WRITER,
.lower_lock_type = RW_WRITER,
#else
.upper_lock_type = RW_READER,
.lower_lock_type = RW_READER,
#endif
};
const bool maxprot = (fault_flag & UVM_FAULT_MAXPROT) != 0;
struct vm_anon *anons_store[UVM_MAXRANGE], **anons;
struct vm_page *pages_store[UVM_MAXRANGE], **pages;
int error;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, vaddr=%#jx, at=%jd, ff=%jd)",
(uintptr_t)orig_map, vaddr, access_type, fault_flag);
/* Don't count anything until user interaction is possible */
kpreempt_disable();
if (__predict_true(start_init_exec)) {
struct cpu_info *ci = curcpu();
CPU_COUNT(CPU_COUNT_NFAULT, 1);
/* Don't flood RNG subsystem with samples. */
if (++(ci->ci_faultrng) == 503) {
ci->ci_faultrng = 0;
rnd_add_uint32(&uvm_fault_rndsource,
sizeof(vaddr_t) == sizeof(uint32_t) ?
(uint32_t)vaddr : sizeof(vaddr_t) ==
sizeof(uint64_t) ?
(uint32_t)vaddr :
(uint32_t)ci->ci_counts[CPU_COUNT_NFAULT]);
}
}
kpreempt_enable();
/*
* init the IN parameters in the ufi
*/
ufi.orig_map = orig_map;
ufi.orig_rvaddr = trunc_page(vaddr);
ufi.orig_size = PAGE_SIZE; /* can't get any smaller than this */
error = ERESTART;
while (error == ERESTART) { /* ReFault: */
anons = anons_store;
pages = pages_store;
error = uvm_fault_check(&ufi, &flt, &anons, maxprot);
if (error != 0)
continue;
error = uvm_fault_upper_lookup(&ufi, &flt, anons, pages);
if (error != 0)
continue;
if (pages[flt.centeridx] == PGO_DONTCARE)
error = uvm_fault_upper(&ufi, &flt, anons);
else {
struct uvm_object * const uobj =
ufi.entry->object.uvm_obj;
if (uobj && uobj->pgops->pgo_fault != NULL) {
/*
* invoke "special" fault routine.
*/
rw_enter(uobj->vmobjlock, RW_WRITER);
/* locked: maps(read), amap(if there), uobj */
error = uobj->pgops->pgo_fault(&ufi,
flt.startva, pages, flt.npages,
flt.centeridx, flt.access_type,
PGO_LOCKED|PGO_SYNCIO);
/*
* locked: nothing, pgo_fault has unlocked
* everything
*/
/*
* object fault routine responsible for
* pmap_update().
*/
/*
* Wake up the pagedaemon if the fault method
* failed for lack of memory but some can be
* reclaimed.
*/
if (error == ENOMEM && uvm_reclaimable()) {
uvm_wait("pgo_fault");
error = ERESTART;
}
} else {
error = uvm_fault_lower(&ufi, &flt, pages);
}
}
}
if (flt.anon_spare != NULL) {
flt.anon_spare->an_ref--;
KASSERT(flt.anon_spare->an_ref == 0);
KASSERT(flt.anon_spare->an_lock == NULL);
uvm_anfree(flt.anon_spare);
}
return error;
}
/*
* uvm_fault_check: check prot, handle needs-copy, etc.
*
* 1. lookup entry.
* 2. check protection.
* 3. adjust fault condition (mainly for simulated fault).
* 4. handle needs-copy (lazy amap copy).
* 5. establish range of interest for neighbor fault (aka pre-fault).
* 6. look up anons (if amap exists).
* 7. flush pages (if MADV_SEQUENTIAL)
*
* => called with nothing locked.
* => if we fail (result != 0) we unlock everything.
* => initialize/adjust many members of flt.
*/
static int
uvm_fault_check(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_anon ***ranons, bool maxprot)
{
struct vm_amap *amap;
struct uvm_object *uobj;
vm_prot_t check_prot;
int nback, nforw;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* lookup and lock the maps
*/
if (uvmfault_lookup(ufi, false) == false) {
UVMHIST_LOG(maphist, "<- no mapping @ %#jx", ufi->orig_rvaddr,
0,0,0);
return EFAULT;
}
/* locked: maps(read) */
#ifdef DIAGNOSTIC
if ((ufi->map->flags & VM_MAP_PAGEABLE) == 0) {
printf("Page fault on non-pageable map:\n");
printf("ufi->map = %p\n", ufi->map);
printf("ufi->orig_map = %p\n", ufi->orig_map);
printf("ufi->orig_rvaddr = %#lx\n", (u_long) ufi->orig_rvaddr);
panic("uvm_fault: (ufi->map->flags & VM_MAP_PAGEABLE) == 0");
}
#endif
/*
* check protection
*/
check_prot = maxprot ?
ufi->entry->max_protection : ufi->entry->protection;
if ((check_prot & flt->access_type) != flt->access_type) {
UVMHIST_LOG(maphist,
"<- protection failure (prot=%#jx, access=%#jx)",
ufi->entry->protection, flt->access_type, 0, 0);
uvmfault_unlockmaps(ufi, false);
return EFAULT;
}
/*
* "enter_prot" is the protection we want to enter the page in at.
* for certain pages (e.g. copy-on-write pages) this protection can
* be more strict than ufi->entry->protection. "wired" means either
* the entry is wired or we are fault-wiring the pg.
*/
flt->enter_prot = ufi->entry->protection;
if (VM_MAPENT_ISWIRED(ufi->entry)) {
flt->wire_mapping = true;
flt->wire_paging = true;
flt->narrow = true;
}
if (flt->wire_mapping) {
flt->access_type = flt->enter_prot; /* full access for wired */
flt->cow_now = (check_prot & VM_PROT_WRITE) != 0;
} else {
flt->cow_now = (flt->access_type & VM_PROT_WRITE) != 0;
}
if (flt->wire_paging) {
/* wiring pages requires a write lock. */
flt->upper_lock_type = RW_WRITER;
flt->lower_lock_type = RW_WRITER;
}
flt->promote = false;
/*
* handle "needs_copy" case. if we need to copy the amap we will
* have to drop our readlock and relock it with a write lock. (we
* need a write lock to change anything in a map entry [e.g.
* needs_copy]).
*/
if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
if (flt->cow_now || (ufi->entry->object.uvm_obj == NULL)) {
KASSERT(!maxprot);
/* need to clear */
UVMHIST_LOG(maphist,
" need to clear needs_copy and refault",0,0,0,0);
uvmfault_unlockmaps(ufi, false);
uvmfault_amapcopy(ufi);
cpu_count(CPU_COUNT_FLTAMCOPY, 1);
return ERESTART;
} else {
/*
* ensure that we pmap_enter page R/O since
* needs_copy is still true
*/
flt->enter_prot &= ~VM_PROT_WRITE;
}
}
/*
* identify the players
*/
amap = ufi->entry->aref.ar_amap; /* upper layer */
uobj = ufi->entry->object.uvm_obj; /* lower layer */
/*
* check for a case 0 fault. if nothing backing the entry then
* error now.
*/
if (amap == NULL && uobj == NULL) {
uvmfault_unlockmaps(ufi, false);
UVMHIST_LOG(maphist,"<- no backing store, no overlay",0,0,0,0);
return EFAULT;
}
/*
* for a case 2B fault waste no time on adjacent pages because
* they are likely already entered.
*/
if (uobj != NULL && amap != NULL &&
(flt->access_type & VM_PROT_WRITE) != 0) {
/* wide fault (!narrow) */
flt->narrow = true;
}
/*
* establish range of interest based on advice from mapper
* and then clip to fit map entry. note that we only want
* to do this the first time through the fault. if we
* ReFault we will disable this by setting "narrow" to true.
*/
if (flt->narrow == false) {
/* wide fault (!narrow) */
KASSERT(uvmadvice[ufi->entry->advice].advice ==
ufi->entry->advice);
nback = MIN(uvmadvice[ufi->entry->advice].nback,
(ufi->orig_rvaddr - ufi->entry->start) >> PAGE_SHIFT);
flt->startva = ufi->orig_rvaddr - (nback << PAGE_SHIFT);
/*
* note: "-1" because we don't want to count the
* faulting page as forw
*/
nforw = MIN(uvmadvice[ufi->entry->advice].nforw,
((ufi->entry->end - ufi->orig_rvaddr) >>
PAGE_SHIFT) - 1);
flt->npages = nback + nforw + 1;
flt->centeridx = nback;
flt->narrow = true; /* ensure only once per-fault */
} else {
/* narrow fault! */
nback = nforw = 0;
flt->startva = ufi->orig_rvaddr;
flt->npages = 1;
flt->centeridx = 0;
}
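/*
 * either way the range of interest now spans npages = nback +
 * nforw + 1 pages starting at startva, with the faulting page at
 * index centeridx (== nback, or 0 for a narrow fault).
 */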
/* offset from entry's start to pgs' start */
const voff_t eoff = flt->startva - ufi->entry->start;
/* locked: maps(read) */
UVMHIST_LOG(maphist, " narrow=%jd, back=%jd, forw=%jd, startva=%#jx",
flt->narrow, nback, nforw, flt->startva);
UVMHIST_LOG(maphist, " entry=%#jx, amap=%#jx, obj=%#jx",
(uintptr_t)ufi->entry, (uintptr_t)amap, (uintptr_t)uobj, 0);
/*
* guess at the most suitable lock types to acquire.
* if we've got an amap then lock it and extract current anons.
*/
if (amap) {
if ((amap_flags(amap) & AMAP_SHARED) == 0) {
/*
* the amap isn't shared. get a writer lock to
* avoid the cost of upgrading the lock later if
* needed.
*
* XXX nice for PostgreSQL, but consider threads.
*/
flt->upper_lock_type = RW_WRITER;
} else if ((flt->access_type & VM_PROT_WRITE) != 0) {
/*
* assume we're about to COW.
*/
flt->upper_lock_type = RW_WRITER;
}
amap_lock(amap, flt->upper_lock_type);
amap_lookups(&ufi->entry->aref, eoff, *ranons, flt->npages);
} else {
if ((flt->access_type & VM_PROT_WRITE) != 0) {
/*
* we are about to dirty the object and that
* requires a write lock.
*/
flt->lower_lock_type = RW_WRITER;
}
*ranons = NULL; /* to be safe */
}
/* locked: maps(read), amap(if there) */
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
/*
* for MADV_SEQUENTIAL mappings we want to deactivate the back pages
* now and then forget about them (for the rest of the fault).
*/
if (ufi->entry->advice == MADV_SEQUENTIAL && nback != 0) {
UVMHIST_LOG(maphist, " MADV_SEQUENTIAL: flushing backpages",
0,0,0,0);
/* flush back-page anons? */
if (amap)
uvmfault_anonflush(*ranons, nback);
/*
* flush object? change lock type to RW_WRITER, to avoid
* excessive competition between read/write locks if many
* threads doing "sequential access".
*/
if (uobj) {
voff_t uoff;
flt->lower_lock_type = RW_WRITER;
uoff = ufi->entry->offset + eoff;
rw_enter(uobj->vmobjlock, RW_WRITER);
(void) (uobj->pgops->pgo_put)(uobj, uoff, uoff +
(nback << PAGE_SHIFT), PGO_DEACTIVATE);
}
/* now forget about the backpages */
if (amap)
*ranons += nback;
flt->startva += (nback << PAGE_SHIFT);
flt->npages -= nback;
flt->centeridx = 0;
}
/*
* => startva is fixed
* => npages is fixed
*/
KASSERT(flt->startva <= ufi->orig_rvaddr);
KASSERT(ufi->orig_rvaddr + ufi->orig_size <=
flt->startva + (flt->npages << PAGE_SHIFT));
return 0;
}
/*
* uvm_fault_upper_upgrade: upgrade upper lock, reader -> writer
*/
static inline int
uvm_fault_upper_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_amap *amap, struct uvm_object *uobj)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(amap != NULL);
KASSERT(flt->upper_lock_type == rw_lock_op(amap->am_lock));
/*
* fast path.
*/
if (__predict_true(flt->upper_lock_type == RW_WRITER)) {
return 0;
}
/*
* otherwise try for the upgrade. if we don't get it, unlock
* everything, restart the fault and next time around get a writer
* lock.
*/
flt->upper_lock_type = RW_WRITER;
if (__predict_false(!rw_tryupgrade(amap->am_lock))) {
uvmfault_unlockall(ufi, amap, uobj);
cpu_count(CPU_COUNT_FLTNOUP, 1);
UVMHIST_LOG(maphist, " !upgrade upper", 0, 0,0,0);
return ERESTART;
}
cpu_count(CPU_COUNT_FLTUP, 1);
KASSERT(flt->upper_lock_type == rw_lock_op(amap->am_lock));
return 0;
}
/*
* uvm_fault_upper_lookup: look up existing h/w mapping and amap.
*
* iterate range of interest:
* 1. check if h/w mapping exists. if yes, we don't care
* 2. check if anon exists. if not, page is lower.
* 3. if anon exists, enter h/w mapping for neighbors.
*
* => called with amap locked (if exists).
*/
static int
uvm_fault_upper_lookup(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct vm_anon **anons, struct vm_page **pages)
{
struct vm_amap *amap = ufi->entry->aref.ar_amap;
int lcv;
vaddr_t currva;
bool shadowed __unused;
bool entered;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: maps(read), amap(if there) */
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
/*
* map in the backpages and frontpages we found in the amap in hopes
* of preventing future faults. we also init the pages[] array as
* we go.
*/
currva = flt->startva;
shadowed = false;
entered = false;
for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
/*
* unmapped or center page. check if any anon at this level.
*/
if (amap == NULL || anons[lcv] == NULL) {
pages[lcv] = NULL;
continue;
}
/*
* check for present page and map if possible.
*/
pages[lcv] = PGO_DONTCARE;
if (lcv == flt->centeridx) { /* save center for later! */
shadowed = true;
continue;
}
struct vm_anon *anon = anons[lcv];
struct vm_page *pg = anon->an_page;
KASSERT(anon->an_lock == amap->am_lock);
/*
* ignore loaned and busy pages.
* don't play with VAs that are already mapped.
*/
if (pg && pg->loan_count == 0 && (pg->flags & PG_BUSY) == 0 &&
!pmap_extract(ufi->orig_map->pmap, currva, NULL)) {
uvm_fault_upper_neighbor(ufi, flt, currva,
pg, anon->an_ref > 1);
entered = true;
}
}
if (entered) {
pmap_update(ufi->orig_map->pmap);
}
/* locked: maps(read), amap(if there) */
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
/* (shadowed == true) if there is an anon at the faulting address */
UVMHIST_LOG(maphist, " shadowed=%jd, will_get=%jd", shadowed,
(ufi->entry->object.uvm_obj && shadowed != false),0,0);
return 0;
}
/*
* uvm_fault_upper_neighbor: enter single upper neighbor page.
*
* => called with amap and anon locked.
*/
static void
uvm_fault_upper_neighbor(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
vaddr_t currva, struct vm_page *pg, bool readonly)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: amap, anon */
KASSERT(pg->uobject == NULL);
KASSERT(pg->uanon != NULL);
KASSERT(rw_lock_op(pg->uanon->an_lock) == flt->upper_lock_type);
KASSERT(uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN);
/*
* there wasn't a direct fault on the page, so avoid the cost of
* activating it.
*/
if (!uvmpdpol_pageisqueued_p(pg) && pg->wire_count == 0) {
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pageunlock(pg);
}
UVMHIST_LOG(maphist,
" MAPPING: n anon: pm=%#jx, va=%#jx, pg=%#jx",
(uintptr_t)ufi->orig_map->pmap, currva, (uintptr_t)pg, 0);
cpu_count(CPU_COUNT_FLTNAMAP, 1);
/*
* Since this page isn't the page that's actually faulting,
* ignore pmap_enter() failures; it's not critical that we
* enter these right now.
*/
(void) pmap_enter(ufi->orig_map->pmap, currva,
VM_PAGE_TO_PHYS(pg),
readonly ? (flt->enter_prot & ~VM_PROT_WRITE) :
flt->enter_prot,
PMAP_CANFAIL | (flt->wire_mapping ? PMAP_WIRED : 0));
}
/*
* uvm_fault_upper: handle upper fault.
*
* 1. acquire anon lock.
* 2. get anon. let uvmfault_anonget do the dirty work.
* 3. handle loan.
* 4. dispatch direct or promote handlers.
*/
static int
uvm_fault_upper(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_anon **anons)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct vm_anon * const anon = anons[flt->centeridx];
struct uvm_object *uobj;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: maps(read), amap, anon */
KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(anon->an_lock == amap->am_lock);
/*
* handle case 1: fault on an anon in our amap
*/
UVMHIST_LOG(maphist, " case 1 fault: anon=%#jx",
(uintptr_t)anon, 0, 0, 0);
/*
* no matter if we have case 1A or case 1B we are going to need to
* have the anon's memory resident. ensure that now.
*/
/*
* let uvmfault_anonget do the dirty work.
* if it fails (!OK) it will unlock everything for us.
* if it succeeds, locks are still valid and locked.
* also, if it is OK, then the anon's page is on the queues.
* if the page is on loan from a uvm_object, then anonget will
* lock that object for us if it does not fail.
*/
retry:
error = uvmfault_anonget(ufi, amap, anon);
switch (error) {
case 0:
break;
case ERESTART:
return ERESTART;
case EAGAIN:
kpause("fltagain1", false, hz/2, NULL);
return ERESTART;
case ENOLCK:
/* it needs a write lock: retry */
error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(amap->am_lock));
goto retry;
default:
return error;
}
/*
* uobj is non null if the page is on loan from an object (i.e. uobj)
*/
uobj = anon->an_page->uobject; /* locked by anonget if !NULL */
/* locked: maps(read), amap, anon, uobj(if one) */
KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(anon->an_lock == amap->am_lock);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* special handling for loaned pages
*/
if (anon->an_page->loan_count) {
error = uvm_fault_upper_loan(ufi, flt, anon, &uobj);
if (error != 0)
return error;
}
/*
* if we are case 1B then we will need to allocate a new blank
* anon to transfer the data into. note that we have a lock
* on anon, so no one can busy or release the page until we are done.
* also note that the ref count can't drop to zero here because
* it is > 1 and we are only dropping one ref.
*
* in the (hopefully very rare) case that we are out of RAM we
* will unlock, wait for more RAM, and refault.
*
* if we are out of anon VM we kill the process (XXX: could wait?).
*/
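/*
 * case 1B: cow'ing while the anon is shared (ref > 1), so promote
 * into a fresh private anon.
 * case 1A: the anon is exclusively ours, so map it in directly.
 */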
if (flt->cow_now && anon->an_ref > 1) {
flt->promote = true;
error = uvm_fault_upper_promote(ufi, flt, uobj, anon);
} else {
error = uvm_fault_upper_direct(ufi, flt, uobj, anon);
}
return error;
}
/*
* uvm_fault_upper_loan: handle loaned upper page.
*
* 1. if not cow'ing now, simply adjust flt->enter_prot.
* 2. if cow'ing now, and if ref count is 1, break loan.
*/
static int
uvm_fault_upper_loan(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_anon *anon, struct uvm_object **ruobj)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
int error = 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (!flt->cow_now) {
/*
* for read faults on loaned pages we just cap the
* protection at read-only.
*/
flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE;
} else {
/*
* note that we can't allow writes into a loaned page!
*
* if we have a write fault on a loaned page in an
* anon then we need to look at the anon's ref count.
* if it is greater than one then we are going to do
* a normal copy-on-write fault into a new anon (this
* is not a problem). however, if the reference count
* is one (a case where we would normally allow a
* write directly to the page) then we need to kill
* the loan before we continue.
*/
/* >1 case is already ok */
if (anon->an_ref == 1) {
/* breaking loan requires a write lock. */
error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(amap->am_lock));
error = uvm_loanbreak_anon(anon, *ruobj);
if (error != 0) {
uvmfault_unlockall(ufi, amap, *ruobj);
uvm_wait("flt_noram2");
return ERESTART;
}
/* if we were a loan receiver uobj is gone */
if (*ruobj)
*ruobj = NULL;
}
}
return error;
}
/*
* uvm_fault_upper_promote: promote upper page.
*
* 1. call uvmfault_promote.
* 2. enqueue page.
* 3. deref.
* 4. pass page to uvm_fault_upper_enter.
*/
static int
uvm_fault_upper_promote(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_anon *anon)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct vm_anon * const oanon = anon;
struct vm_page *pg;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist, " case 1B: COW fault",0,0,0,0);
/* promoting requires a write lock. */
error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(amap->am_lock));
cpu_count(CPU_COUNT_FLT_ACOW, 1);
error = uvmfault_promote(ufi, oanon, PGO_DONTCARE, &anon,
&flt->anon_spare);
switch (error) {
case 0:
break;
case ERESTART:
return ERESTART;
default:
return error;
}
pg = anon->an_page;
KASSERT(anon->an_lock == oanon->an_lock);
KASSERT((pg->flags & (PG_BUSY | PG_FAKE)) == 0);
/* deref: can not drop to zero here by defn! */
KASSERT(oanon->an_ref > 1);
oanon->an_ref--;
/*
* note: oanon is still locked, as is the new anon. we
* need to check for this later when we unlock oanon; if
* oanon != anon, we'll have to unlock anon, too.
*/
return uvm_fault_upper_enter(ufi, flt, uobj, anon, pg, oanon);
}
/*
* uvm_fault_upper_direct: handle direct fault.
*/
static int
uvm_fault_upper_direct(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_anon *anon)
{
struct vm_anon * const oanon = anon;
struct vm_page *pg;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
cpu_count(CPU_COUNT_FLT_ANON, 1);
pg = anon->an_page;
if (anon->an_ref > 1) /* disallow writes to ref > 1 anons */
flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE;
return uvm_fault_upper_enter(ufi, flt, uobj, anon, pg, oanon);
}
/*
* uvm_fault_upper_enter: enter h/w mapping of upper page.
*/
static int
uvm_fault_upper_enter(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_anon *anon, struct vm_page *pg,
struct vm_anon *oanon)
{
struct pmap *pmap = ufi->orig_map->pmap;
vaddr_t va = ufi->orig_rvaddr;
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: maps(read), amap, oanon, anon(if different from oanon) */
KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(anon->an_lock == amap->am_lock);
KASSERT(oanon->an_lock == amap->am_lock);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
KASSERT(uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN);
/*
* now map the page in.
*/
UVMHIST_LOG(maphist,
" MAPPING: anon: pm=%#jx, va=%#jx, pg=%#jx, promote=%jd",
(uintptr_t)pmap, va, (uintptr_t)pg, flt->promote);
if (pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg),
flt->enter_prot, flt->access_type | PMAP_CANFAIL |
(flt->wire_mapping ? PMAP_WIRED : 0)) != 0) {
/*
* If pmap_enter() fails, it must not leave behind an existing
* pmap entry. In particular, a now-stale entry for a different
* page would leave the pmap inconsistent with the vm_map.
* This is not to imply that pmap_enter() should remove an
* existing mapping in such a situation (since that could create
* different problems, eg. if the existing mapping is wired),
* but rather that the pmap should be designed such that it
* never needs to fail when the new mapping is replacing an
* existing mapping and the new page has no existing mappings.
*
* XXX This can't be asserted safely any more because many
* LWPs and/or many processes could simultaneously fault on
* the same VA and some might succeed.
*/
/* KASSERT(!pmap_extract(pmap, va, NULL)); */
/*
* ensure that the page is queued in the case that
* we just promoted.
*/
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pageunlock(pg);
/*
* No need to undo what we did; we can simply think of
* this as the pmap throwing away the mapping information.
*
* We do, however, have to go through the ReFault path,
* as the map may change while we're asleep.
*/
uvmfault_unlockall(ufi, amap, uobj);
if (!uvm_reclaimable()) {
UVMHIST_LOG(maphist,
"<- failed. out of VM",0,0,0,0);
/* XXX instrumentation */
return ENOMEM;
}
/* XXX instrumentation */
uvm_wait("flt_pmfail1");
return ERESTART;
}
uvm_fault_upper_done(ufi, flt, anon, pg);
/*
* done case 1! finish up by unlocking everything and returning success
*/
pmap_update(pmap);
uvmfault_unlockall(ufi, amap, uobj);
return 0;
}
/*
* uvm_fault_upper_done: queue upper center page.
*/
static void
uvm_fault_upper_done(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct vm_anon *anon, struct vm_page *pg)
{
const bool wire_paging = flt->wire_paging;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* ... update the page queues.
*/
if (wire_paging) {
uvm_pagelock(pg);
uvm_pagewire(pg);
uvm_pageunlock(pg);
/*
* since the now-wired page cannot be paged out,
* release its swap resources for others to use.
* and since an anon with no swap cannot be clean,
* mark it dirty now.
*/
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
uvm_anon_dropswap(anon);
} else if (uvmpdpol_pageactivate_p(pg)) {
/*
* avoid re-activating the page unless needed,
* to avoid false sharing on multiprocessor.
*/
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pageunlock(pg);
}
}
/*
* uvm_fault_lower_upgrade: upgrade lower lock, reader -> writer
*/
static inline int
uvm_fault_lower_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_amap *amap, struct uvm_object *uobj, struct vm_page *uobjpage)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(uobj != NULL);
KASSERT(flt->lower_lock_type == rw_lock_op(uobj->vmobjlock));
/*
* fast path.
*/
if (__predict_true(flt->lower_lock_type == RW_WRITER)) {
return 0;
}
/*
* otherwise try for the upgrade. if we don't get it, unlock
* everything, restart the fault and next time around get a writer
* lock.
*/
flt->lower_lock_type = RW_WRITER;
if (__predict_false(!rw_tryupgrade(uobj->vmobjlock))) {
uvmfault_unlockall(ufi, amap, uobj);
cpu_count(CPU_COUNT_FLTNOUP, 1);
UVMHIST_LOG(maphist, " !upgrade lower", 0, 0,0,0);
return ERESTART;
}
cpu_count(CPU_COUNT_FLTUP, 1);
KASSERT(flt->lower_lock_type == rw_lock_op(uobj->vmobjlock));
return 0;
}
/*
* uvm_fault_lower: handle lower fault.
*
* 1. check uobj
* 1.1. if null, ZFOD.
* 1.2. if not null, look up unmapped neighbor pages.
* 2. for center page, check if promote.
* 2.1. ZFOD always needs promotion.
* 2.2. other uobjs, when entry is marked COW (usually MAP_PRIVATE vnode).
* 3. if uobj is not ZFOD and page is not found, do i/o.
* 4. dispatch either direct / promote fault.
*/
static int
uvm_fault_lower(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_page **pages)
{
struct vm_amap *amap __diagused = ufi->entry->aref.ar_amap;
struct uvm_object *uobj = ufi->entry->object.uvm_obj;
struct vm_page *uobjpage;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* now, if the desired page is not shadowed by the amap and we have
* a backing object that does not have a special fault routine, then
* we ask (with pgo_get) the object for resident pages that we care
* about and attempt to map them in. we do not let pgo_get block
* (PGO_LOCKED).
*/
if (uobj == NULL) {
/* zero fill; don't care neighbor pages */
uobjpage = NULL;
} else {
uvm_fault_lower_lookup(ufi, flt, pages);
uobjpage = pages[flt->centeridx];
}
/*
* note that at this point we are done with any front or back pages.
* we are now going to focus on the center page (i.e. the one we've
* faulted on). if we have faulted on the upper (anon) layer
* [i.e. case 1], then the anon we want is anons[centeridx] (we have
* not touched it yet). if we have faulted on the bottom (uobj)
* layer [i.e. case 2] and the page was both present and available,
* then we've got a pointer to it as "uobjpage" and we've already
* made it BUSY.
*/
/*
* locked:
* maps(read), amap(if there), uobj(if !null), uobjpage(if !null)
*/
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* note that uobjpage can not be PGO_DONTCARE at this point. we now
* set uobjpage to PGO_DONTCARE if we are doing a zero fill. if we
* have a backing object, check and see if we are going to promote
* the data up to an anon during the fault.
*/
if (uobj == NULL) {
uobjpage = PGO_DONTCARE;
flt->promote = true; /* always need anon here */
} else {
KASSERT(uobjpage != PGO_DONTCARE);
flt->promote = flt->cow_now && UVM_ET_ISCOPYONWRITE(ufi->entry);
}
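/*
 * so: promote either for a zero-fill fault (no uobj), or when we
 * are writing (cow_now) to a copy-on-write mapping and therefore
 * need a private anon copy of the object's page.
 */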
UVMHIST_LOG(maphist, " case 2 fault: promote=%jd, zfill=%jd",
flt->promote, (uobj == NULL), 0,0);
/*
* if uobjpage is not null then we do not need to do I/O to get the
* uobjpage.
*
* if uobjpage is null, then we need to unlock and ask the pager to
* get the data for us. once we have the data, we need to reverify
* the state of the world. we are currently not holding any resources.
*/
if (uobjpage) {
/* update rusage counters */
curlwp->l_ru.ru_minflt++;
} else {
error = uvm_fault_lower_io(ufi, flt, &uobj, &uobjpage);
if (error != 0)
return error;
}
/*
* locked:
* maps(read), amap(if !null), uobj(if !null), uobjpage(if uobj)
*/
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* notes:
* - at this point uobjpage can not be NULL
* - at this point uobjpage can not be PG_RELEASED (since we checked
* for it above)
* - at this point uobjpage could be waited on (handle later)
* - uobjpage can be from a different object if tmpfs (vnode vs UAO)
*/
KASSERT(uobjpage != NULL);
KASSERT(uobj == NULL ||
uobjpage->uobject->vmobjlock == uobj->vmobjlock);
KASSERT(uobj == NULL || !UVM_OBJ_IS_CLEAN(uobjpage->uobject) ||
uvm_pagegetdirty(uobjpage) == UVM_PAGE_STATUS_CLEAN);
if (!flt->promote) {
error = uvm_fault_lower_direct(ufi, flt, uobj, uobjpage);
} else {
error = uvm_fault_lower_promote(ufi, flt, uobj, uobjpage);
}
return error;
}
/*
* uvm_fault_lower_lookup: look up on-memory uobj pages.
*
* 1. get on-memory pages.
* 2. if failed, give up (get only center page later).
* 3. if succeeded, enter h/w mapping of neighbor pages.
*/
static void
uvm_fault_lower_lookup(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct vm_page **pages)
{
struct uvm_object *uobj = ufi->entry->object.uvm_obj;
int lcv, gotpages;
vaddr_t currva;
bool entered;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
rw_enter(uobj->vmobjlock, flt->lower_lock_type);
/*
* Locked: maps(read), amap(if there), uobj
*/
cpu_count(CPU_COUNT_FLTLGET, 1);
gotpages = flt->npages;
(void) uobj->pgops->pgo_get(uobj,
ufi->entry->offset + flt->startva - ufi->entry->start,
pages, &gotpages, flt->centeridx,
flt->access_type & MASK(ufi->entry), ufi->entry->advice,
PGO_LOCKED);
KASSERT(rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* check for pages to map, if we got any
*/
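/*
 * if the locked pgo_get found nothing resident, leave the center
 * page NULL; uvm_fault_lower() will then fetch it synchronously
 * via uvm_fault_lower_io().
 */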
if (gotpages == 0) {
pages[flt->centeridx] = NULL;
return;
}
entered = false;
currva = flt->startva;
for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
struct vm_page *curpg;
curpg = pages[lcv];
if (curpg == NULL || curpg == PGO_DONTCARE) {
continue;
}
/*
* in the case of tmpfs, the pages might be from a different
* uvm_object. just make sure that they have the same lock.
*/
KASSERT(curpg->uobject->vmobjlock == uobj->vmobjlock);
KASSERT((curpg->flags & PG_BUSY) == 0);
/*
* leave the centre page for later. don't screw with
* existing mappings (needless & expensive).
*/
if (lcv == flt->centeridx) {
UVMHIST_LOG(maphist, " got uobjpage (%#jx) "
"with locked get", (uintptr_t)curpg, 0, 0, 0);
} else if (!pmap_extract(ufi->orig_map->pmap, currva, NULL)) {
uvm_fault_lower_neighbor(ufi, flt, currva, curpg);
entered = true;
}
}
if (entered) {
pmap_update(ufi->orig_map->pmap);
}
}
/*
* uvm_fault_lower_neighbor: enter h/w mapping of lower neighbor page.
*/
static void
uvm_fault_lower_neighbor(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
vaddr_t currva, struct vm_page *pg)
{
const bool readonly = uvm_pagereadonly_p(pg) || pg->loan_count > 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: maps(read), amap(if there), uobj */
/*
* calling pgo_get with PGO_LOCKED returns us pages which
* are neither busy nor released, so we don't need to check
* for this. we can just directly enter the pages.
*
* there wasn't a direct fault on the page, so avoid the cost of
* activating it.
*/
if (!uvmpdpol_pageisqueued_p(pg) && pg->wire_count == 0) {
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pageunlock(pg);
}
UVMHIST_LOG(maphist,
" MAPPING: n obj: pm=%#jx, va=%#jx, pg=%#jx",
(uintptr_t)ufi->orig_map->pmap, currva, (uintptr_t)pg, 0);
cpu_count(CPU_COUNT_FLTNOMAP, 1);
/*
* Since this page isn't the page that's actually faulting,
* ignore pmap_enter() failures; it's not critical that we
* enter these right now.
* NOTE: page can't be waited on or PG_RELEASED because we've
* held the lock the whole time we've had the handle.
*/
KASSERT((pg->flags & PG_PAGEOUT) == 0);
KASSERT((pg->flags & PG_RELEASED) == 0);
KASSERT(!UVM_OBJ_IS_CLEAN(pg->uobject) ||
uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN);
KASSERT((pg->flags & PG_BUSY) == 0);
KASSERT(rw_lock_op(pg->uobject->vmobjlock) == flt->lower_lock_type);
const vm_prot_t mapprot =
readonly ? (flt->enter_prot & ~VM_PROT_WRITE) : flt->enter_prot & MASK(ufi->entry);
const u_int mapflags =
PMAP_CANFAIL | (flt->wire_mapping ? (mapprot | PMAP_WIRED) : 0);
(void) pmap_enter(ufi->orig_map->pmap, currva,
VM_PAGE_TO_PHYS(pg), mapprot, mapflags);
}
/*
* uvm_fault_lower_io: get lower page from backing store.
*
* 1. unlock everything, because i/o will block.
* 2. call pgo_get.
* 3. if failed, recover.
* 4. if succeeded, relock everything and verify things.
*/
static int
uvm_fault_lower_io(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object **ruobj, struct vm_page **ruobjpage)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct uvm_object *uobj = *ruobj;
struct vm_page *pg;
bool locked;
int gotpages;
int error;
voff_t uoff;
vm_prot_t access_type;
int advice;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* grab everything we need from the entry before we unlock */
uoff = (ufi->orig_rvaddr - ufi->entry->start) + ufi->entry->offset;
access_type = flt->access_type & MASK(ufi->entry);
advice = ufi->entry->advice;
/* Locked: maps(read), amap(if there), uobj */
KASSERT(rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/* Upgrade to a write lock if needed. */
error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj, NULL);
if (error != 0) {
return error;
}
uvmfault_unlockall(ufi, amap, NULL);
/* update rusage counters */
curlwp->l_ru.ru_majflt++;
/* Locked: uobj(write) */
KASSERT(rw_write_held(uobj->vmobjlock));
cpu_count(CPU_COUNT_FLTGET, 1);
gotpages = 1;
pg = NULL;
error = uobj->pgops->pgo_get(uobj, uoff, &pg, &gotpages,
0, access_type, advice, PGO_SYNCIO);
/* locked: pg(if no error) */
/*
* recover from I/O
*/
if (error) {
if (error == EAGAIN) {
UVMHIST_LOG(maphist,
" pgo_get says TRY AGAIN!",0,0,0,0);
kpause("fltagain2", false, hz/2, NULL);
return ERESTART;
}
#if 0
KASSERT(error != ERESTART);
#else
/* XXXUEBS don't re-fault? */
if (error == ERESTART)
error = EIO;
#endif
UVMHIST_LOG(maphist, "<- pgo_get failed (code %jd)",
error, 0,0,0);
return error;
}
/*
* re-verify the state of the world by first trying to relock
* the maps. always relock the object.
*/
locked = uvmfault_relock(ufi);
if (locked && amap)
amap_lock(amap, flt->upper_lock_type);
/* might be changed */
uobj = pg->uobject;
rw_enter(uobj->vmobjlock, flt->lower_lock_type);
KASSERT((pg->flags & PG_BUSY) != 0);
KASSERT(flt->lower_lock_type == RW_WRITER);
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pageunlock(pg);
/* locked(locked): maps(read), amap(if !null), uobj, pg */
/* locked(!locked): uobj, pg */
/*
* verify that the page has not been released and re-verify
* that amap slot is still free. if there is a problem,
* we unlock and clean up.
*/
if ((pg->flags & PG_RELEASED) != 0 ||
(locked && amap && amap_lookup(&ufi->entry->aref,
ufi->orig_rvaddr - ufi->entry->start))) {
if (locked)
uvmfault_unlockall(ufi, amap, NULL);
locked = false;
}
/*
* unbusy/release the page.
*/
if ((pg->flags & PG_RELEASED) == 0) {
pg->flags &= ~PG_BUSY;
uvm_pagelock(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
UVM_PAGE_OWN(pg, NULL);
} else {
cpu_count(CPU_COUNT_FLTPGRELE, 1);
uvm_pagefree(pg);
}
/*
* didn't get the lock? retry.
*/
if (locked == false) {
UVMHIST_LOG(maphist,
" wasn't able to relock after fault: retry",
0,0,0,0);
rw_exit(uobj->vmobjlock);
return ERESTART;
}
/*
* we have the data in pg. we are holding object lock (so the page
* can't be released on us).
*/
/* locked: maps(read), amap(if !null), uobj */
*ruobj = uobj;
*ruobjpage = pg;
return 0;
}
/*
* uvm_fault_lower_direct: fault lower center page
*
* 1. adjust flt->enter_prot.
* 2. if page is loaned, resolve.
*/
int
uvm_fault_lower_direct(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_page *uobjpage)
{
struct vm_page *pg;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* we are not promoting. if the mapping is COW ensure that we
* don't give more access than we should (e.g. when doing a read
* fault on a COPYONWRITE mapping we want to map the COW page in
* R/O even though the entry protection could be R/W).
*
* set "pg" to the page we want to map in (uobjpage, usually)
*/
cpu_count(CPU_COUNT_FLT_OBJ, 1);
if (UVM_ET_ISCOPYONWRITE(ufi->entry) || UVM_OBJ_NEEDS_WRITEFAULT(uobjpage->uobject))
flt->enter_prot &= ~VM_PROT_WRITE;
pg = uobjpage; /* map in the actual object */
KASSERT(uobjpage != PGO_DONTCARE);
/*
* we are faulting directly on the page. be careful
* about writing to loaned pages...
*/
if (uobjpage->loan_count) {
uvm_fault_lower_direct_loan(ufi, flt, uobj, &pg, &uobjpage);
}
KASSERT(pg == uobjpage);
KASSERT((pg->flags & PG_BUSY) == 0);
return uvm_fault_lower_enter(ufi, flt, uobj, NULL, pg);
}
/*
* uvm_fault_lower_direct_loan: resolve loaned page.
*
* 1. if not cow'ing, adjust flt->enter_prot.
* 2. if cow'ing, break loan.
*/
static int
uvm_fault_lower_direct_loan(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_page **rpg,
struct vm_page **ruobjpage)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct vm_page *pg;
struct vm_page *uobjpage = *ruobjpage;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (!flt->cow_now) {
/* read fault: cap the protection at readonly */
/* cap! */
flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE;
} else {
/*
* write fault: must break the loan here. to do this
* we need a write lock on the object.
*/
error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj, uobjpage);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(uobj->vmobjlock));
pg = uvm_loanbreak(uobjpage);
if (pg == NULL) {
uvmfault_unlockall(ufi, amap, uobj);
UVMHIST_LOG(maphist,
" out of RAM breaking loan, waiting",
0,0,0,0);
cpu_count(CPU_COUNT_FLTNORAM, 1);
uvm_wait("flt_noram4");
return ERESTART;
}
*rpg = pg;
*ruobjpage = pg;
/*
* drop ownership of page while still holding object lock,
* which won't be dropped until the page is entered.
*/
uvm_pagelock(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~PG_BUSY;
UVM_PAGE_OWN(pg, NULL);
}
return 0;
}
/*
* uvm_fault_lower_promote: promote lower page.
*
* 1. call uvmfault_promote.
* 2. fill in data.
* 3. if not ZFOD, dispose old page.
*/
int
uvm_fault_lower_promote(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_page *uobjpage)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct vm_anon *anon;
struct vm_page *pg;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(amap != NULL);
/* promoting requires a write lock. */
error = uvm_fault_upper_upgrade(ufi, flt, amap, uobj);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(amap->am_lock));
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* If we are going to promote the data to an anon we
* allocate a blank anon here and plug it into our amap.
*/
error = uvmfault_promote(ufi, NULL, uobjpage, &anon, &flt->anon_spare);
switch (error) {
case 0:
break;
case ERESTART:
return ERESTART;
default:
return error;
}
pg = anon->an_page;
/*
* Fill in the data.
*/
if (uobjpage != PGO_DONTCARE) {
cpu_count(CPU_COUNT_FLT_PRCOPY, 1);
/*
* promote to shared amap? make sure all sharing
* procs see it
*/
if ((amap_flags(amap) & AMAP_SHARED) != 0) {
pmap_page_protect(uobjpage, VM_PROT_NONE);
/*
* XXX: PAGE MIGHT BE WIRED!
*/
}
UVMHIST_LOG(maphist,
" promote uobjpage %#jx to anon/page %#jx/%#jx",
(uintptr_t)uobjpage, (uintptr_t)anon, (uintptr_t)pg, 0);
} else {
cpu_count(CPU_COUNT_FLT_PRZERO, 1);
/*
* Page is zero'd and marked dirty by
* uvmfault_promote().
*/
UVMHIST_LOG(maphist," zero fill anon/page %#jx/%#jx",
(uintptr_t)anon, (uintptr_t)pg, 0, 0);
}
return uvm_fault_lower_enter(ufi, flt, uobj, anon, pg);
}
/*
* uvm_fault_lower_enter: enter h/w mapping of lower page or anon page promoted
* from the lower page.
*/
int
uvm_fault_lower_enter(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct uvm_object *uobj,
struct vm_anon *anon, struct vm_page *pg)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
const bool readonly = uvm_pagereadonly_p(pg);
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* Locked:
*
* maps(read), amap(if !null), uobj(if !null),
* anon(if !null), pg(if anon), unlock_uobj(if !null)
*
* anon must be write locked (promotion). uobj can be either.
*
* Note: pg is either the uobjpage or the new page in the new anon.
*/
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
KASSERT(anon == NULL || anon->an_lock == amap->am_lock);
/*
* note that pg can't be PG_RELEASED or PG_BUSY since we did
* not drop the object lock since the last time we checked.
*/
KASSERT((pg->flags & PG_RELEASED) == 0);
KASSERT((pg->flags & PG_BUSY) == 0);
/*
* all resources are present. we can now map it in and free our
* resources.
*/
UVMHIST_LOG(maphist,
" MAPPING: case2: pm=%#jx, va=%#jx, pg=%#jx, promote=%jd",
(uintptr_t)ufi->orig_map->pmap, ufi->orig_rvaddr,
(uintptr_t)pg, flt->promote);
KASSERTMSG((flt->access_type & VM_PROT_WRITE) == 0 || !readonly,
"promote=%u cow_now=%u access_type=%x enter_prot=%x cow=%u "
"entry=%p map=%p orig_rvaddr=%p pg=%p",
flt->promote, flt->cow_now, flt->access_type, flt->enter_prot,
UVM_ET_ISCOPYONWRITE(ufi->entry), ufi->entry, ufi->orig_map,
(void *)ufi->orig_rvaddr, pg);
KASSERT((flt->access_type & VM_PROT_WRITE) == 0 || !readonly);
if (pmap_enter(ufi->orig_map->pmap, ufi->orig_rvaddr,
VM_PAGE_TO_PHYS(pg),
readonly ? flt->enter_prot & ~VM_PROT_WRITE : flt->enter_prot,
flt->access_type | PMAP_CANFAIL |
(flt->wire_mapping ? PMAP_WIRED : 0)) != 0) {
/*
* No need to undo what we did; we can simply think of
* this as the pmap throwing away the mapping information.
*
* We do, however, have to go through the ReFault path,
* as the map may change while we're asleep.
*/
/*
* ensure that the page is queued in the case that
* we just promoted the page.
*/
if (anon != NULL) {
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
}
uvmfault_unlockall(ufi, amap, uobj);
if (!uvm_reclaimable()) {
UVMHIST_LOG(maphist,
"<- failed. out of VM",0,0,0,0);
/* XXX instrumentation */
error = ENOMEM;
return error;
}
/* XXX instrumentation */
uvm_wait("flt_pmfail2");
return ERESTART;
}
uvm_fault_lower_done(ufi, flt, uobj, pg);
pmap_update(ufi->orig_map->pmap);
uvmfault_unlockall(ufi, amap, uobj);
UVMHIST_LOG(maphist, "<- done (SUCCESS!)",0,0,0,0);
return 0;
}
/*
* uvm_fault_lower_done: queue lower center page.
*/
void
uvm_fault_lower_done(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_page *pg)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (flt->wire_paging) {
uvm_pagelock(pg);
uvm_pagewire(pg);
uvm_pageunlock(pg);
if (pg->flags & PG_AOBJ) {
/*
* since the now-wired page cannot be paged out,
* release its swap resources for others to use.
* since an aobj page with no swap cannot be clean,
* mark it dirty now.
*
* use pg->uobject here. if the page is from a
* tmpfs vnode, the pages are backed by its UAO and
* not the vnode.
*/
KASSERT(uobj != NULL);
KASSERT(uobj->vmobjlock == pg->uobject->vmobjlock);
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
uao_dropswap(pg->uobject, pg->offset >> PAGE_SHIFT);
}
} else if (uvmpdpol_pageactivate_p(pg)) {
/*
* avoid re-activating the page unless needed,
* to avoid false sharing on multiprocessor.
*/
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pageunlock(pg);
}
}
/*
* uvm_fault_wire: wire down a range of virtual addresses in a map.
*
* => map may be read-locked by caller, but MUST NOT be write-locked.
* => if map is read-locked, any operations which may cause map to
* be write-locked in uvm_fault() must be taken care of by
* the caller. See uvm_map_pageable().
*/
int
uvm_fault_wire(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_prot_t access_type, int maxprot)
{
vaddr_t va;
int error;
/*
* now fault it in a page at a time. if the fault fails then we have
* to undo what we have done. the faults are issued with UVM_FAULT_WIRE
* and, when maxprot is set, UVM_FAULT_MAXPROT so that the entry's
* maximum protection is used.
*/
/*
* XXX work around overflowing a vaddr_t. this prevents us from
* wiring the last page in the address space, though.
*/
if (start > end) {
return EFAULT;
}
for (va = start; va < end; va += PAGE_SIZE) {
error = uvm_fault_internal(map, va, access_type,
(maxprot ? UVM_FAULT_MAXPROT : 0) | UVM_FAULT_WIRE);
if (error) {
if (va != start) {
uvm_fault_unwire(map, start, va);
}
return error;
}
}
return 0;
}
/*
* uvm_fault_unwire(): unwire range of virtual space.
*/
void
uvm_fault_unwire(struct vm_map *map, vaddr_t start, vaddr_t end)
{
vm_map_lock_read(map);
uvm_fault_unwire_locked(map, start, end);
vm_map_unlock_read(map);
}
/*
* uvm_fault_unwire_locked(): the guts of uvm_fault_unwire().
*
* => map must be at least read-locked.
*/
void
uvm_fault_unwire_locked(struct vm_map *map, vaddr_t start, vaddr_t end)
{
struct vm_map_entry *entry, *oentry;
pmap_t pmap = vm_map_pmap(map);
vaddr_t va;
paddr_t pa;
struct vm_page *pg;
/*
* we assume that the area we are unwiring has actually been wired
* in the first place. this means that we should be able to extract
* the PAs from the pmap. we also lock out the page daemon so that
* we can call uvm_pageunwire.
*/
/*
* find the beginning map entry for the region.
*/
KASSERT(start >= vm_map_min(map));
KASSERT(end <= vm_map_max(map));
if (uvm_map_lookup_entry(map, start, &entry) == false)
panic("uvm_fault_unwire_locked: address not in map");
oentry = NULL;
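/*
 * walk the range a page at a time, advancing to the next map
 * entry as needed and holding the per-entry lock only while we
 * are working within that entry.
 */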
for (va = start; va < end; va += PAGE_SIZE) {
/*
* find the map entry for the current address.
*/
KASSERT(va >= entry->start);
while (va >= entry->end) {
KASSERT(entry->next != &map->header);
KASSERT(entry->next->start <= entry->end);
entry = entry->next;
}
/*
* lock it.
*/
if (entry != oentry) {
if (oentry != NULL) {
uvm_map_unlock_entry(oentry);
}
uvm_map_lock_entry(entry, RW_WRITER);
oentry = entry;
}
/*
* if the entry is no longer wired, tell the pmap.
*/
if (!pmap_extract(pmap, va, &pa))
continue;
if (VM_MAPENT_ISWIRED(entry) == 0)
pmap_unwire(pmap, va);
pg = PHYS_TO_VM_PAGE(pa);
if (pg) {
uvm_pagelock(pg);
uvm_pageunwire(pg);
uvm_pageunlock(pg);
}
}
if (oentry != NULL) {
uvm_map_unlock_entry(entry);
}
}
/* $NetBSD: rtsock.c,v 1.256 2022/08/27 08:36:41 skrll Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)rtsock.c 8.7 (Berkeley) 10/12/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rtsock.c,v 1.256 2022/08/27 08:36:41 skrll Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/intr.h>
#include <sys/condvar.h>
#include <sys/compat_stub.h>
#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/raw_cb.h>
#include <netinet/in_var.h>
#include <netinet/if_inarp.h>
#include <netmpls/mpls.h>
#include <compat/net/if.h>
#include <compat/net/route.h>
#ifdef COMPAT_RTSOCK
#undef COMPAT_RTSOCK
#endif
static int if_addrflags(struct ifaddr *);
#include <net/rtsock_shared.c>
/*
* XXX avoid using void * once msghdr compat disappears.
*/
void
rt_setmetrics(void *in, struct rtentry *out)
{
const struct rt_xmsghdr *rtm = in;
_rt_setmetrics(rtm->rtm_inits, rtm, out);
}
int
rt_msg3(int type, struct rt_addrinfo *rtinfo, void *cpv, struct rt_walkarg *w,
int *lenp)
{
return rt_msg2(type, rtinfo, cpv, w, lenp);
}
static int
if_addrflags(struct ifaddr *ifa)
{
switch (ifa->ifa_addr->sa_family) {
#ifdef INET
case AF_INET:
return ifatoia(ifa)->ia4_flags;
#endif
#ifdef INET6
case AF_INET6:
return ifatoia6(ifa)->ia6_flags;
#endif
default:
return 0;
}
}
/*
* Send a routing message mimicking the addition of a cloned route.
*/
void
rt_clonedmsg(int type, const struct sockaddr *src, const struct sockaddr *dst,
const uint8_t *lladdr, const struct ifnet *ifp)
{
struct rt_addrinfo info;
/* Mimic flags exactly */
#define RTF_LLINFO 0x400
#define RTF_CLONED 0x2000
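/*
 * RTF_LLINFO and RTF_CLONED are no longer defined in <net/route.h>;
 * the old bit values are defined locally here, presumably so that
 * the generated messages keep looking the same to userland
 * consumers that still expect these bits (see "Mimic flags exactly"
 * above).
 */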
int flags = RTF_DONE;
union {
struct sockaddr sa;
struct sockaddr_storage ss;
struct sockaddr_dl sdl;
} u;
if (type != RTM_MISS)
flags |= RTF_HOST | RTF_CLONED | RTF_LLINFO;
if (type == RTM_ADD || type == RTM_CHANGE)
flags |= RTF_UP;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_AUTHOR] = src;
info.rti_info[RTAX_DST] = dst;
sockaddr_dl_init(&u.sdl, sizeof(u.ss), ifp->if_index, ifp->if_type,
NULL, 0, lladdr, ifp->if_addrlen);
info.rti_info[RTAX_GATEWAY] = &u.sa;
rt_missmsg(type, &info, flags, 0);
#undef RTF_LLINFO
#undef RTF_CLONED
}
/*
* The remaining code implements the routing-table sysctl node. It is
* compiled only for the non-COMPAT case.
*/
/*
* This is used in dumping the kernel table via sysctl().
*/
static int
sysctl_dumpentry(struct rtentry *rt, void *v)
{
struct rt_walkarg *w = v;
int error = 0, size;
struct rt_addrinfo info;
if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
return 0;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = rt_getkey(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
info.rti_info[RTAX_TAG] = rt_gettag(rt);
if (rt->rt_ifp) {
const struct ifaddr *rtifa;
info.rti_info[RTAX_IFP] = rt->rt_ifp->if_dl->ifa_addr;
/* rtifa used to be simply rt->rt_ifa. If rt->rt_ifa != NULL,
* then rt_get_ifa() != NULL. So this ought to still be safe.
* --dyoung
*/
rtifa = rt_get_ifa(rt);
info.rti_info[RTAX_IFA] = rtifa->ifa_addr;
if (rt->rt_ifp->if_flags & IFF_POINTOPOINT)
info.rti_info[RTAX_BRD] = rtifa->ifa_dstaddr;
}
if ((error = rt_msg2(RTM_GET, &info, 0, w, &size)))
return error;
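/*
 * rt_msg2() has computed the message size (and, when the caller's
 * budget allows, formatted the message into w_tmem); only copy the
 * record out if a destination was supplied and we are still within
 * its buffer (w_needed <= 0).
 */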
if (w->w_where && w->w_tmem && w->w_needed <= 0) {
struct rt_xmsghdr *rtm = (struct rt_xmsghdr *)w->w_tmem;
rtm->rtm_flags = rt->rt_flags;
rtm->rtm_use = rt->rt_use;
rtm_setmetrics(rt, rtm);
KASSERT(rt->rt_ifp != NULL);
rtm->rtm_index = rt->rt_ifp->if_index;
rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
rtm->rtm_addrs = info.rti_addrs;
if ((error = copyout(rtm, w->w_where, size)) != 0)
w->w_where = NULL;
else
w->w_where = (char *)w->w_where + size;
}
return error;
}
static int
sysctl_iflist_if(struct ifnet *ifp, struct rt_walkarg *w,
struct rt_addrinfo *info, size_t len)
{
struct if_xmsghdr *ifm;
int error;
ifm = (struct if_xmsghdr *)w->w_tmem;
ifm->ifm_index = ifp->if_index;
ifm->ifm_flags = ifp->if_flags;
if_export_if_data(ifp, &ifm->ifm_data, false);
ifm->ifm_addrs = info->rti_addrs;
if ((error = copyout(ifm, w->w_where, len)) == 0)
w->w_where = (char *)w->w_where + len;
return error;
}
static int
sysctl_iflist_addr(struct rt_walkarg *w, struct ifaddr *ifa,
struct rt_addrinfo *info)
{
int len, error;
if ((error = rt_msg2(RTM_XNEWADDR, info, 0, w, &len)))
return error;
if (w->w_where && w->w_tmem && w->w_needed <= 0) {
struct ifa_xmsghdr *ifam;
ifam = (struct ifa_xmsghdr *)w->w_tmem;
ifam->ifam_index = ifa->ifa_ifp->if_index;
ifam->ifam_flags = ifa->ifa_flags;
ifam->ifam_metric = ifa->ifa_metric;
ifam->ifam_addrs = info->rti_addrs;
ifam->ifam_pid = 0;
ifam->ifam_addrflags = if_addrflags(ifa);
if ((error = copyout(w->w_tmem, w->w_where, len)) == 0)
w->w_where = (char *)w->w_where + len;
}
return error;
}
static int
sysctl_iflist(int af, struct rt_walkarg *w, int type)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct rt_addrinfo info;
int cmd, len, error = 0;
int s;
struct psref psref;
int bound;
switch (type) {
case NET_RT_IFLIST:
cmd = RTM_IFINFO;
break;
case NET_RT_OOOIFLIST:
cmd = RTM_OOIFINFO;
break;
case NET_RT_OOIFLIST:
cmd = RTM_OIFINFO;
break;
case NET_RT_OIFLIST:
cmd = RTM_IFINFO;
break;
default:
#ifdef RTSOCK_DEBUG
printf("%s: unsupported IFLIST type %d\n", __func__, type);
#endif
return EINVAL;
}
memset(&info, 0, sizeof(info));
bound = curlwp_bind();
s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
int _s;
if (w->w_arg && w->w_arg != ifp->if_index)
continue;
if (IFADDR_READER_EMPTY(ifp))
continue;
if_acquire(ifp, &psref);
pserialize_read_exit(s);
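/*
 * drop out of the pserialize read section while we build and copy
 * out this interface's records, since copyout may sleep; the psref
 * reference keeps ifp alive until we re-enter the section below.
 */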
info.rti_info[RTAX_IFP] = ifp->if_dl->ifa_addr;
if ((error = rt_msg2(cmd, &info, NULL, w, &len)) != 0)
goto release_exit;
info.rti_info[RTAX_IFP] = NULL;
if (w->w_where && w->w_tmem && w->w_needed <= 0) {
switch (type) {
case NET_RT_OIFLIST: /* old _70 */
if (!rtsock_iflist_70_hook.hooked) {
error = EINVAL;
break;
}
/* FALLTHROUGH */
case NET_RT_IFLIST: /* current */
error = sysctl_iflist_if(ifp, w, &info, len);
break;
case NET_RT_OOIFLIST: /* old _50 */
MODULE_HOOK_CALL(rtsock_iflist_50_hook,
(ifp, w, &info, len), enosys(), error);
break;
case NET_RT_OOOIFLIST: /* old _14 */
MODULE_HOOK_CALL(rtsock_iflist_14_hook,
(ifp, w, &info, len), enosys(), error);
break;
default:
error = EINVAL;
}
if (error != 0) {
if (error == ENOSYS)
error = EINVAL;
goto release_exit;
}
}
_s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct psref _psref;
if (af && af != ifa->ifa_addr->sa_family)
continue;
ifa_acquire(ifa, &_psref);
pserialize_read_exit(_s);
info.rti_info[RTAX_IFA] = ifa->ifa_addr;
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
switch (type) {
case NET_RT_IFLIST:
error = sysctl_iflist_addr(w, ifa, &info);
break;
case NET_RT_OIFLIST:
case NET_RT_OOIFLIST:
case NET_RT_OOOIFLIST:
MODULE_HOOK_CALL(rtsock_iflist_70_hook,
(w, ifa, &info), enosys(), error);
break;
default:
error = EINVAL;
}
_s = pserialize_read_enter();
ifa_release(ifa, &_psref);
if (error != 0) {
pserialize_read_exit(_s);
goto release_exit;
}
}
pserialize_read_exit(_s);
info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] =
info.rti_info[RTAX_BRD] = NULL;
s = pserialize_read_enter();
if_release(ifp, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
return 0;
release_exit:
if_release(ifp, &psref);
curlwp_bindx(bound);
return error;
}
static int
sysctl_rtable(SYSCTLFN_ARGS)
{
void *where = oldp;
size_t *given = oldlenp;
int i, error = EINVAL;
u_char af;
struct rt_walkarg w;
if (namelen == 1 && name[0] == CTL_QUERY)
return sysctl_query(SYSCTLFN_CALL(rnode));
if (newp)
return EPERM;
if (namelen != 3)
return EINVAL;
af = name[0];
w.w_tmemneeded = 0;
w.w_tmemsize = 0;
w.w_tmem = NULL;
again:
/* we may return here if a later [re]alloc of the t_mem buffer fails */
if (w.w_tmemneeded) {
w.w_tmem = kmem_zalloc(w.w_tmemneeded, KM_SLEEP);
w.w_tmemsize = w.w_tmemneeded;
w.w_tmemneeded = 0;
}
w.w_op = name[1];
w.w_arg = name[2];
w.w_given = *given;
w.w_needed = 0 - w.w_given;
w.w_where = where;
KERNEL_LOCK_UNLESS_NET_MPSAFE();
const int s = splsoftnet();
switch (w.w_op) {
case NET_RT_DUMP:
case NET_RT_FLAGS:
#if defined(INET) || defined(INET6)
/*
* take care of llinfo entries, the caller must
* specify an AF
*/
if (w.w_op == NET_RT_FLAGS &&
(w.w_arg == 0 || w.w_arg & RTF_LLDATA)) {
if (af != 0)
error = lltable_sysctl_dump(af, &w);
else
error = EINVAL;
break;
}
#endif
for (i = 1; i <= AF_MAX; i++) {
if (af == 0 || af == i) {
error = rt_walktree(i, sysctl_dumpentry, &w);
if (error != 0)
break;
#if defined(INET) || defined(INET6)
/*
* Return ARP/NDP entries too for
* backward compatibility.
*/
error = lltable_sysctl_dump(i, &w);
if (error != 0)
break;
#endif
}
}
break;
case NET_RT_OOOIFLIST: /* compat_14 */
case NET_RT_OOIFLIST: /* compat_50 */
case NET_RT_OIFLIST: /* compat_70 */
case NET_RT_IFLIST: /* current */
error = sysctl_iflist(af, &w, w.w_op);
break;
}
splx(s);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
/* check to see if we couldn't allocate memory with NOWAIT */
if (error == ENOBUFS && w.w_tmem == 0 && w.w_tmemneeded)
goto again;
if (w.w_tmem)
kmem_free(w.w_tmem, w.w_tmemsize);
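/*
 * w_needed was seeded with -w_given and grew by each record's
 * length during the walk, so adding w_given back yields the total
 * space a complete dump requires.
 */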
w.w_needed += w.w_given;
if (where) {
*given = (char *)w.w_where - (char *)where;
if (*given < w.w_needed)
return ENOMEM;
} else {
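/*
 * sizing-only call (no destination buffer): report the needed
 * size padded by roughly 10% so that a follow-up call with a real
 * buffer is unlikely to come up short if the tables grow in the
 * meantime.
 */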
*given = (11 * w.w_needed) / 10;
}
return error;
}
void
sysctl_net_route_setup(struct sysctllog **clog, int pf, const char *name)
{
const struct sysctlnode *rnode = NULL;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, name,
SYSCTL_DESCR("PF_ROUTE information"),
NULL, 0, NULL, 0,
CTL_NET, pf, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "rtable",
SYSCTL_DESCR("Routing table information"),
sysctl_rtable, 0, NULL, 0,
CTL_NET, pf, 0 /* any protocol */, CTL_EOL);
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("Routing statistics"),
NULL, 0, &rtstat, sizeof(rtstat),
CTL_CREATE, CTL_EOL);
}
/* $NetBSD: exec_elf.c,v 1.105 2023/08/17 06:58:26 rin Exp $ */
/*-
* Copyright (c) 1994, 2000, 2005, 2015, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas and Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: exec_elf.c,v 1.105 2023/08/17 06:58:26 rin Exp $");
#ifdef _KERNEL_OPT
#include "opt_pax.h"
#endif /* _KERNEL_OPT */
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/exec.h>
#include <sys/exec_elf.h>
#include <sys/syscall.h>
#include <sys/signalvar.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/kauth.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <machine/reg.h>
#include <compat/common/compat_util.h>
#include <sys/pax.h>
#include <uvm/uvm_param.h>
#define elf_check_header ELFNAME(check_header)
#define elf_copyargs ELFNAME(copyargs)
#define elf_populate_auxv ELFNAME(populate_auxv)
#define elf_load_interp ELFNAME(load_interp)
#define elf_load_psection ELFNAME(load_psection)
#define exec_elf_makecmds ELFNAME2(exec,makecmds)
#define netbsd_elf_signature ELFNAME2(netbsd,signature)
#define netbsd_elf_note ELFNAME2(netbsd,note)
#define netbsd_elf_probe ELFNAME2(netbsd,probe)
#define coredump ELFNAMEEND(coredump)
#define elf_free_emul_arg ELFNAME(free_emul_arg)
static int
elf_load_interp(struct lwp *, struct exec_package *, char *,
struct exec_vmcmd_set *, u_long *, Elf_Addr *);
static int
elf_load_psection(struct exec_vmcmd_set *, struct vnode *, const Elf_Phdr *,
Elf_Addr *, u_long *, int);
int netbsd_elf_signature(struct lwp *, struct exec_package *, Elf_Ehdr *);
int netbsd_elf_note(struct exec_package *, const Elf_Nhdr *, const char *,
const char *);
int netbsd_elf_probe(struct lwp *, struct exec_package *, void *, char *,
vaddr_t *);
static void elf_free_emul_arg(void *);
#ifdef DEBUG_ELF
#define DPRINTF(a, ...) printf("%s: " a "\n", __func__, ##__VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif
/* round up and down to page boundaries. */
#define ELF_ROUND(a, b) (((a) + (b) - 1) & ~((b) - 1))
#define ELF_TRUNC(a, b) ((a) & ~((b) - 1))
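/*
 * Worked example (assuming a 4 KiB boundary, b = 0x1000, a power of two):
 * ELF_ROUND(0x1234, 0x1000) == 0x2000 and ELF_TRUNC(0x1234, 0x1000) == 0x1000.
 * Both macros assume b is a power of two.
 */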
static int
elf_placedynexec(struct exec_package *epp, Elf_Ehdr *eh, Elf_Phdr *ph)
{
Elf_Addr align, offset;
int i;
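/*
 * Pick a load offset for a dynamic (PIE) executable: find the largest
 * alignment requested by any PT_LOAD header, ask the PaX ASLR code for
 * an offset honouring that alignment, then shift every program header
 * and the entry point by that offset.
 */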
for (align = 1, i = 0; i < eh->e_phnum; i++)
if (ph[i].p_type == PT_LOAD && ph[i].p_align > align)
align = ph[i].p_align;
offset = (Elf_Addr)pax_aslr_exec_offset(epp, align);
if (offset < epp->ep_vm_minaddr)
offset = roundup(epp->ep_vm_minaddr, align);
if ((offset & (align - 1)) != 0) {
DPRINTF("bad offset=%#jx align=%#jx",
(uintmax_t)offset, (uintmax_t)align);
return EINVAL;
}
for (i = 0; i < eh->e_phnum; i++)
ph[i].p_vaddr += offset;
epp->ep_entryoffset = offset;
eh->e_entry += offset;
return 0;
}
int
elf_populate_auxv(struct lwp *l, struct exec_package *pack, char **stackp)
{
size_t len, vlen;
AuxInfo ai[ELF_AUX_ENTRIES], *a, *execname;
struct elf_args *ap;
char *path = l->l_proc->p_path;
int error;
execname = NULL;
a = ai;
memset(ai, 0, sizeof(ai));
/*
* Push extra arguments on the stack needed by dynamically
* linked binaries
*/
if ((ap = (struct elf_args *)pack->ep_emul_arg)) {
struct vattr *vap = pack->ep_vap;
a->a_type = AT_PHDR;
a->a_v = ap->arg_phaddr;
a++;
a->a_type = AT_PHENT;
a->a_v = ap->arg_phentsize;
a++;
a->a_type = AT_PHNUM;
a->a_v = ap->arg_phnum;
a++;
a->a_type = AT_PAGESZ;
a->a_v = PAGE_SIZE;
a++;
a->a_type = AT_BASE;
a->a_v = ap->arg_interp;
a++;
a->a_type = AT_FLAGS;
a->a_v = 0;
a++;
a->a_type = AT_ENTRY;
a->a_v = ap->arg_entry;
a++;
a->a_type = AT_STACKBASE;
a->a_v = l->l_proc->p_stackbase;
a++;
a->a_type = AT_EUID;
if (vap->va_mode & S_ISUID)
a->a_v = vap->va_uid;
else
a->a_v = kauth_cred_geteuid(l->l_cred);
a++;
a->a_type = AT_RUID;
a->a_v = kauth_cred_getuid(l->l_cred);
a++;
a->a_type = AT_EGID;
if (vap->va_mode & S_ISGID)
a->a_v = vap->va_gid;
else
a->a_v = kauth_cred_getegid(l->l_cred);
a++;
a->a_type = AT_RGID;
a->a_v = kauth_cred_getgid(l->l_cred);
a++;
/* "/" means fexecve(2) could not resolve the pathname */
if (path[0] == '/' && path[1] != '\0') {
execname = a;
a->a_type = AT_SUN_EXECNAME;
a++;
}
exec_free_emul_arg(pack);
}
a->a_type = AT_NULL;
a->a_v = 0;
a++;
vlen = (a - ai) * sizeof(ai[0]);
KASSERT(vlen <= sizeof(ai));
if (execname) {
execname->a_v = (uintptr_t)(*stackp + vlen);
len = strlen(path) + 1;
if ((error = copyout(path, (*stackp + vlen), len)) != 0)
return error;
len = ALIGN(len);
} else {
len = 0;
}
if ((error = copyout(ai, *stackp, vlen)) != 0)
return error;
*stackp += vlen + len;
return 0;
}
/*
* Copy arguments onto the stack in the normal way, but add some
* extra information in case of dynamic binding.
*/
int
elf_copyargs(struct lwp *l, struct exec_package *pack,
struct ps_strings *arginfo, char **stackp, void *argp)
{
int error;
if ((error = copyargs(l, pack, arginfo, stackp, argp)) != 0)
return error;
return elf_populate_auxv(l, pack, stackp);
}
/*
* elf_check_header():
*
* Check header for validity; return 0 if ok, ENOEXEC if error
*/
int
elf_check_header(Elf_Ehdr *eh)
{
if (memcmp(eh->e_ident, ELFMAG, SELFMAG) != 0 ||
eh->e_ident[EI_CLASS] != ELFCLASS) {
DPRINTF("bad magic e_ident[EI_MAG0,EI_MAG3] %#x%x%x%x, "
"e_ident[EI_CLASS] %#x", eh->e_ident[EI_MAG0],
eh->e_ident[EI_MAG1], eh->e_ident[EI_MAG2],
eh->e_ident[EI_MAG3], eh->e_ident[EI_CLASS]);
return ENOEXEC;
}
switch (eh->e_machine) {
ELFDEFNNAME(MACHDEP_ID_CASES)
default:
DPRINTF("bad machine %#x", eh->e_machine);
return ENOEXEC;
}
if (ELF_EHDR_FLAGS_OK(eh) == 0) {
DPRINTF("bad flags %#x", eh->e_flags);
return ENOEXEC;
}
if (eh->e_shnum > ELF_MAXSHNUM || eh->e_phnum > ELF_MAXPHNUM) {
DPRINTF("bad shnum/phnum %#x/%#x", eh->e_shnum, eh->e_phnum);
return ENOEXEC;
}
return 0;
}
/*
* elf_load_psection():
*
* Load a psection at the appropriate address
*/
static int
elf_load_psection(struct exec_vmcmd_set *vcset, struct vnode *vp,
const Elf_Phdr *ph, Elf_Addr *addr, u_long *size, int flags)
{
u_long msize, psize, rm, rf;
long diff, offset;
int vmprot = 0;
KASSERT(VOP_ISLOCKED(vp) != LK_NONE);
/*
* If the caller did not specify an address, use the psection's own
* virtual address; otherwise we load at the requested address.
*/
if (*addr == ELFDEFNNAME(NO_ADDR))
*addr = ph->p_vaddr;
if (ph->p_align > 1) {
/*
* Make sure we are virtually aligned as we are supposed to be.
*/
diff = ph->p_vaddr - ELF_TRUNC(ph->p_vaddr, ph->p_align);
if (*addr - diff != ELF_TRUNC(*addr, ph->p_align)) {
DPRINTF("bad alignment %#jx != %#jx\n",
(uintptr_t)(*addr - diff),
(uintptr_t)ELF_TRUNC(*addr, ph->p_align));
return EINVAL;
}
/*
* But make sure to not map any pages before the start of the
* psection by limiting the difference to within a page.
*/
diff &= PAGE_MASK;
} else
diff = 0;
vmprot |= (ph->p_flags & PF_R) ? VM_PROT_READ : 0;
vmprot |= (ph->p_flags & PF_W) ? VM_PROT_WRITE : 0;
vmprot |= (ph->p_flags & PF_X) ? VM_PROT_EXECUTE : 0;
/*
* Adjust everything so it all starts on a page boundary.
*/
*addr -= diff;
offset = ph->p_offset - diff;
*size = ph->p_filesz + diff;
msize = ph->p_memsz + diff;
if (ph->p_align >= PAGE_SIZE) {
if ((ph->p_flags & PF_W) != 0) {
/*
* Because the pagedvn pager can't handle zero fill
* of the last data page if it's not page aligned,
* we map the last page readvn.
*/
psize = trunc_page(*size);
} else {
psize = round_page(*size);
}
} else {
psize = *size;
}
if (psize > 0) {
NEW_VMCMD2(vcset, ph->p_align < PAGE_SIZE ?
vmcmd_map_readvn : vmcmd_map_pagedvn, psize, *addr, vp,
offset, vmprot, flags);
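/*
 * After the first command only VMCMD_RELATIVE may carry over;
 * any base/fixed placement flags apply to the initial mapping only.
 */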
flags &= VMCMD_RELATIVE;
}
if (psize < *size) {
NEW_VMCMD2(vcset, vmcmd_map_readvn, *size - psize,
*addr + psize, vp, offset + psize, vmprot, flags);
}
/*
* Check if we need to extend the size of the segment (does
* bss extend past the next page boundary)?
*/
rm = round_page(*addr + msize);
rf = round_page(*addr + *size);
if (rm != rf) {
NEW_VMCMD2(vcset, vmcmd_map_zero, rm - rf, rf, NULLVP,
0, vmprot, flags & VMCMD_RELATIVE);
*size = msize;
}
return 0;
}
/*
* elf_load_interp():
*
* Load an interpreter pointed to by path.
*/
static int
elf_load_interp(struct lwp *l, struct exec_package *epp, char *path,
struct exec_vmcmd_set *vcset, u_long *entryoff, Elf_Addr *last)
{
int error, i;
struct vnode *vp;
Elf_Ehdr eh;
Elf_Phdr *ph = NULL;
const Elf_Phdr *base_ph;
const Elf_Phdr *last_ph;
u_long phsize;
Elf_Addr addr = *last;
struct proc *p;
bool use_topdown;
p = l->l_proc;
KASSERT(p->p_vmspace);
KASSERT(p->p_vmspace != proc0.p_vmspace);
#ifdef __USE_TOPDOWN_VM
use_topdown = epp->ep_flags & EXEC_TOPDOWN_VM;
#else
use_topdown = false;
#endif
/*
* 1. open file
* 2. read filehdr
* 3. map text, data, and bss out of it using VM_*
*/
vp = epp->ep_interp;
if (vp == NULL) {
error = emul_find_interp(l, epp, path);
if (error != 0)
return error;
vp = epp->ep_interp;
}
/* We'll tidy this ourselves - otherwise we have locking issues */
epp->ep_interp = NULL;
vn_lock(vp, LK_SHARED | LK_RETRY);
/*
* Similarly, if it's not marked as executable, or it's not a regular
* file, we don't allow it to be used.
*/
if (vp->v_type != VREG) {
error = EACCES;
goto bad;
}
if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0)
goto bad;
/*
* Check mount point. Though we're not trying to exec this binary,
* we will be executing code from it, so if the mount point
* disallows execution or set-id-ness, we punt or kill the set-id.
*/
if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
error = EACCES;
goto bad;
}
if (vp->v_mount->mnt_flag & MNT_NOSUID)
epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID);
error = vn_marktext(vp);
if (error)
goto bad;
error = exec_read(l, vp, 0, &eh, sizeof(eh), IO_NODELOCKED);
if (error != 0)
goto bad;
if ((error = elf_check_header(&eh)) != 0)
goto bad;
if (eh.e_type != ET_DYN || eh.e_phnum == 0) {
DPRINTF("bad interpreter type %#x", eh.e_type);
error = ENOEXEC;
goto bad;
}
phsize = eh.e_phnum * sizeof(Elf_Phdr);
ph = kmem_alloc(phsize, KM_SLEEP);
error = exec_read(l, vp, eh.e_phoff, ph, phsize, IO_NODELOCKED);
if (error != 0)
goto bad;
#ifdef ELF_INTERP_NON_RELOCATABLE
/*
* Evil hack: Only MIPS should be non-relocatable, and the
* psections should have a high address (typically 0x5ffe0000).
* If it's now relocatable, it should be linked at 0 and the
* psections should have zeros in the upper part of the address.
* Otherwise, force the load at the linked address.
*/
if (*last == ELF_LINK_ADDR && (ph->p_vaddr & 0xffff0000) == 0)
*last = ELFDEFNNAME(NO_ADDR);
#endif
/*
* If no position to load the interpreter was set by a probe
* function, pick the same address that a non-fixed mmap(0, ..)
* would (i.e. something safely out of the way).
*/
if (*last == ELFDEFNNAME(NO_ADDR)) {
u_long limit = 0;
/*
* Find the start and ending addresses of the psections to
* be loaded. This will give us the size.
*/
for (i = 0, base_ph = NULL; i < eh.e_phnum; i++) {
if (ph[i].p_type == PT_LOAD) {
u_long psize = ph[i].p_vaddr + ph[i].p_memsz;
if (base_ph == NULL)
base_ph = &ph[i];
if (psize > limit)
limit = psize;
}
}
if (base_ph == NULL) {
DPRINTF("no interpreter loadable sections");
error = ENOEXEC;
goto bad;
}
/*
* Now compute the size and load address.
*/
addr = (*epp->ep_esch->es_emul->e_vm_default_addr)(p,
epp->ep_daddr,
round_page(limit) - trunc_page(base_ph->p_vaddr),
use_topdown);
addr += (Elf_Addr)pax_aslr_rtld_offset(epp, base_ph->p_align,
use_topdown);
} else {
addr = *last; /* may be ELF_LINK_ADDR */
}
/*
* Load all the necessary sections
*/
for (i = 0, base_ph = NULL, last_ph = NULL; i < eh.e_phnum; i++) {
switch (ph[i].p_type) {
case PT_LOAD: {
u_long size;
int flags;
if (base_ph == NULL) {
/*
* First encountered psection is always the
* base psection. Make sure it's aligned
* properly (align down for topdown and align
* upwards for not topdown).
*/
base_ph = &ph[i];
flags = VMCMD_BASE;
if (addr == ELF_LINK_ADDR)
addr = ph[i].p_vaddr;
if (use_topdown)
addr = ELF_TRUNC(addr, ph[i].p_align);
else
addr = ELF_ROUND(addr, ph[i].p_align);
} else {
u_long limit = round_page(last_ph->p_vaddr
+ last_ph->p_memsz);
u_long base = trunc_page(ph[i].p_vaddr);
/*
* If there is a gap in between the psections,
* map it as inaccessible so nothing else
* mmap'ed will be placed there.
*/
if (limit != base) {
NEW_VMCMD2(vcset, vmcmd_map_zero,
base - limit,
limit - base_ph->p_vaddr, NULLVP,
0, VM_PROT_NONE, VMCMD_RELATIVE);
}
addr = ph[i].p_vaddr - base_ph->p_vaddr;
flags = VMCMD_RELATIVE;
}
last_ph = &ph[i];
if ((error = elf_load_psection(vcset, vp, &ph[i], &addr,
&size, flags)) != 0)
goto bad;
/*
* If entry is within this psection then this
* must contain the .text section. *entryoff is
* relative to the base psection.
*/
if (eh.e_entry >= ph[i].p_vaddr &&
eh.e_entry < (ph[i].p_vaddr + size)) {
*entryoff = eh.e_entry - base_ph->p_vaddr;
}
addr += size;
break;
}
default:
break;
}
}
kmem_free(ph, phsize);
/*
* This value is ignored if TOPDOWN.
*/
*last = addr;
vput(vp);
return 0;
bad:
if (ph != NULL)
kmem_free(ph, phsize);
vput(vp);
return error;
}
/*
* exec_elf_makecmds(): Prepare an Elf binary's exec package
*
* First, set up the various offsets/lengths in the exec package.
*
* Then, mark the text image busy (so it can be demand paged) or error
* out if this is not possible. Finally, set up vmcmds for the
* text, data, bss, and stack segments.
*/
int
exec_elf_makecmds(struct lwp *l, struct exec_package *epp)
{
Elf_Ehdr *eh = epp->ep_hdr;
Elf_Phdr *ph, *pp;
Elf_Addr phdr = 0, computed_phdr = 0, pos = 0, end_text = 0;
int error, i;
char *interp = NULL;
u_long phsize;
struct elf_args *ap;
bool is_dyn = false;
if (epp->ep_hdrvalid < sizeof(Elf_Ehdr)) {
DPRINTF("small header %#x", epp->ep_hdrvalid);
return ENOEXEC;
}
if ((error = elf_check_header(eh)) != 0)
return error;
if (eh->e_type == ET_DYN)
/* PIE, and some libs have an entry point */
is_dyn = true;
else if (eh->e_type != ET_EXEC) {
DPRINTF("bad type %#x", eh->e_type);
return ENOEXEC;
}
if (eh->e_phnum == 0) {
DPRINTF("no program headers");
return ENOEXEC;
}
/* XXX only LK_EXCLUSIVE to match all others - allow spinning */
vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
error = vn_marktext(epp->ep_vp);
if (error) {
VOP_UNLOCK(epp->ep_vp);
return error;
}
/*
* Allocate space to hold all the program headers, and read them
* from the file
*/
phsize = eh->e_phnum * sizeof(Elf_Phdr);
ph = kmem_alloc(phsize, KM_SLEEP);
error = exec_read(l, epp->ep_vp, eh->e_phoff, ph, phsize,
IO_NODELOCKED);
if (error != 0) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
epp->ep_taddr = epp->ep_tsize = ELFDEFNNAME(NO_ADDR);
epp->ep_daddr = epp->ep_dsize = ELFDEFNNAME(NO_ADDR);
for (i = 0; i < eh->e_phnum; i++) {
pp = &ph[i];
if (pp->p_type == PT_INTERP) {
if (pp->p_filesz < 2 || pp->p_filesz > MAXPATHLEN) {
DPRINTF("bad interpreter namelen %#jx",
(uintmax_t)pp->p_filesz);
error = ENOEXEC;
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
interp = PNBUF_GET();
error = exec_read(l, epp->ep_vp, pp->p_offset, interp,
pp->p_filesz, IO_NODELOCKED);
if (error != 0) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
/* Ensure interp is NUL-terminated and of the expected length */
if (strnlen(interp, pp->p_filesz) != pp->p_filesz - 1) {
DPRINTF("bad interpreter name");
error = ENOEXEC;
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
break;
}
}
/*
* On the same architecture, we may be emulating different systems.
* See which one will accept this executable.
*
* Probe functions would normally see if the interpreter (if any)
* exists. Emulation packages may possibly replace the interpreter in
* interp with a changed path (/emul/xxx/<path>).
*/
pos = ELFDEFNNAME(NO_ADDR);
if (epp->ep_esch->u.elf_probe_func) {
vaddr_t startp = (vaddr_t)pos;
error = (*epp->ep_esch->u.elf_probe_func)(l, epp, eh, interp,
&startp);
if (error) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
pos = (Elf_Addr)startp;
}
if (is_dyn && (error = elf_placedynexec(epp, eh, ph)) != 0) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
/*
* Load all the necessary sections
*/
for (i = 0; i < eh->e_phnum; i++) {
Elf_Addr addr = ELFDEFNNAME(NO_ADDR);
u_long size = 0;
switch (ph[i].p_type) {
case PT_LOAD:
if ((error = elf_load_psection(&epp->ep_vmcmds,
epp->ep_vp, &ph[i], &addr, &size, VMCMD_FIXED))
!= 0) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
/*
* Consider this as text segment, if it is executable.
* If there is more than one text segment, pick the
* largest.
*/
if (ph[i].p_flags & PF_X) {
if (epp->ep_taddr == ELFDEFNNAME(NO_ADDR) ||
size > epp->ep_tsize) {
epp->ep_taddr = addr;
epp->ep_tsize = size;
}
end_text = addr + size;
} else {
epp->ep_daddr = addr;
epp->ep_dsize = size;
}
if (ph[i].p_offset == 0) {
computed_phdr = ph[i].p_vaddr + eh->e_phoff;
}
break;
case PT_SHLIB:
/* SCO has these sections. */
case PT_INTERP:
/* Already did this one. */
case PT_DYNAMIC:
case PT_NOTE:
break;
case PT_PHDR:
/* Note address of program headers (in text segment) */
phdr = ph[i].p_vaddr;
break;
default:
/*
* Not fatal; we don't need to understand everything.
*/
break;
}
}
/* Now done with the vnode. */
VOP_UNLOCK(epp->ep_vp);
if (epp->ep_vmcmds.evs_used == 0) {
/* No VMCMD; there was no PT_LOAD section, or those
* sections were empty */
DPRINTF("no vmcommands");
error = ENOEXEC;
goto bad;
}
if (epp->ep_daddr == ELFDEFNNAME(NO_ADDR)) {
epp->ep_daddr = round_page(end_text);
epp->ep_dsize = 0;
}
/*
* Check if we found a dynamically linked binary and arrange to load
* its interpreter
*/
if (interp) {
u_int nused = epp->ep_vmcmds.evs_used;
u_long interp_offset = 0;
if ((error = elf_load_interp(l, epp, interp,
&epp->ep_vmcmds, &interp_offset, &pos)) != 0) {
goto bad;
}
if (epp->ep_vmcmds.evs_used == nused) {
/* elf_load_interp() has not set up any new VMCMD */
DPRINTF("no vmcommands for interpreter");
error = ENOEXEC;
goto bad;
}
ap = kmem_alloc(sizeof(*ap), KM_SLEEP);
ap->arg_interp = epp->ep_vmcmds.evs_cmds[nused].ev_addr;
epp->ep_entryoffset = interp_offset;
epp->ep_entry = ap->arg_interp + interp_offset;
PNBUF_PUT(interp);
interp = NULL;
} else {
epp->ep_entry = eh->e_entry;
if (epp->ep_flags & EXEC_FORCEAUX) {
ap = kmem_zalloc(sizeof(*ap), KM_SLEEP);
ap->arg_interp = (vaddr_t)NULL;
} else {
ap = NULL;
}
}
if (ap) {
ap->arg_phaddr = phdr ? phdr : computed_phdr;
ap->arg_phentsize = eh->e_phentsize;
ap->arg_phnum = eh->e_phnum;
ap->arg_entry = eh->e_entry;
epp->ep_emul_arg = ap;
epp->ep_emul_arg_free = elf_free_emul_arg;
}
#ifdef ELF_MAP_PAGE_ZERO
/* Dell SVR4 maps page zero, yeuch! */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, PAGE_SIZE, 0,
epp->ep_vp, 0, VM_PROT_READ);
#endif
error = (*epp->ep_esch->es_setup_stack)(l, epp);
if (error)
goto bad;
kmem_free(ph, phsize);
return 0;
bad:
if (interp)
PNBUF_PUT(interp);
exec_free_emul_arg(epp);
kmem_free(ph, phsize);
kill_vmcmds(&epp->ep_vmcmds);
return error;
}
int
netbsd_elf_signature(struct lwp *l, struct exec_package *epp,
Elf_Ehdr *eh)
{
size_t i;
Elf_Phdr *ph;
size_t phsize;
char *nbuf;
int error;
int isnetbsd = 0;
epp->ep_pax_flags = 0;
if (eh->e_phnum > ELF_MAXPHNUM || eh->e_phnum == 0) {
DPRINTF("no signature %#x", eh->e_phnum);
return ENOEXEC;
}
phsize = eh->e_phnum * sizeof(Elf_Phdr);
ph = kmem_alloc(phsize, KM_SLEEP);
error = exec_read(l, epp->ep_vp, eh->e_phoff, ph, phsize,
IO_NODELOCKED);
if (error)
goto out;
nbuf = kmem_alloc(ELF_MAXNOTESIZE, KM_SLEEP);
for (i = 0; i < eh->e_phnum; i++) {
const char *nptr;
size_t nlen;
if (ph[i].p_type != PT_NOTE ||
ph[i].p_filesz > ELF_MAXNOTESIZE)
continue;
nlen = ph[i].p_filesz;
error = exec_read(l, epp->ep_vp, ph[i].p_offset, nbuf, nlen,
IO_NODELOCKED);
if (error)
continue;
nptr = nbuf;
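/*
 * Each ELF note record is an Elf_Nhdr followed by the name and the
 * descriptor, each padded to a 4-byte boundary; walk the records
 * until the buffer is exhausted or a malformed entry is found.
 */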
while (nlen > 0) {
const Elf_Nhdr *np;
const char *ndata, *ndesc;
/* note header */
np = (const Elf_Nhdr *)nptr;
if (nlen < sizeof(*np)) {
break;
}
nptr += sizeof(*np);
nlen -= sizeof(*np);
/* note name */
ndata = nptr;
if (nlen < roundup(np->n_namesz, 4)) {
break;
}
nptr += roundup(np->n_namesz, 4);
nlen -= roundup(np->n_namesz, 4);
/* note description */
ndesc = nptr;
if (nlen < roundup(np->n_descsz, 4)) {
break;
}
nptr += roundup(np->n_descsz, 4);
nlen -= roundup(np->n_descsz, 4);
isnetbsd |= netbsd_elf_note(epp, np, ndata, ndesc);
}
}
kmem_free(nbuf, ELF_MAXNOTESIZE);
error = isnetbsd ? 0 : ENOEXEC;
#ifdef DEBUG_ELF
if (error)
DPRINTF("not netbsd");
#endif
out:
kmem_free(ph, phsize);
return error;
}
int
netbsd_elf_note(struct exec_package *epp,
const Elf_Nhdr *np, const char *ndata, const char *ndesc)
{
int isnetbsd = 0;
#ifdef DIAGNOSTIC
const char *badnote;
#define BADNOTE(n) badnote = (n)
#else
#define BADNOTE(n)
#endif
switch (np->n_type) {
case ELF_NOTE_TYPE_NETBSD_TAG:
/* It is us */
if (np->n_namesz == ELF_NOTE_NETBSD_NAMESZ &&
np->n_descsz == ELF_NOTE_NETBSD_DESCSZ &&
memcmp(ndata, ELF_NOTE_NETBSD_NAME,
ELF_NOTE_NETBSD_NAMESZ) == 0) {
memcpy(&epp->ep_osversion, ndesc,
ELF_NOTE_NETBSD_DESCSZ);
isnetbsd = 1;
break;
}
/*
* Ignore SuSE tags; SuSE's n_type is the same as the
* NetBSD one.
*/
if (np->n_namesz == ELF_NOTE_SUSE_NAMESZ &&
memcmp(ndata, ELF_NOTE_SUSE_NAME,
ELF_NOTE_SUSE_NAMESZ) == 0)
break;
/*
* Ignore old GCC
*/
if (np->n_namesz == ELF_NOTE_OGCC_NAMESZ &&
memcmp(ndata, ELF_NOTE_OGCC_NAME,
ELF_NOTE_OGCC_NAMESZ) == 0)
break;
BADNOTE("NetBSD tag");
goto bad;
case ELF_NOTE_TYPE_PAX_TAG:
if (np->n_namesz == ELF_NOTE_PAX_NAMESZ &&
np->n_descsz == ELF_NOTE_PAX_DESCSZ &&
memcmp(ndata, ELF_NOTE_PAX_NAME,
ELF_NOTE_PAX_NAMESZ) == 0) {
uint32_t flags;
memcpy(&flags, ndesc, sizeof(flags));
/* Convert the flags and insert them into
* the exec package. */
pax_setup_elf_flags(epp, flags);
break;
}
BADNOTE("PaX tag");
goto bad;
case ELF_NOTE_TYPE_MARCH_TAG:
/* Copy the machine arch into the package. */
if (np->n_namesz == ELF_NOTE_MARCH_NAMESZ
&& memcmp(ndata, ELF_NOTE_MARCH_NAME,
ELF_NOTE_MARCH_NAMESZ) == 0) {
/* Do not truncate the buffer */
if (np->n_descsz > sizeof(epp->ep_machine_arch)) {
BADNOTE("description size limit");
goto bad;
}
/*
* Ensure ndesc is NUL-terminated and of the
* expected length.
*/
if (strnlen(ndesc, np->n_descsz) + 1 !=
np->n_descsz) {
BADNOTE("description size");
goto bad;
}
strlcpy(epp->ep_machine_arch, ndesc,
sizeof(epp->ep_machine_arch));
break;
}
BADNOTE("march tag");
goto bad;
case ELF_NOTE_TYPE_MCMODEL_TAG:
/* arch specific check for code model */
#ifdef ELF_MD_MCMODEL_CHECK
if (np->n_namesz == ELF_NOTE_MCMODEL_NAMESZ
&& memcmp(ndata, ELF_NOTE_MCMODEL_NAME,
ELF_NOTE_MCMODEL_NAMESZ) == 0) {
ELF_MD_MCMODEL_CHECK(epp, ndesc, np->n_descsz);
break;
}
BADNOTE("mcmodel tag");
goto bad;
#endif
break;
case ELF_NOTE_TYPE_SUSE_VERSION_TAG:
break;
case ELF_NOTE_TYPE_GO_BUILDID_TAG:
break;
case ELF_NOTE_TYPE_FDO_PACKAGING_METADATA:
break;
case ELF_NOTE_TYPE_NETBSD_EMUL_TAG:
/* Ancient NetBSD version tag */
break;
default:
BADNOTE("unknown tag");
bad:
#ifdef DIAGNOSTIC
/* Ignore GNU tags */
if (np->n_namesz == ELF_NOTE_GNU_NAMESZ &&
memcmp(ndata, ELF_NOTE_GNU_NAME,
ELF_NOTE_GNU_NAMESZ) == 0)
break;
int ns = (int)np->n_namesz;
printf("%s: Unknown elf note type %d (%s): "
"[namesz=%d, descsz=%d name=%-*.*s]\n",
epp->ep_kname, np->n_type, badnote, np->n_namesz,
np->n_descsz, ns, ns, ndata);
#endif
break;
}
return isnetbsd;
}
int
netbsd_elf_probe(struct lwp *l, struct exec_package *epp, void *eh, char *itp,
vaddr_t *pos)
{
int error;
if ((error = netbsd_elf_signature(l, epp, eh)) != 0)
return error;
#ifdef ELF_MD_PROBE_FUNC
if ((error = ELF_MD_PROBE_FUNC(l, epp, eh, itp, pos)) != 0)
return error;
#elif defined(ELF_INTERP_NON_RELOCATABLE)
*pos = ELF_LINK_ADDR;
#endif
epp->ep_flags |= EXEC_FORCEAUX;
return 0;
}
void
elf_free_emul_arg(void *arg)
{
struct elf_args *ap = arg;
KASSERT(ap != NULL);
kmem_free(ap, sizeof(*ap));
}
/* $NetBSD: if_loop.c,v 1.118 2022/09/04 23:34:51 thorpej Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_loop.c 8.2 (Berkeley) 1/9/95
*/
/*
* Loopback interface driver for protocol testing and timing.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_loop.c,v 1.118 2022/09/04 23:34:51 thorpej Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_atalk.h"
#include "opt_mbuftrace.h"
#include "opt_mpls.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <sys/device.h>
#include <sys/module.h>
#include <sys/cpu.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_offload.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#endif
#ifdef INET6
#ifndef INET
#include <netinet/in.h>
#endif
#include <netinet6/in6_var.h>
#include <netinet6/in6_offload.h>
#include <netinet/ip6.h>
#endif
#ifdef MPLS
#include <netmpls/mpls.h>
#include <netmpls/mpls_var.h>
#endif
#ifdef NETATALK
#include <netatalk/at.h>
#include <netatalk/at_var.h>
#endif
#include <net/bpf.h>
#if defined(LARGE_LOMTU)
#define LOMTU (131072 + MHLEN + MLEN)
#define LOMTU_MAX LOMTU
#else
#define LOMTU (32768 + MHLEN + MLEN)
#define LOMTU_MAX (65536 + MHLEN + MLEN)
#endif
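/*
 * Illustrative sizes (assuming the usual machine-dependent MHLEN/MLEN
 * values, which add up to a few hundred bytes): the default LOMTU is
 * a bit over 32 KiB and LOMTU_MAX a bit over 64 KiB; with LARGE_LOMTU
 * both are a bit over 128 KiB.
 */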
#ifdef ALTQ
static void lostart(struct ifnet *);
#endif
static int loop_clone_create(struct if_clone *, int);
static int loop_clone_destroy(struct ifnet *);
static void loop_rtrequest(int, struct rtentry *, const struct rt_addrinfo *);
static struct if_clone loop_cloner =
IF_CLONE_INITIALIZER("lo", loop_clone_create, loop_clone_destroy);
void
loopattach(int n)
{
#ifndef _MODULE
loop_clone_create(&loop_cloner, 0); /* lo0 always exists */
#endif
}
void
loopinit(void)
{
if (lo0ifp != NULL) /* can happen in rump kernel */
return;
#ifdef _MODULE
loop_clone_create(&loop_cloner, 0); /* lo0 always exists */
#endif
if_clone_attach(&loop_cloner);
}
static int
loopdetach(void)
{
/* no detach for now; we don't allow lo0 to be deleted */
return EBUSY;
}
static int
loop_clone_create(struct if_clone *ifc, int unit)
{
struct ifnet *ifp;
ifp = if_alloc(IFT_LOOP);
if_initname(ifp, ifc->ifc_name, unit);
ifp->if_mtu = LOMTU;
ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
#ifdef NET_MPSAFE
ifp->if_extflags = IFEF_MPSAFE;
#endif
ifp->if_ioctl = loioctl;
ifp->if_output = looutput;
#ifdef ALTQ
ifp->if_start = lostart;
#endif
ifp->if_type = IFT_LOOP;
ifp->if_hdrlen = 0;
ifp->if_addrlen = 0;
ifp->if_dlt = DLT_NULL;
IFQ_SET_READY(&ifp->if_snd);
if (unit == 0)
lo0ifp = ifp;
if_initialize(ifp);
ifp->if_link_state = LINK_STATE_UP;
if_alloc_sadl(ifp);
bpf_attach(ifp, DLT_NULL, sizeof(u_int));
#ifdef MBUFTRACE
ifp->if_mowner = malloc(sizeof(struct mowner), M_DEVBUF,
M_WAITOK | M_ZERO);
strlcpy(ifp->if_mowner->mo_name, ifp->if_xname,
sizeof(ifp->if_mowner->mo_name));
MOWNER_ATTACH(ifp->if_mowner);
#endif
ifp->if_flags |= IFF_RUNNING;
if_register(ifp);
return (0);
}
static int
loop_clone_destroy(struct ifnet *ifp)
{
if (ifp == lo0ifp)
return (EPERM);
ifp->if_flags &= ~IFF_RUNNING;
#ifdef MBUFTRACE
MOWNER_DETACH(ifp->if_mowner);
free(ifp->if_mowner, M_DEVBUF);
#endif
bpf_detach(ifp);
if_detach(ifp);
if_free(ifp);
return (0);
}
int
looutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
const struct rtentry *rt)
{
pktqueue_t *pktq = NULL;
int s;
int csum_flags;
int error = 0;
size_t pktlen;
MCLAIM(m, ifp->if_mowner);
KERNEL_LOCK_UNLESS_NET_MPSAFE();
if ((m->m_flags & M_PKTHDR) == 0)
panic("looutput: no header mbuf"); if (ifp->if_flags & IFF_LOOPBACK) bpf_mtap_af(ifp, dst->sa_family, m, BPF_D_OUT); m_set_rcvif(m, ifp); if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m);
error = (rt->rt_flags & RTF_BLACKHOLE ? 0 :
rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
goto out;
}
pktlen = m->m_pkthdr.len;
if_statadd2(ifp, if_opackets, 1, if_obytes, pktlen);
#ifdef ALTQ
/*
* ALTQ on the loopback interface is just for debugging. It's
* used only for loopback interfaces, not for a simplex interface.
*/
if ((ALTQ_IS_ENABLED(&ifp->if_snd) || TBR_IS_ENABLED(&ifp->if_snd)) &&
ifp->if_start == lostart) {
/*
* If the queueing discipline needs packet classification,
* do it before prepending the link headers.
*/
IFQ_CLASSIFY(&ifp->if_snd, m, dst->sa_family);
M_PREPEND(m, sizeof(uint32_t), M_DONTWAIT);
if (m == NULL) {
if_statinc(ifp, if_oerrors);
error = ENOBUFS;
goto out;
}
*(mtod(m, uint32_t *)) = dst->sa_family;
error = if_transmit_lock(ifp, m);
goto out;
}
#endif /* ALTQ */
m_tag_delete_chain(m);
#ifdef MPLS
bool is_mpls = false;
if (rt != NULL && rt_gettag(rt) != NULL &&
rt_gettag(rt)->sa_family == AF_MPLS &&
(m->m_flags & (M_MCAST | M_BCAST)) == 0) {
union mpls_shim msh;
msh.s_addr = MPLS_GETSADDR(rt);
if (msh.shim.label != MPLS_LABEL_IMPLNULL) {
is_mpls = true;
pktq = mpls_pktq;
}
}
if (!is_mpls)
#endif
switch (dst->sa_family) {
#ifdef INET
case AF_INET:
csum_flags = m->m_pkthdr.csum_flags;
KASSERT((csum_flags & ~(M_CSUM_IPv4|M_CSUM_UDPv4)) == 0);
if (csum_flags != 0 && IN_LOOPBACK_NEED_CHECKSUM(csum_flags)) {
in_undefer_cksum(m, 0, csum_flags);
m->m_pkthdr.csum_flags = 0;
} else {
/*
* Do nothing. Leave M_CSUM_IPv4 and M_CSUM_UDPv4 set
* to indicate the checksums were already computed and are good.
*/
}
pktq = ip_pktq;
break;
#endif
#ifdef INET6
case AF_INET6:
csum_flags = m->m_pkthdr.csum_flags;
KASSERT((csum_flags & ~M_CSUM_UDPv6) == 0);
if (csum_flags != 0 && IN6_LOOPBACK_NEED_CHECKSUM(csum_flags)) {
in6_undefer_cksum(m, 0, csum_flags);
m->m_pkthdr.csum_flags = 0;
} else {
/*
* Do nothing. Leave M_CSUM_UDPv6 set to indicate
* the checksum was already computed and is good.
*/
}
m->m_flags |= M_LOOP;
pktq = ip6_pktq;
break;
#endif
#ifdef NETATALK
case AF_APPLETALK:
pktq = at_pktq2;
break;
#endif
default:
printf("%s: can't handle af%d\n", ifp->if_xname,
dst->sa_family);
m_freem(m);
error = EAFNOSUPPORT;
goto out;
}
KASSERT(pktq != NULL);
error = 0;
s = splnet();
if (__predict_true(pktq_enqueue(pktq, m, 0))) {
if_statadd2(ifp, if_ipackets, 1, if_ibytes, pktlen);
} else {
m_freem(m);
if_statinc(ifp, if_oerrors);
error = ENOBUFS;
}
splx(s);
out:
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
return error;
}
#ifdef ALTQ
static void
lostart(struct ifnet *ifp)
{
for (;;) {
pktqueue_t *pktq = NULL;
struct mbuf *m;
size_t pktlen;
uint32_t af;
int s;
IFQ_DEQUEUE(&ifp->if_snd, m);
if (m == NULL)
return;
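/*
 * looutput() prepended the destination address family as a 32-bit
 * header before handing the packet to ALTQ; recover it here and
 * strip it before enqueueing to the protocol input queue.
 */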
af = *(mtod(m, uint32_t *));
m_adj(m, sizeof(uint32_t));
switch (af) {
#ifdef INET
case AF_INET:
pktq = ip_pktq;
break;
#endif
#ifdef INET6
case AF_INET6:
m->m_flags |= M_LOOP;
pktq = ip6_pktq;
break;
#endif
#ifdef NETATALK
case AF_APPLETALK:
pktq = at_pktq2;
break;
#endif
default:
printf("%s: can't handle af%d\n", ifp->if_xname, af);
m_freem(m);
return;
}
pktlen = m->m_pkthdr.len;
KASSERT(pktq != NULL);
s = splnet();
if (__predict_false(pktq_enqueue(pktq, m, 0))) {
m_freem(m);
splx(s);
return;
}
if_statadd2(ifp, if_ipackets, 1, if_ibytes, pktlen);
splx(s);
}
}
#endif /* ALTQ */
/* ARGSUSED */
static void
loop_rtrequest(int cmd, struct rtentry *rt,
const struct rt_addrinfo *info)
{
if (rt)
rt->rt_rmx.rmx_mtu = lo0ifp->if_mtu;
}
/*
* Process an ioctl request.
*/
/* ARGSUSED */
int
loioctl(struct ifnet *ifp, u_long cmd, void *data)
{
struct ifaddr *ifa;
struct ifreq *ifr = data;
int error = 0;
switch (cmd) {
case SIOCINITIFADDR:
ifp->if_flags |= IFF_UP;
ifa = (struct ifaddr *)data;
if (ifa != NULL)
ifa->ifa_rtrequest = loop_rtrequest;
/*
* Everything else is done at a higher level.
*/
break;
case SIOCSIFMTU:
if ((unsigned)ifr->ifr_mtu > LOMTU_MAX)
error = EINVAL;
else if ((error = ifioctl_common(ifp, cmd, data)) == ENETRESET){
error = 0;
}
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if (ifr == NULL) {
error = EAFNOSUPPORT; /* XXX */
break;
}
switch (ifreq_getaddr(cmd, ifr)->sa_family) {
#ifdef INET
case AF_INET:
break;
#endif
#ifdef INET6
case AF_INET6:
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
break;
default:
error = ifioctl_common(ifp, cmd, data);
}
return (error);
}
/*
* Module infrastructure
*/
#include "if_module.h"
IF_MODULE(MODULE_CLASS_DRIVER, loop, NULL)
/* $NetBSD: uvm_pdpolicy_clock.c,v 1.40 2022/04/12 20:27:56 andvar Exp $ */
/* NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $ */
/*-
* Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_pageout.c 8.5 (Berkeley) 2/14/94
* from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#if defined(PDSIM)
#include "pdsim.h"
#else /* defined(PDSIM) */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.40 2022/04/12 20:27:56 andvar Exp $");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pdpolicy_impl.h>
#include <uvm/uvm_stat.h>
#endif /* defined(PDSIM) */
/*
* per-CPU queue of pending page status changes. 128 entries makes for a
* 1kB queue on _LP64 and has been found to be a reasonable compromise that
* keeps lock contention events and wait times low, while not using too much
* memory nor allowing global state to fall too far behind.
*/
#if !defined(CLOCK_PDQ_SIZE)
#define CLOCK_PDQ_SIZE 128
#endif /* !defined(CLOCK_PDQ_SIZE) */
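/*
 * For example, on _LP64 the queue is CLOCK_PDQ_SIZE (128) pointers of
 * 8 bytes each, i.e. 1024 bytes per CPU, matching the sizing comment
 * above.
 */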
#define PQ_INACTIVE 0x00000010 /* page is in inactive list */
#define PQ_ACTIVE 0x00000020 /* page is in active list */
#if !defined(CLOCK_INACTIVEPCT)
#define CLOCK_INACTIVEPCT 33
#endif /* !defined(CLOCK_INACTIVEPCT) */
struct uvmpdpol_globalstate {
kmutex_t lock; /* lock on state */
/* <= compiler pads here */
struct pglist s_activeq /* allocated pages, in use */
__aligned(COHERENCY_UNIT);
struct pglist s_inactiveq; /* pages between the clock hands */
int s_active;
int s_inactive;
int s_inactarg;
struct uvm_pctparam s_anonmin;
struct uvm_pctparam s_filemin;
struct uvm_pctparam s_execmin;
struct uvm_pctparam s_anonmax;
struct uvm_pctparam s_filemax;
struct uvm_pctparam s_execmax;
struct uvm_pctparam s_inactivepct;
};
struct uvmpdpol_scanstate {
bool ss_anonreact, ss_filereact, ss_execreact;
struct vm_page ss_marker;
};
static void uvmpdpol_pageactivate_locked(struct vm_page *);
static void uvmpdpol_pagedeactivate_locked(struct vm_page *);
static void uvmpdpol_pagedequeue_locked(struct vm_page *);
static bool uvmpdpol_pagerealize_locked(struct vm_page *);
static struct uvm_cpu *uvmpdpol_flush(void);
static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned;
static struct uvmpdpol_scanstate pdpol_scanstate;
PDPOL_EVCNT_DEFINE(reactexec)
PDPOL_EVCNT_DEFINE(reactfile)
PDPOL_EVCNT_DEFINE(reactanon)
static void
clock_tune(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
s->s_inactarg = UVM_PCTPARAM_APPLY(&s->s_inactivepct,
s->s_active + s->s_inactive);
if (s->s_inactarg <= uvmexp.freetarg) {
s->s_inactarg = uvmexp.freetarg + 1;
}
}
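/*
 * Worked example of clock_tune() (assuming the default CLOCK_INACTIVEPCT
 * of 33): with 3000 pages on the active + inactive queues, s_inactarg
 * becomes roughly 990 pages, bumped to uvmexp.freetarg + 1 if that is
 * larger.
 */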
void
uvmpdpol_scaninit(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
int t;
bool anonunder, fileunder, execunder;
bool anonover, fileover, execover;
bool anonreact, filereact, execreact;
int64_t freepg, anonpg, filepg, execpg;
/*
* decide which types of pages we want to reactivate instead of freeing
* to keep usage within the minimum and maximum usage limits.
* uvm_availmem() will sync the counters.
*/
freepg = uvm_availmem(false);
anonpg = cpu_count_get(CPU_COUNT_ANONCLEAN) +
cpu_count_get(CPU_COUNT_ANONDIRTY) +
cpu_count_get(CPU_COUNT_ANONUNKNOWN);
execpg = cpu_count_get(CPU_COUNT_EXECPAGES);
filepg = cpu_count_get(CPU_COUNT_FILECLEAN) +
cpu_count_get(CPU_COUNT_FILEDIRTY) +
cpu_count_get(CPU_COUNT_FILEUNKNOWN) -
execpg;
mutex_enter(&s->lock);
t = s->s_active + s->s_inactive + freepg;
anonunder = anonpg <= UVM_PCTPARAM_APPLY(&s->s_anonmin, t);
fileunder = filepg <= UVM_PCTPARAM_APPLY(&s->s_filemin, t);
execunder = execpg <= UVM_PCTPARAM_APPLY(&s->s_execmin, t);
anonover = anonpg > UVM_PCTPARAM_APPLY(&s->s_anonmax, t);
fileover = filepg > UVM_PCTPARAM_APPLY(&s->s_filemax, t);
execover = execpg > UVM_PCTPARAM_APPLY(&s->s_execmax, t);
anonreact = anonunder || (!anonover && (fileover || execover));
filereact = fileunder || (!fileover && (anonover || execover));
execreact = execunder || (!execover && (anonover || fileover));
if (filereact && execreact && (anonreact || uvm_swapisfull())) {
anonreact = filereact = execreact = false;
}
ss->ss_anonreact = anonreact;
ss->ss_filereact = filereact;
ss->ss_execreact = execreact;
memset(&ss->ss_marker, 0, sizeof(ss->ss_marker));
ss->ss_marker.flags = PG_MARKER;
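/*
 * The marker is a dummy page queued at the head of the inactive list;
 * the scan advances it past each page it visits, so the scan position
 * survives dropping s->lock while an object lock is taken.
 */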
TAILQ_INSERT_HEAD(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
mutex_exit(&s->lock);
}
void
uvmpdpol_scanfini(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
mutex_enter(&s->lock);
TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
mutex_exit(&s->lock);
}
struct vm_page *
uvmpdpol_selectvictim(krwlock_t **plock)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
struct vm_page *pg;
krwlock_t *lock;
mutex_enter(&s->lock);
while (/* CONSTCOND */ 1) {
struct vm_anon *anon;
struct uvm_object *uobj;
pg = TAILQ_NEXT(&ss->ss_marker, pdqueue);
if (pg == NULL) {
break;
}
KASSERT((pg->flags & PG_MARKER) == 0);
uvmexp.pdscans++;
/*
* acquire interlock to stabilize page identity.
* if we have caught the page in a state of flux
* deal with it and retry.
*/
mutex_enter(&pg->interlock);
if (uvmpdpol_pagerealize_locked(pg)) {
mutex_exit(&pg->interlock);
continue;
}
/*
* now prepare to move on to the next page.
*/
TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker,
pdqueue);
TAILQ_INSERT_AFTER(&pdpol_state.s_inactiveq, pg,
&ss->ss_marker, pdqueue);
/*
* enforce the minimum thresholds on different
* types of memory usage. if reusing the current
* page would reduce that type of usage below its
* minimum, reactivate the page instead and move
* on to the next page.
*/
anon = pg->uanon;
uobj = pg->uobject;
if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) {
uvmpdpol_pageactivate_locked(pg);
mutex_exit(&pg->interlock);
PDPOL_EVCNT_INCR(reactexec);
continue;
}
if (uobj && UVM_OBJ_IS_VNODE(uobj) &&
!UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) {
uvmpdpol_pageactivate_locked(pg);
mutex_exit(&pg->interlock);
PDPOL_EVCNT_INCR(reactfile);
continue;
}
if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) {
uvmpdpol_pageactivate_locked(pg);
mutex_exit(&pg->interlock);
PDPOL_EVCNT_INCR(reactanon);
continue;
}
/*
* try to lock the object that owns the page.
*
* with the page interlock held, we can drop s->lock, which
* could otherwise serve as a barrier to us getting the
* object locked, because the owner of the object's lock may
* be blocked on s->lock (i.e. a deadlock).
*
* whatever happens, uvmpd_trylockowner() will release the
* interlock. with the interlock dropped we can then
* re-acquire our own lock. the order is:
*
* object -> pdpol -> interlock.
*/
mutex_exit(&s->lock);
lock = uvmpd_trylockowner(pg);
/* pg->interlock now released */
mutex_enter(&s->lock);
if (lock == NULL) {
/* didn't get it - try the next page. */
continue;
}
/*
* move referenced pages back to active queue and skip to
* next page.
*/
if (pmap_is_referenced(pg)) {
mutex_enter(&pg->interlock);
uvmpdpol_pageactivate_locked(pg);
mutex_exit(&pg->interlock);
uvmexp.pdreact++;
rw_exit(lock);
continue;
}
/* we have a potential victim. */
*plock = lock;
break;
}
mutex_exit(&s->lock);
return pg;
}
void
uvmpdpol_balancequeue(int swap_shortage)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
int inactive_shortage;
struct vm_page *p, marker;
krwlock_t *lock;
/*
* we have done the scan to get free pages. now we work on meeting
* our inactive target.
*/
memset(&marker, 0, sizeof(marker));
marker.flags = PG_MARKER;
mutex_enter(&s->lock);
TAILQ_INSERT_HEAD(&pdpol_state.s_activeq, &marker, pdqueue);
for (;;) {
inactive_shortage =
pdpol_state.s_inactarg - pdpol_state.s_inactive;
if (inactive_shortage <= 0 && swap_shortage <= 0) {
break;
}
p = TAILQ_NEXT(&marker, pdqueue);
if (p == NULL) {
break;
}
KASSERT((p->flags & PG_MARKER) == 0);
/*
* acquire interlock to stabilize page identity.
* if we have caught the page in a state of flux
* deal with it and retry.
*/
mutex_enter(&p->interlock);
if (uvmpdpol_pagerealize_locked(p)) {
mutex_exit(&p->interlock);
continue;
}
/*
* now prepare to move on to the next page.
*/
TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
TAILQ_INSERT_AFTER(&pdpol_state.s_activeq, p, &marker,
pdqueue);
/*
* try to lock the object that owns the page. see comments
* in uvmpdpol_selectvictim().
*/
mutex_exit(&s->lock);
lock = uvmpd_trylockowner(p);
/* p->interlock now released */
mutex_enter(&s->lock);
if (lock == NULL) {
/* didn't get it - try the next page. */
continue;
}
/*
* if there's a shortage of swap slots, try to free it.
*/
if (swap_shortage > 0 && (p->flags & PG_SWAPBACKED) != 0 &&
(p->flags & PG_BUSY) == 0) {
if (uvmpd_dropswap(p)) {
swap_shortage--;
}
}
/*
* if there's a shortage of inactive pages, deactivate.
*/
if (inactive_shortage > 0) {
pmap_clear_reference(p);
mutex_enter(&p->interlock);
uvmpdpol_pagedeactivate_locked(p);
mutex_exit(&p->interlock);
uvmexp.pddeact++;
inactive_shortage--;
}
rw_exit(lock);
}
TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
mutex_exit(&s->lock);
}
static void
uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
KASSERT(mutex_owned(&s->lock));
KASSERT(mutex_owned(&pg->interlock));
KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
(PQ_INTENT_D | PQ_INTENT_SET));
if (pg->pqflags & PQ_ACTIVE) {
TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
KASSERT(pdpol_state.s_active > 0);
pdpol_state.s_active--;
}
if ((pg->pqflags & PQ_INACTIVE) == 0) {
KASSERT(pg->wire_count == 0);
TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue);
pdpol_state.s_inactive++;
}
pg->pqflags &= ~(PQ_ACTIVE | PQ_INTENT_SET);
pg->pqflags |= PQ_INACTIVE;
}
void
uvmpdpol_pagedeactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
/*
* we have to clear the reference bit now, as when it comes time to
* realize the intent we won't have the object locked any more.
*/
pmap_clear_reference(pg);
uvmpdpol_set_intent(pg, PQ_INTENT_I);
}
static void
uvmpdpol_pageactivate_locked(struct vm_page *pg)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
KASSERT(mutex_owned(&s->lock));
KASSERT(mutex_owned(&pg->interlock));
KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
(PQ_INTENT_D | PQ_INTENT_SET));
uvmpdpol_pagedequeue_locked(pg);
TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue);
pdpol_state.s_active++;
pg->pqflags &= ~(PQ_INACTIVE | PQ_INTENT_SET);
pg->pqflags |= PQ_ACTIVE;
}
void
uvmpdpol_pageactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
uvmpdpol_set_intent(pg, PQ_INTENT_A);
}
static void
uvmpdpol_pagedequeue_locked(struct vm_page *pg)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
KASSERT(mutex_owned(&s->lock));
KASSERT(mutex_owned(&pg->interlock));
if (pg->pqflags & PQ_ACTIVE) {
TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
KASSERT((pg->pqflags & PQ_INACTIVE) == 0);
KASSERT(pdpol_state.s_active > 0);
pdpol_state.s_active--;
} else if (pg->pqflags & PQ_INACTIVE) {
TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue);
KASSERT(pdpol_state.s_inactive > 0);
pdpol_state.s_inactive--;
}
pg->pqflags &= ~(PQ_ACTIVE | PQ_INACTIVE | PQ_INTENT_SET);
}
void
uvmpdpol_pagedequeue(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(mutex_owned(&pg->interlock));
uvmpdpol_set_intent(pg, PQ_INTENT_D);
}
void
uvmpdpol_pageenqueue(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
uvmpdpol_set_intent(pg, PQ_INTENT_E);
}
void
uvmpdpol_anfree(struct vm_anon *an)
{
}
bool
uvmpdpol_pageisqueued_p(struct vm_page *pg)
{
uint32_t pqflags;
/*
* if there's an intent set, we have to consider it. otherwise,
* return the actual state. we may be called unlocked for the
* purpose of assertions, which is safe due to the page lifecycle.
*/
pqflags = atomic_load_relaxed(&pg->pqflags);
if ((pqflags & PQ_INTENT_SET) != 0) {
return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D;
} else {
return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
}
}
bool
uvmpdpol_pageactivate_p(struct vm_page *pg)
{
uint32_t pqflags;
/* consider intent in preference to actual state. */
pqflags = atomic_load_relaxed(&pg->pqflags);
if ((pqflags & PQ_INTENT_SET) != 0) {
pqflags &= PQ_INTENT_MASK;
return pqflags != PQ_INTENT_A && pqflags != PQ_INTENT_E;
} else {
/*
* TODO: Enabling this may be too much of a big hammer,
* since we do get useful information from activations.
* Think about it more and maybe come up with a heuristic
* or something.
*
* return (pqflags & PQ_ACTIVE) == 0;
*/
return true;
}
}
void
uvmpdpol_estimatepageable(int *active, int *inactive)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
/*
* Don't take any locks here. This can be called from DDB, and in
* any case the numbers are stale the instant the lock is dropped,
* so it just doesn't matter.
*/
if (active) {
*active = s->s_active;
}
if (inactive) {
*inactive = s->s_inactive;
}
}
#if !defined(PDSIM)
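/*
 * Sysctl check for the three per-type minimums: reject a new value if
 * anonmin + filemin + execmin (with the candidate substituted for the
 * parameter being changed) would exceed 95% of memory.
 */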
static int
min_check(struct uvm_pctparam *pct, int t)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
int total = t;
if (pct != &s->s_anonmin) {
total += uvm_pctparam_get(&s->s_anonmin);
}
if (pct != &s->s_filemin) {
total += uvm_pctparam_get(&s->s_filemin);
}
if (pct != &s->s_execmin) {
total += uvm_pctparam_get(&s->s_execmin);
}
if (total > 95) {
return EINVAL;
}
return 0;
}
#endif /* !defined(PDSIM) */
void
uvmpdpol_init(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
mutex_init(&s->lock, MUTEX_DEFAULT, IPL_NONE);
TAILQ_INIT(&s->s_activeq);
TAILQ_INIT(&s->s_inactiveq);
uvm_pctparam_init(&s->s_inactivepct, CLOCK_INACTIVEPCT, NULL);
uvm_pctparam_init(&s->s_anonmin, 10, min_check);
uvm_pctparam_init(&s->s_filemin, 10, min_check);
uvm_pctparam_init(&s->s_execmin, 5, min_check);
uvm_pctparam_init(&s->s_anonmax, 80, NULL);
uvm_pctparam_init(&s->s_filemax, 50, NULL);
uvm_pctparam_init(&s->s_execmax, 30, NULL);
}
void
uvmpdpol_init_cpu(struct uvm_cpu *ucpu)
{
ucpu->pdq =
kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP);
ucpu->pdqhead = CLOCK_PDQ_SIZE;
ucpu->pdqtail = CLOCK_PDQ_SIZE;
}
void
uvmpdpol_reinit(void)
{
}
bool
uvmpdpol_needsscan_p(void)
{
/*
* this must be an unlocked check: can be called from interrupt.
*/
return pdpol_state.s_inactive < pdpol_state.s_inactarg;
}
void
uvmpdpol_tune(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
mutex_enter(&s->lock);
clock_tune();
mutex_exit(&s->lock);
}
/*
* uvmpdpol_pagerealize_locked: take the intended state set on a page and
* make it real. return true if any work was done.
*/
static bool
uvmpdpol_pagerealize_locked(struct vm_page *pg)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
KASSERT(mutex_owned(&s->lock));
KASSERT(mutex_owned(&pg->interlock));
switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
case PQ_INTENT_A | PQ_INTENT_SET:
case PQ_INTENT_E | PQ_INTENT_SET:
uvmpdpol_pageactivate_locked(pg);
return true;
case PQ_INTENT_I | PQ_INTENT_SET:
uvmpdpol_pagedeactivate_locked(pg);
return true;
case PQ_INTENT_D | PQ_INTENT_SET:
uvmpdpol_pagedequeue_locked(pg);
return true;
default:
return false;
}
}
/*
* uvmpdpol_flush: return the current uvm_cpu with all of its pending
* updates flushed to the global queues. this routine may block, and
* so can switch cpu. the idea is to empty the queue on whatever cpu
* we finally end up on.
*/
static struct uvm_cpu *
uvmpdpol_flush(void)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
struct uvm_cpu *ucpu;
struct vm_page *pg;
KASSERT(kpreempt_disabled());
mutex_enter(&s->lock);
for (;;) {
/*
* prefer scanning forwards (even though mutex_enter() is
* serializing) so as to not defeat any prefetch logic in
* the CPU. that means elsewhere enqueuing backwards, like
* a stack, but not so important there as pages are being
* added singly.
*
* prefetch the next "struct vm_page" while working on the
* current one. this has a measurable and very positive
* effect in reducing the amount of time spent here under
* the global lock.
*/
ucpu = curcpu()->ci_data.cpu_uvm;
KASSERT(ucpu->pdqhead <= ucpu->pdqtail);
if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) {
break;
}
pg = ucpu->pdq[ucpu->pdqhead++];
if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) {
__builtin_prefetch(ucpu->pdq[ucpu->pdqhead]);
}
mutex_enter(&pg->interlock);
pg->pqflags &= ~PQ_INTENT_QUEUED;
(void)uvmpdpol_pagerealize_locked(pg);
mutex_exit(&pg->interlock);
}
mutex_exit(&s->lock);
return ucpu;
}
/*
* uvmpdpol_pagerealize: realize any intent set on the page. in this
* implementation, that means putting the page on a per-CPU queue to be
* dealt with later.
*/
void
uvmpdpol_pagerealize(struct vm_page *pg)
{
struct uvm_cpu *ucpu;
/*
* drain the per-CPU queue if full, then enqueue the page.
*/
kpreempt_disable();
ucpu = curcpu()->ci_data.cpu_uvm;
if (__predict_false(ucpu->pdqhead == 0)) {
ucpu = uvmpdpol_flush();
}
ucpu->pdq[--(ucpu->pdqhead)] = pg;
kpreempt_enable();
}
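/*
 * The per-CPU queue is a small stack that grows downwards: pdqhead ==
 * pdqtail (== CLOCK_PDQ_SIZE) means empty, pdqhead == 0 means full.
 * For example, after three calls to uvmpdpol_pagerealize() on an
 * otherwise idle CPU:
 *
 *	pdqhead == CLOCK_PDQ_SIZE - 3, pdqtail == CLOCK_PDQ_SIZE
 *	pdq[pdqhead] .. pdq[pdqtail - 1] hold the three pending pages
 *
 * uvmpdpol_flush() and uvmpdpol_idle() then walk pdqhead forward
 * towards pdqtail, realizing each page's intent under the global lock.
 */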
/*
* uvmpdpol_idle: called from the system idle loop. periodically purge any
* pending updates back to the global queues.
*/
void
uvmpdpol_idle(struct uvm_cpu *ucpu)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
struct vm_page *pg;
KASSERT(kpreempt_disabled());
/*
* if no pages in the queue, we have nothing to do.
*/
if (ucpu->pdqhead == ucpu->pdqtail) {
ucpu->pdqtime = getticks();
return;
}
/*
* don't do this more than ~8 times a second as it would needlessly
* exert pressure.
*/
if (getticks() - ucpu->pdqtime < (hz >> 3)) {
return;
}
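/*
 * For example, at hz = 100 the threshold is 100 >> 3 = 12 ticks,
 * i.e. roughly 120ms between purges, or about 8 per second.
 */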
/*
* the idle LWP can't block, so we have to try for the lock. if we
* get it, purge the per-CPU pending update queue. continually
* check for a pending resched: in that case exit immediately.
*/
if (mutex_tryenter(&s->lock)) {
while (ucpu->pdqhead != ucpu->pdqtail) {
pg = ucpu->pdq[ucpu->pdqhead];
if (!mutex_tryenter(&pg->interlock)) {
break;
}
ucpu->pdqhead++;
pg->pqflags &= ~PQ_INTENT_QUEUED;
(void)uvmpdpol_pagerealize_locked(pg);
mutex_exit(&pg->interlock);
if (curcpu()->ci_want_resched) {
break;
}
}
if (ucpu->pdqhead == ucpu->pdqtail) {
ucpu->pdqtime = getticks();
}
mutex_exit(&s->lock);
}
}
#if !defined(PDSIM)
#include <sys/sysctl.h> /* XXX SYSCTL_DESCR */
void
uvmpdpol_sysctlsetup(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
uvm_pctparam_createsysctlnode(&s->s_anonmin, "anonmin",
SYSCTL_DESCR("Percentage of physical memory reserved "
"for anonymous application data"));
uvm_pctparam_createsysctlnode(&s->s_filemin, "filemin",
SYSCTL_DESCR("Percentage of physical memory reserved "
"for cached file data"));
uvm_pctparam_createsysctlnode(&s->s_execmin, "execmin",
SYSCTL_DESCR("Percentage of physical memory reserved "
"for cached executable data"));
uvm_pctparam_createsysctlnode(&s->s_anonmax, "anonmax",
SYSCTL_DESCR("Percentage of physical memory which will "
"be reclaimed from other usage for "
"anonymous application data"));
uvm_pctparam_createsysctlnode(&s->s_filemax, "filemax",
SYSCTL_DESCR("Percentage of physical memory which will "
"be reclaimed from other usage for cached "
"file data"));
uvm_pctparam_createsysctlnode(&s->s_execmax, "execmax",
SYSCTL_DESCR("Percentage of physical memory which will "
"be reclaimed from other usage for cached "
"executable data"));
uvm_pctparam_createsysctlnode(&s->s_inactivepct, "inactivepct",
SYSCTL_DESCR("Percentage of inactive queue of "
"the entire (active + inactive) queue"));
}
#endif /* !defined(PDSIM) */
#if defined(PDSIM)
void
pdsim_dump(const char *id)
{
#if defined(DEBUG)
/* XXX */
#endif /* defined(DEBUG) */
}
#endif /* defined(PDSIM) */
/* $NetBSD: coda_vfsops.c,v 1.90 2022/03/28 12:37:46 riastradh Exp $ */
/*
*
* Coda: an Experimental Distributed File System
* Release 3.1
*
* Copyright (c) 1987-1998 Carnegie Mellon University
* All Rights Reserved
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation, and
* that credit is given to Carnegie Mellon University in all documents
* and publicity pertaining to direct or indirect use of this code or its
* derivatives.
*
* CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS,
* SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS
* FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON
* DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
* RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF
* ANY DERIVATIVE WORK.
*
* Carnegie Mellon encourages users of this software to return any
* improvements or extensions that they make, and to grant Carnegie
* Mellon the rights to redistribute these changes without encumbrance.
*
* @(#) cfs/coda_vfsops.c,v 1.1.1.1 1998/08/29 21:26:45 rvb Exp $
*/
/*
* Mach Operating System
* Copyright (c) 1989 Carnegie-Mellon University
* All rights reserved. The CMU software License Agreement specifies
* the terms and conditions for use and redistribution.
*/
/*
* This code was written for the Coda file system at Carnegie Mellon
* University. Contributors include David Steere, James Kistler, and
* M. Satyanarayanan.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: coda_vfsops.c,v 1.90 2022/03/28 12:37:46 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/namei.h>
#include <sys/dirent.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/select.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <coda/coda.h>
#include <coda/cnode.h>
#include <coda/coda_vfsops.h>
#include <coda/coda_venus.h>
#include <coda/coda_subr.h>
#include <coda/coda_opstats.h>
/* for VN_RDEV */
#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
MODULE(MODULE_CLASS_VFS, coda, "vcoda");
#define ENTRY if(coda_vfsop_print_entry) myprintf(("Entered %s\n",__func__))
extern struct vnode *coda_ctlvp;
extern struct coda_mntinfo coda_mnttbl[NVCODA]; /* indexed by minor device number */
/* structure to keep statistics of internally generated/satisfied calls */
struct coda_op_stats coda_vfsopstats[CODA_VFSOPS_SIZE];
#define MARK_ENTRY(op) (coda_vfsopstats[op].entries++)
#define MARK_INT_SAT(op) (coda_vfsopstats[op].sat_intrn++)
#define MARK_INT_FAIL(op) (coda_vfsopstats[op].unsat_intrn++)
#define MRAK_INT_GEN(op) (coda_vfsopstats[op].gen_intrn++)
extern const struct cdevsw vcoda_cdevsw;
extern const struct vnodeopv_desc coda_vnodeop_opv_desc;
const struct vnodeopv_desc * const coda_vnodeopv_descs[] = {
&coda_vnodeop_opv_desc,
NULL,
};
struct vfsops coda_vfsops = {
.vfs_name = MOUNT_CODA,
.vfs_min_mount_data = 256,
/* This is the pathname, unlike every other fs */
.vfs_mount = coda_mount,
.vfs_start = coda_start,
.vfs_unmount = coda_unmount,
.vfs_root = coda_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = coda_nb_statvfs,
.vfs_sync = coda_sync,
.vfs_vget = coda_vget,
.vfs_loadvnode = coda_loadvnode,
.vfs_fhtovp = (void *)eopnotsupp,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = coda_init,
.vfs_done = coda_done,
.vfs_mountroot = (void *)eopnotsupp,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = coda_vnodeopv_descs
};
static int
coda_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return vfs_attach(&coda_vfsops);
case MODULE_CMD_FINI:
return vfs_detach(&coda_vfsops);
default:
return ENOTTY;
}
}
int
coda_vfsopstats_init(void)
{
int i;
for (i=0;i<CODA_VFSOPS_SIZE;i++) {
coda_vfsopstats[i].opcode = i;
coda_vfsopstats[i].entries = 0;
coda_vfsopstats[i].sat_intrn = 0;
coda_vfsopstats[i].unsat_intrn = 0;
coda_vfsopstats[i].gen_intrn = 0;
}
return 0;
}
/*
* cfs mount vfsop
* Set up mount info record and attach it to vfs struct.
*/
/*ARGSUSED*/
int
coda_mount(struct mount *vfsp, /* Allocated and initialized by mount(2) */
const char *path, /* path covered: ignored by the fs-layer */
void *data, /* Need to define a data type for this in netbsd? */
size_t *data_len)
{
struct lwp *l = curlwp;
struct vnode *dvp;
struct cnode *cp;
dev_t dev;
struct coda_mntinfo *mi;
struct vnode *rtvp;
const struct cdevsw *cdev;
CodaFid rootfid = INVAL_FID;
CodaFid ctlfid = CTL_FID;
int error;
if (data == NULL)
return EINVAL;
if (vfsp->mnt_flag & MNT_GETARGS)
return EINVAL;
ENTRY;
coda_vfsopstats_init();
coda_vnodeopstats_init();
MARK_ENTRY(CODA_MOUNT_STATS);
if (CODA_MOUNTED(vfsp)) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(EBUSY);
}
/* Validate mount device. Similar to getmdev(). */
/*
* XXX: coda passes the mount device as the entire mount args;
* all other fs pass a structure containing a pointer.
* In order to get sys_mount() to do the copyin() we've set a
* fixed default size for the filename buffer.
*/
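/*
 * In other words, the mount data here is just a NUL-terminated device
 * pathname, e.g. something like "/dev/cfs0" (name illustrative only),
 * rather than a pointer to an args structure as with other file
 * systems.
 */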
/* Ensure that namei() doesn't run off the filename buffer */
if (*data_len < 1 || *data_len > PATH_MAX ||
strnlen(data, *data_len) >= *data_len) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return EINVAL;
}
error = namei_simple_kernel((char *)data, NSM_FOLLOW_NOEMULROOT,
&dvp);
if (error) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return (error);
}
if (dvp->v_type != VCHR) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
vrele(dvp);
return(ENXIO);
}
dev = dvp->v_rdev;
vrele(dvp);
cdev = cdevsw_lookup(dev);
if (cdev == NULL) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(ENXIO);
}
/*
* See if the device table matches our expectations.
*/
if (cdev != &vcoda_cdevsw)
{
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(ENXIO);
}
if (minor(dev) >= NVCODA) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(ENXIO);
}
/*
* Initialize the mount record and link it to the vfs struct
*/
mi = &coda_mnttbl[minor(dev)];
if (!VC_OPEN(&mi->mi_vcomm)) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(ENODEV);
}
/* No initialization (here) of mi_vcomm! */
vfsp->mnt_data = mi;
vfsp->mnt_stat.f_fsidx.__fsid_val[0] = 0;
vfsp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_CODA);
vfsp->mnt_stat.f_fsid = vfsp->mnt_stat.f_fsidx.__fsid_val[0];
vfsp->mnt_stat.f_namemax = CODA_MAXNAMLEN;
mi->mi_vfsp = vfsp;
/*
* Make a root vnode to placate the Vnode interface, but don't
* actually make the CODA_ROOT call to venus until the first call
* to coda_root in case a server is down while venus is starting.
*/
cp = make_coda_node(&rootfid, vfsp, VDIR);
rtvp = CTOV(cp);
rtvp->v_vflag |= VV_ROOT;
cp = make_coda_node(&ctlfid, vfsp, VCHR);
coda_ctlvp = CTOV(cp);
/* Add vfs and rootvp to chain of vfs hanging off mntinfo */
mi->mi_vfsp = vfsp;
mi->mi_rootvp = rtvp;
/* set filesystem block size */
vfsp->mnt_stat.f_bsize = 8192; /* XXX -JJK */
vfsp->mnt_stat.f_frsize = 8192; /* XXX -JJK */
/* error is currently guaranteed to be zero, but in case some
code changes... */
CODADEBUG(1,
myprintf(("coda_mount returned %d\n",error)););
if (error)
MARK_INT_FAIL(CODA_MOUNT_STATS);
else
MARK_INT_SAT(CODA_MOUNT_STATS);
return set_statvfs_info("/coda", UIO_SYSSPACE, "CODA", UIO_SYSSPACE,
vfsp->mnt_op->vfs_name, vfsp, l);
}
int
coda_start(struct mount *vfsp, int flags)
{
ENTRY;
vftomi(vfsp)->mi_started = 1;
return (0);
}
int
coda_unmount(struct mount *vfsp, int mntflags)
{
struct coda_mntinfo *mi = vftomi(vfsp);
int active, error = 0;
ENTRY;
MARK_ENTRY(CODA_UMOUNT_STATS);
if (!CODA_MOUNTED(vfsp)) {
MARK_INT_FAIL(CODA_UMOUNT_STATS);
return(EINVAL);
}
if (mi->mi_vfsp == vfsp) { /* We found the victim */
if (!IS_UNMOUNTING(VTOC(mi->mi_rootvp)))
return (EBUSY); /* Venus is still running */
#ifdef DEBUG
printf("coda_unmount: ROOT: vp %p, cp %p\n", mi->mi_rootvp, VTOC(mi->mi_rootvp));
#endif
mi->mi_started = 0;
vrele(mi->mi_rootvp);
vrele(coda_ctlvp);
active = coda_kill(vfsp, NOT_DOWNCALL);
mi->mi_rootvp->v_vflag &= ~VV_ROOT;
error = vflush(mi->mi_vfsp, NULLVP, FORCECLOSE);
printf("coda_unmount: active = %d, vflush active %d\n", active, error);
error = 0;
/* I'm going to take this out to allow lookups to go through. I'm
* not sure it's important anyway. -- DCS 2/2/94
*/
/* vfsp->VFS_DATA = NULL; */
/* No more vfsp's to hold onto */
mi->mi_vfsp = NULL;
mi->mi_rootvp = NULL;
if (error)
MARK_INT_FAIL(CODA_UMOUNT_STATS);
else
MARK_INT_SAT(CODA_UMOUNT_STATS);
return(error);
}
return (EINVAL);
}
/*
* find root of cfs
*/
int
coda_root(struct mount *vfsp, int lktype, struct vnode **vpp)
{
struct coda_mntinfo *mi = vftomi(vfsp);
int error;
struct lwp *l = curlwp; /* XXX - bnoble */
CodaFid VFid;
static const CodaFid invalfid = INVAL_FID;
ENTRY;
MARK_ENTRY(CODA_ROOT_STATS);
if (vfsp == mi->mi_vfsp) {
if (memcmp(&VTOC(mi->mi_rootvp)->c_fid, &invalfid, sizeof(CodaFid)))
{ /* Found valid root. */
*vpp = mi->mi_rootvp;
/* On Mach, this is vref. On NetBSD, VOP_LOCK */
vref(*vpp);
vn_lock(*vpp, lktype);
MARK_INT_SAT(CODA_ROOT_STATS);
return(0);
}
}
error = venus_root(vftomi(vfsp), l->l_cred, l->l_proc, &VFid);
if (!error) {
struct cnode *cp = VTOC(mi->mi_rootvp);
/*
* Save the new rootfid in the cnode, and rekey the cnode
* with the new fid key.
*/
error = vcache_rekey_enter(vfsp, mi->mi_rootvp,
&invalfid, sizeof(CodaFid), &VFid, sizeof(CodaFid));
if (error)
goto exit;
cp->c_fid = VFid;
vcache_rekey_exit(vfsp, mi->mi_rootvp,
&invalfid, sizeof(CodaFid), &cp->c_fid, sizeof(CodaFid));
*vpp = mi->mi_rootvp;
vref(*vpp);
vn_lock(*vpp, lktype);
MARK_INT_SAT(CODA_ROOT_STATS);
goto exit;
} else if (error == ENODEV || error == EINTR) {
/* Gross hack here! */
/*
* If Venus fails to respond to the CODA_ROOT call, coda_call returns
* ENODEV. Return the uninitialized root vnode to allow vfs
* operations such as unmount to continue. Without this hack,
* there is no way to do an unmount if Venus dies before a
* successful CODA_ROOT call is done. All vnode operations
* will fail.
*/
*vpp = mi->mi_rootvp;
vref(*vpp);
vn_lock(*vpp, lktype);
MARK_INT_FAIL(CODA_ROOT_STATS);
error = 0;
goto exit;
} else {
CODADEBUG( CODA_ROOT, myprintf(("error %d in CODA_ROOT\n", error)); );
MARK_INT_FAIL(CODA_ROOT_STATS);
goto exit;
}
exit:
return(error);
}
/*
* Get file system statistics.
*/
int
coda_nb_statvfs(struct mount *vfsp, struct statvfs *sbp)
{
struct lwp *l = curlwp;
struct coda_statfs fsstat;
int error;
ENTRY;
MARK_ENTRY(CODA_STATFS_STATS);
if (!CODA_MOUNTED(vfsp)) {
/* MARK_INT_FAIL(CODA_STATFS_STATS); */
return(EINVAL);
}
/* XXX - what to do about f_flags, others? --bnoble */
/* Below is what AFS does:
#define NB_SFS_SIZ 0x895440
*/
/* Note: Normal fs's have a bsize of 0x400 == 1024 */
error = venus_statfs(vftomi(vfsp), l->l_cred, l, &fsstat);
if (!error) {
sbp->f_bsize = 8192; /* XXX */
sbp->f_frsize = 8192; /* XXX */
sbp->f_iosize = 8192; /* XXX */
sbp->f_blocks = fsstat.f_blocks;
sbp->f_bfree = fsstat.f_bfree;
sbp->f_bavail = fsstat.f_bavail;
sbp->f_bresvd = 0;
sbp->f_files = fsstat.f_files;
sbp->f_ffree = fsstat.f_ffree;
sbp->f_favail = fsstat.f_ffree;
sbp->f_fresvd = 0;
copy_statvfs_info(sbp, vfsp);
}
MARK_INT_SAT(CODA_STATFS_STATS);
return(error);
}
/*
* Flush any pending I/O.
*/
int
coda_sync(struct mount *vfsp, int waitfor,
kauth_cred_t cred)
{
ENTRY;
MARK_ENTRY(CODA_SYNC_STATS);
MARK_INT_SAT(CODA_SYNC_STATS);
return(0);
}
int
coda_vget(struct mount *vfsp, ino_t ino, int lktype,
struct vnode **vpp)
{
ENTRY;
return (EOPNOTSUPP);
}
int
coda_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
CodaFid fid;
struct cnode *cp;
extern int (**coda_vnodeop_p)(void *);
KASSERT(key_len == sizeof(CodaFid));
memcpy(&fid, key, key_len);
cp = kmem_zalloc(sizeof(*cp), KM_SLEEP);
mutex_init(&cp->c_lock, MUTEX_DEFAULT, IPL_NONE);
cp->c_fid = fid;
cp->c_vnode = vp;
vp->v_op = coda_vnodeop_p;
vp->v_tag = VT_CODA;
vp->v_type = VNON;
vp->v_data = cp;
*new_key = &cp->c_fid;
return 0;
}
/*
* fhtovp is now what vget used to be in 4.3-derived systems. For
* some silly reason, vget is now keyed by a 32 bit ino_t, rather than
* a type-specific fid.
*/
int
coda_fhtovp(struct mount *vfsp, struct fid *fhp, struct mbuf *nam,
struct vnode **vpp, int *exflagsp,
kauth_cred_t *creadanonp, int lktype)
{
struct cfid *cfid = (struct cfid *)fhp;
struct cnode *cp = 0;
int error;
struct lwp *l = curlwp; /* XXX -mach */
CodaFid VFid;
int vtype;
ENTRY;
MARK_ENTRY(CODA_VGET_STATS);
/* Check for vget of control object. */
if (IS_CTL_FID(&cfid->cfid_fid)) {
*vpp = coda_ctlvp;
vref(coda_ctlvp);
MARK_INT_SAT(CODA_VGET_STATS);
return(0);
}
error = venus_fhtovp(vftomi(vfsp), &cfid->cfid_fid, l->l_cred, l->l_proc, &VFid, &vtype);
if (error) {
CODADEBUG(CODA_VGET, myprintf(("vget error %d\n",error));)
*vpp = (struct vnode *)0;
} else {
CODADEBUG(CODA_VGET,
myprintf(("vget: %s type %d result %d\n",
coda_f2s(&VFid), vtype, error)); )
cp = make_coda_node(&VFid, vfsp, vtype);
*vpp = CTOV(cp);
}
return(error);
}
int
coda_vptofh(struct vnode *vnp, struct fid *fidp)
{
ENTRY;
return (EOPNOTSUPP);
}
void
coda_init(void)
{
ENTRY;
}
void
coda_done(void)
{
ENTRY;
}
SYSCTL_SETUP(sysctl_vfs_coda_setup, "sysctl vfs.coda subtree setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "coda",
SYSCTL_DESCR("code vfs options"),
NULL, 0, NULL, 0,
CTL_VFS, 18, CTL_EOL);
/*
* XXX the "18" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "18" is the order as taken from sys/mount.h
*/
/*
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "clusterread",
SYSCTL_DESCR( anyone? ),
NULL, 0, &doclusterread, 0,
CTL_VFS, 18, FFS_CLUSTERREAD, CTL_EOL);
*/
}
/*
* To allow for greater ease of use, some vnodes may be orphaned when
* Venus dies. Certain operations should still be allowed to go
* through, but without propagating orphan-ness. So this function will
* get a new vnode for the file from the current run of Venus.
*/
int
getNewVnode(struct vnode **vpp)
{
struct cfid cfid;
struct coda_mntinfo *mi = vftomi((*vpp)->v_mount);
ENTRY;
cfid.cfid_len = (short)sizeof(CodaFid);
cfid.cfid_fid = VTOC(*vpp)->c_fid; /* Structure assignment. */
/* XXX ? */
/* We're guessing that if set, the 1st element on the list is a
* valid vnode to use. If not, return ENODEV as venus is dead.
*/
if (mi->mi_vfsp == NULL)
return ENODEV;
return coda_fhtovp(mi->mi_vfsp, (struct fid*)&cfid, NULL, vpp,
NULL, NULL, LK_EXCLUSIVE);
}
/* Get the mount structure corresponding to a given device.
* Return NULL if no device is found or the device is not mounted.
*/
struct mount *devtomp(dev_t dev)
{
struct mount *mp;
struct vnode *vp;
if (spec_node_lookup_by_dev(VBLK, dev, VDEAD_NOWAIT, &vp) == 0) {
mp = spec_node_getmountedfs(vp);
vrele(vp);
} else {
mp = NULL;
}
return mp;
}
/* $NetBSD: kern_tc.c,v 1.76 2023/07/30 12:39:18 riastradh Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* ----------------------------------------------------------------------------
* "THE BEER-WARE LICENSE" (Revision 42):
* <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
* ---------------------------------------------------------------------------
*/
/*
* https://papers.freebsd.org/2002/phk-timecounters.files/timecounter.pdf
*/
#include <sys/cdefs.h>
/* __FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.166 2005/09/19 22:16:31 andre Exp $"); */
__KERNEL_RCSID(0, "$NetBSD: kern_tc.c,v 1.76 2023/07/30 12:39:18 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#endif
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/evcnt.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/reboot.h> /* XXX just to get AB_VERBOSE */
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/timepps.h>
#include <sys/timetc.h>
#include <sys/timex.h>
#include <sys/xcall.h>
/*
* A large step happens on boot. This constant detects such steps.
* It is relatively small so that ntp_update_second gets called enough
* in the typical 'missed a couple of seconds' case, but doesn't loop
* forever when the time step is large.
*/
#define LARGE_STEP 200
/*
* Implement a dummy timecounter which we can use until we get a real one
* in the air. This allows the console and other early stuff to use
* time services.
*/
static u_int
dummy_get_timecount(struct timecounter *tc)
{
static u_int now;
return ++now;
}
static struct timecounter dummy_timecounter = {
.tc_get_timecount = dummy_get_timecount,
.tc_counter_mask = ~0u,
.tc_frequency = 1000000,
.tc_name = "dummy",
.tc_quality = -1000000,
.tc_priv = NULL,
};
struct timehands {
/* These fields must be initialized by the driver. */
struct timecounter *th_counter; /* active timecounter */
int64_t th_adjustment; /* frequency adjustment */
/* (NTP/adjtime) */
uint64_t th_scale; /* scale factor (counter */
/* tick->time) */
uint64_t th_offset_count; /* offset at last time */
/* update (tc_windup()) */
struct bintime th_offset; /* bin (up)time at windup */
struct timeval th_microtime; /* cached microtime */
struct timespec th_nanotime; /* cached nanotime */
/* Fields not to be copied in tc_windup start with th_generation. */
volatile u_int th_generation; /* current generation */
struct timehands *th_next; /* next timehand */
};
static struct timehands th0;
static struct timehands th9 = { .th_next = &th0, };
static struct timehands th8 = { .th_next = &th9, };
static struct timehands th7 = { .th_next = &th8, };
static struct timehands th6 = { .th_next = &th7, };
static struct timehands th5 = { .th_next = &th6, };
static struct timehands th4 = { .th_next = &th5, };
static struct timehands th3 = { .th_next = &th4, };
static struct timehands th2 = { .th_next = &th3, };
static struct timehands th1 = { .th_next = &th2, };
static struct timehands th0 = {
.th_counter = &dummy_timecounter,
.th_scale = (uint64_t)-1 / 1000000,
.th_offset = { .sec = 1, .frac = 0 },
.th_generation = 1,
.th_next = &th1,
};
static struct timehands *volatile timehands = &th0;
struct timecounter *timecounter = &dummy_timecounter;
static struct timecounter *timecounters = &dummy_timecounter;
/* used by savecore(8) */
time_t time_second_legacy asm("time_second");
#ifdef __HAVE_ATOMIC64_LOADSTORE
volatile time_t time__second __cacheline_aligned = 1;
volatile time_t time__uptime __cacheline_aligned = 1;
#else
static volatile struct {
uint32_t lo, hi;
} time__uptime32 __cacheline_aligned = {
.lo = 1,
}, time__second32 __cacheline_aligned = {
.lo = 1,
};
#endif
static struct {
struct bintime bin;
volatile unsigned gen; /* even when stable, odd when changing */
} timebase __cacheline_aligned;
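/*
 * Writers (tc_setclock() and tc_windup() below) update it as:
 *
 *	timebase.gen |= 1;		change in progress, gen now odd
 *	membar_producer();
 *	timebase.bin = <new base>;
 *	membar_producer();
 *	timebase.gen++;			commit, gen even again
 *
 * while readers (getbinboottime()) spin while gen is odd and retry if
 * gen changed across the read.
 */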
static int timestepwarnings;
kmutex_t timecounter_lock;
static u_int timecounter_mods;
static volatile int timecounter_removals = 1;
static u_int timecounter_bad;
#ifdef __HAVE_ATOMIC64_LOADSTORE
static inline void
setrealuptime(time_t second, time_t uptime)
{
time_second_legacy = second;
atomic_store_relaxed(&time__second, second);
atomic_store_relaxed(&time__uptime, uptime);
}
#else
static inline void
setrealuptime(time_t second, time_t uptime)
{
uint32_t seclo = second & 0xffffffff, sechi = second >> 32;
uint32_t uplo = uptime & 0xffffffff, uphi = uptime >> 32;
KDASSERT(mutex_owned(&timecounter_lock));
time_second_legacy = second;
/*
* Fast path -- no wraparound, just updating the low bits, so
* no need for seqlocked access.
*/
if (__predict_true(sechi == time__second32.hi) &&
__predict_true(uphi == time__uptime32.hi)) {
atomic_store_relaxed(&time__second32.lo, seclo);
atomic_store_relaxed(&time__uptime32.lo, uplo);
return;
}
atomic_store_relaxed(&time__second32.hi, 0xffffffff);
atomic_store_relaxed(&time__uptime32.hi, 0xffffffff);
membar_producer();
atomic_store_relaxed(&time__second32.lo, seclo);
atomic_store_relaxed(&time__uptime32.lo, uplo);
membar_producer();
atomic_store_relaxed(&time__second32.hi, sechi);
atomic_store_relaxed(&time__uptime32.hi, uphi);
}
time_t
getrealtime(void)
{
uint32_t lo, hi;
do {
for (;;) {
hi = atomic_load_relaxed(&time__second32.hi);
if (__predict_true(hi != 0xffffffff))
break;
SPINLOCK_BACKOFF_HOOK;
}
membar_consumer();
lo = atomic_load_relaxed(&time__second32.lo);
membar_consumer();
} while (hi != atomic_load_relaxed(&time__second32.hi));
return ((time_t)hi << 32) | lo;
}
time_t
getuptime(void)
{
uint32_t lo, hi;
do {
for (;;) {
hi = atomic_load_relaxed(&time__uptime32.hi);
if (__predict_true(hi != 0xffffffff))
break;
SPINLOCK_BACKOFF_HOOK;
}
membar_consumer();
lo = atomic_load_relaxed(&time__uptime32.lo);
membar_consumer();
} while (hi != atomic_load_relaxed(&time__uptime32.hi));
return ((time_t)hi << 32) | lo;
}
time_t
getboottime(void)
{
return getrealtime() - getuptime();
}
uint32_t
getuptime32(void)
{
return atomic_load_relaxed(&time__uptime32.lo);
}
#endif /* !defined(__HAVE_ATOMIC64_LOADSTORE) */
/*
* sysctl helper routine for kern.timecounter.hardware
*/
static int
sysctl_kern_timecounter_hardware(SYSCTLFN_ARGS)
{
struct sysctlnode node;
int error;
char newname[MAX_TCNAMELEN];
struct timecounter *newtc, *tc;
tc = timecounter;
strlcpy(newname, tc->tc_name, sizeof(newname));
node = *rnode;
node.sysctl_data = newname;
node.sysctl_size = sizeof(newname);
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error ||
newp == NULL ||
strncmp(newname, tc->tc_name, sizeof(newname)) == 0)
return error;
if (l != NULL && (error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS, newname,
NULL, NULL)) != 0)
return error;
if (!cold)
mutex_spin_enter(&timecounter_lock);
error = EINVAL;
for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
if (strcmp(newname, newtc->tc_name) != 0)
continue;
/* Warm up new timecounter. */
(void)newtc->tc_get_timecount(newtc);
(void)newtc->tc_get_timecount(newtc);
timecounter = newtc;
error = 0;
break;
}
if (!cold)
mutex_spin_exit(&timecounter_lock);
return error;
}
static int
sysctl_kern_timecounter_choice(SYSCTLFN_ARGS)
{
char buf[MAX_TCNAMELEN+48];
char *where;
const char *spc;
struct timecounter *tc;
size_t needed, left, slen;
int error, mods;
if (newp != NULL)
return EPERM;
if (namelen != 0)
return EINVAL;
mutex_spin_enter(&timecounter_lock);
retry:
spc = "";
error = 0;
needed = 0;
left = *oldlenp;
where = oldp;
for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) {
if (where == NULL) {
needed += sizeof(buf); /* be conservative */
} else {
slen = snprintf(buf, sizeof(buf), "%s%s(q=%d, f=%" PRId64
" Hz)", spc, tc->tc_name, tc->tc_quality,
tc->tc_frequency);
if (left < slen + 1)
break;
mods = timecounter_mods;
mutex_spin_exit(&timecounter_lock);
error = copyout(buf, where, slen + 1);
mutex_spin_enter(&timecounter_lock);
if (mods != timecounter_mods) {
goto retry;
}
spc = " ";
where += slen;
needed += slen;
left -= slen;
}
}
mutex_spin_exit(&timecounter_lock);
*oldlenp = needed;
return error;
}
SYSCTL_SETUP(sysctl_timecounter_setup, "sysctl timecounter setup")
{
const struct sysctlnode *node;
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "timecounter",
SYSCTL_DESCR("time counter information"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
if (node != NULL) {
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "choice",
SYSCTL_DESCR("available counters"),
sysctl_kern_timecounter_choice, 0, NULL, 0,
CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRING, "hardware",
SYSCTL_DESCR("currently active time counter"),
sysctl_kern_timecounter_hardware, 0, NULL, MAX_TCNAMELEN,
CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "timestepwarnings",
SYSCTL_DESCR("log time steps"),
NULL, 0, &timestepwarnings, 0,
CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
}
}
#ifdef TC_COUNTERS
#define TC_STATS(name) \
static struct evcnt n##name = \
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter", #name); \
EVCNT_ATTACH_STATIC(n##name)
TC_STATS(binuptime); TC_STATS(nanouptime); TC_STATS(microuptime);
TC_STATS(bintime); TC_STATS(nanotime); TC_STATS(microtime);
TC_STATS(getbinuptime); TC_STATS(getnanouptime); TC_STATS(getmicrouptime);
TC_STATS(getbintime); TC_STATS(getnanotime); TC_STATS(getmicrotime);
TC_STATS(setclock);
#define TC_COUNT(var) var.ev_count++
#undef TC_STATS
#else
#define TC_COUNT(var) /* nothing */
#endif /* TC_COUNTERS */
static void tc_windup(void);
/*
* Return the difference between the timehands' counter value now and what
* it was when we copied it to the timehands' offset_count.
*/
static inline u_int
tc_delta(struct timehands *th)
{
struct timecounter *tc;
tc = th->th_counter;
return (tc->tc_get_timecount(tc) -
th->th_offset_count) & tc->tc_counter_mask;
}
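/*
 * For example, with a 32 bit counter (tc_counter_mask == 0xffffffff)
 * whose offset count was 0xfffffff0 and which now reads 0x00000010,
 * the masked subtraction yields 0x20, i.e. 32 ticks: a single wrap of
 * the hardware counter is handled transparently.
 */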
/*
* Functions for reading the time. We have to loop until we are sure that
* the timehands that we operated on was not updated under our feet. See
* the comment in <sys/timevar.h> for a description of these 12 functions.
*/
void
binuptime(struct bintime *bt)
{
struct timehands *th;
lwp_t *l;
u_int lgen, gen;
TC_COUNT(nbinuptime);
/*
* Provide exclusion against tc_detach().
*
* We record the number of timecounter removals before accessing
* timecounter state. Note that the LWP can be using multiple
* "generations" at once, due to interrupts (interrupted while in
* this function). Hardware interrupts will borrow the interrupted
* LWP's l_tcgen value for this purpose, and can themselves be
* interrupted by higher priority interrupts. In this case we need
* to ensure that the oldest generation in use is recorded.
*
* splsched() is too expensive to use, so we take care to structure
* this code in such a way that it is not required. Likewise, we
* do not disable preemption.
*
* Memory barriers are also too expensive to use for such a
* performance critical function. The good news is that we do not
* need memory barriers for this type of exclusion, as the thread
* updating timecounter_removals will issue a broadcast cross call
* before inspecting our l_tcgen value (this elides memory ordering
* issues).
*
* XXX If the author of the above comment knows how to make it
* safe to avoid memory barriers around the access to
* th->th_generation, I'm all ears.
*/
l = curlwp;
lgen = l->l_tcgen;
if (__predict_true(lgen == 0)) {
l->l_tcgen = timecounter_removals;
}
__insn_barrier();
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*bt = th->th_offset;
bintime_addx(bt, th->th_scale * tc_delta(th));
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
__insn_barrier();
l->l_tcgen = lgen;
}
void
nanouptime(struct timespec *tsp)
{
struct bintime bt;
TC_COUNT(nnanouptime);
binuptime(&bt);
bintime2timespec(&bt, tsp);
}
void
microuptime(struct timeval *tvp)
{
struct bintime bt;
TC_COUNT(nmicrouptime);
binuptime(&bt);
bintime2timeval(&bt, tvp);
}
void
bintime(struct bintime *bt)
{
struct bintime boottime;
TC_COUNT(nbintime);
binuptime(bt);
getbinboottime(&boottime);
bintime_add(bt, &boottime);
}
void
nanotime(struct timespec *tsp)
{
struct bintime bt;
TC_COUNT(nnanotime);
bintime(&bt);
bintime2timespec(&bt, tsp);
}
void
microtime(struct timeval *tvp)
{
struct bintime bt;
TC_COUNT(nmicrotime);
bintime(&bt);
bintime2timeval(&bt, tvp);
}
void
getbinuptime(struct bintime *bt)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetbinuptime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*bt = th->th_offset;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getnanouptime(struct timespec *tsp)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetnanouptime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
bintime2timespec(&th->th_offset, tsp);
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getmicrouptime(struct timeval *tvp)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetmicrouptime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
bintime2timeval(&th->th_offset, tvp);
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getbintime(struct bintime *bt)
{
struct timehands *th;
struct bintime boottime;
u_int gen;
TC_COUNT(ngetbintime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*bt = th->th_offset;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
getbinboottime(&boottime);
bintime_add(bt, &boottime);
}
static inline void
dogetnanotime(struct timespec *tsp)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetnanotime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*tsp = th->th_nanotime;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getnanotime(struct timespec *tsp)
{
dogetnanotime(tsp);
}
void dtrace_getnanotime(struct timespec *tsp);
void
dtrace_getnanotime(struct timespec *tsp)
{
dogetnanotime(tsp);
}
void
getmicrotime(struct timeval *tvp)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetmicrotime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*tvp = th->th_microtime;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getnanoboottime(struct timespec *tsp)
{
struct bintime bt;
getbinboottime(&bt);
bintime2timespec(&bt, tsp);
}
void
getmicroboottime(struct timeval *tvp)
{
struct bintime bt;
getbinboottime(&bt);
bintime2timeval(&bt, tvp);
}
void
getbinboottime(struct bintime *basep)
{
struct bintime base;
unsigned gen;
do {
/* Spin until the timebase isn't changing. */
while ((gen = atomic_load_relaxed(&timebase.gen)) & 1)
SPINLOCK_BACKOFF_HOOK;
/* Read out a snapshot of the timebase. */
membar_consumer();
base = timebase.bin;
membar_consumer();
/* Restart if it changed while we were reading. */
} while (gen != atomic_load_relaxed(&timebase.gen));
*basep = base;
}
/*
* Initialize a new timecounter and possibly use it.
*/
void
tc_init(struct timecounter *tc)
{
u_int u;
KASSERTMSG(tc->tc_next == NULL, "timecounter %s already initialised",
tc->tc_name);
u = tc->tc_frequency / tc->tc_counter_mask;
/* XXX: We need some margin here, 10% is a guess */
u *= 11;
u /= 10;
if (u > hz && tc->tc_quality >= 0) {
tc->tc_quality = -2000;
aprint_verbose(
"timecounter: Timecounter \"%s\" frequency %ju Hz",
tc->tc_name, (uintmax_t)tc->tc_frequency);
aprint_verbose(" -- Insufficient hz, needs at least %u\n", u);
} else if (tc->tc_quality >= 0 || bootverbose) {
aprint_verbose(
"timecounter: Timecounter \"%s\" frequency %ju Hz "
"quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency,
tc->tc_quality);
}
mutex_spin_enter(&timecounter_lock);
tc->tc_next = timecounters;
timecounters = tc;
timecounter_mods++;
/*
* Never automatically use a timecounter with negative quality.
* Even though we run on the dummy counter, switching here may be
* worse since this timecounter may not be monotonic.
*/
if (tc->tc_quality >= 0 && (tc->tc_quality > timecounter->tc_quality ||
(tc->tc_quality == timecounter->tc_quality &&
tc->tc_frequency > timecounter->tc_frequency))) {
(void)tc->tc_get_timecount(tc);
(void)tc->tc_get_timecount(tc);
timecounter = tc;
tc_windup();
}
mutex_spin_exit(&timecounter_lock);
}
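/*
 * A hardware driver would typically register its counter roughly like
 * this (the "foo" names and register layout are illustrative only):
 *
 *	static u_int
 *	foo_get_timecount(struct timecounter *tc)
 *	{
 *		return bus_space_read_4(foo_iot, foo_ioh, FOO_COUNT_REG);
 *	}
 *
 *	static struct timecounter foo_timecounter = {
 *		.tc_get_timecount = foo_get_timecount,
 *		.tc_counter_mask = 0xffffffff,
 *		.tc_frequency = 25000000,	(25 MHz, for example)
 *		.tc_name = "foo",
 *		.tc_quality = 100,
 *	};
 *
 *	tc_init(&foo_timecounter);
 *
 * tc_init() then decides, based on quality and frequency, whether to
 * switch to the new counter immediately.
 */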
/*
* Pick a new timecounter due to the existing counter going bad.
*/
static void
tc_pick(void)
{
struct timecounter *best, *tc;
KASSERT(mutex_owned(&timecounter_lock));
for (best = tc = timecounters; tc != NULL; tc = tc->tc_next) {
if (tc->tc_quality > best->tc_quality)
best = tc;
else if (tc->tc_quality < best->tc_quality)
continue;
else if (tc->tc_frequency > best->tc_frequency)
best = tc;
}
(void)best->tc_get_timecount(best);
(void)best->tc_get_timecount(best);
timecounter = best;
}
/*
* A timecounter has gone bad, arrange to pick a new one at the next
* clock tick.
*/
void
tc_gonebad(struct timecounter *tc)
{
tc->tc_quality = -100;
membar_producer();
atomic_inc_uint(&timecounter_bad);
}
/*
* Stop using a timecounter and remove it from the timecounters list.
*/
int
tc_detach(struct timecounter *target)
{
struct timecounter *tc;
struct timecounter **tcp = NULL;
int removals;
lwp_t *l;
/* First, find the timecounter. */
mutex_spin_enter(&timecounter_lock);
for (tcp = &timecounters, tc = timecounters;
tc != NULL;
tcp = &tc->tc_next, tc = tc->tc_next) {
if (tc == target)
break;
}
if (tc == NULL) {
mutex_spin_exit(&timecounter_lock);
return ESRCH;
}
/* And now, remove it. */
*tcp = tc->tc_next;
if (timecounter == target) {
tc_pick();
tc_windup();
}
timecounter_mods++;
removals = timecounter_removals++;
mutex_spin_exit(&timecounter_lock);
/*
* We now have to determine if any threads in the system are still
* making use of this timecounter.
*
* We issue a broadcast cross call to elide memory ordering issues,
* then scan all LWPs in the system looking at each's timecounter
* generation number. We need to see a value of zero (not actively
* using a timecounter) or a value greater than our removal value.
*
* We may race with threads that read `timecounter_removals' and
* then get preempted before updating `l_tcgen'. This is not
* a problem, since it means that these threads have not yet started
* accessing timecounter state. All we do need is one clean
* snapshot of the system where every thread appears not to be using
* old timecounter state.
*/
for (;;) {
xc_barrier(0);
mutex_enter(&proc_lock);
LIST_FOREACH(l, &alllwp, l_list) {
if (l->l_tcgen == 0 || l->l_tcgen > removals) {
/*
* Not using timecounter or old timecounter
* state at time of our xcall or later.
*/
continue;
}
break;
}
mutex_exit(&proc_lock);
/*
* If the timecounter is still in use, wait at least 10ms
* before retrying.
*/
if (l == NULL) {
break;
}
(void)kpause("tcdetach", false, mstohz(10), NULL);
}
tc->tc_next = NULL;
return 0;
}
/* Report the frequency of the current timecounter. */
uint64_t
tc_getfrequency(void)
{
return atomic_load_consume(&timehands)->th_counter->tc_frequency;
}
/*
* Step our concept of UTC. This is done by modifying our estimate of
* when we booted.
*/
void
tc_setclock(const struct timespec *ts)
{
struct timespec ts2;
struct bintime bt, bt2;
mutex_spin_enter(&timecounter_lock);
TC_COUNT(nsetclock);
binuptime(&bt2);
timespec2bintime(ts, &bt);
bintime_sub(&bt, &bt2);
bintime_add(&bt2, &timebase.bin);
timebase.gen |= 1; /* change in progress */
membar_producer();
timebase.bin = bt;
membar_producer();
timebase.gen++; /* commit change */
tc_windup();
mutex_spin_exit(&timecounter_lock);
if (timestepwarnings) {
bintime2timespec(&bt2, &ts2);
log(LOG_INFO,
"Time stepped from %lld.%09ld to %lld.%09ld\n",
(long long)ts2.tv_sec, ts2.tv_nsec,
(long long)ts->tv_sec, ts->tv_nsec);
}
}
/*
* Initialize the next struct timehands in the ring and make
* it the active timehands. Along the way we might switch to a different
* timecounter and/or do seconds processing in NTP. Slightly magic.
*/
static void
tc_windup(void)
{
struct bintime bt;
struct timehands *th, *tho;
uint64_t scale;
u_int delta, ncount, ogen;
int i, s_update;
time_t t;
KASSERT(mutex_owned(&timecounter_lock));
s_update = 0;
/*
* Make the next timehands a copy of the current one, but do not
* overwrite the generation or next pointer. While we update
* the contents, the generation must be zero. Ensure global
* visibility of the generation before proceeding.
*/
tho = timehands;
th = tho->th_next;
ogen = th->th_generation;
th->th_generation = 0;
membar_producer();
bcopy(tho, th, offsetof(struct timehands, th_generation));
/*
* Capture a timecounter delta on the current timecounter and if
* changing timecounters, a counter value from the new timecounter.
* Update the offset fields accordingly.
*/
delta = tc_delta(th);
if (th->th_counter != timecounter)
ncount = timecounter->tc_get_timecount(timecounter);
else
ncount = 0;
th->th_offset_count += delta;
bintime_addx(&th->th_offset, th->th_scale * delta);
/*
* Hardware latching timecounters may not generate interrupts on
* PPS events, so instead we poll them. There is a finite risk that
* the hardware might capture a count which is later than the one we
* got above, and therefore possibly in the next NTP second which might
* have a different rate than the current NTP second. It doesn't
* matter in practice.
*/
if (tho->th_counter->tc_poll_pps)
tho->th_counter->tc_poll_pps(tho->th_counter);
/*
* Deal with NTP second processing. The for loop normally
* iterates at most once, but in extreme situations it might
* keep NTP sane if timeouts are not run for several seconds.
* At boot, the time step can be large when the TOD hardware
* has been read, so on really large steps, we call
* ntp_update_second only twice. We need to call it twice in
* case we missed a leap second.
* If NTP is not compiled in ntp_update_second still calculates
* the adjustment resulting from adjtime() calls.
*/
bt = th->th_offset;
bintime_add(&bt, &timebase.bin);
i = bt.sec - tho->th_microtime.tv_sec;
if (i > LARGE_STEP)
i = 2;
for (; i > 0; i--) {
t = bt.sec;
ntp_update_second(&th->th_adjustment, &bt.sec);
s_update = 1;
if (bt.sec != t) {
timebase.gen |= 1; /* change in progress */
membar_producer();
timebase.bin.sec += bt.sec - t;
membar_producer();
timebase.gen++; /* commit change */
}
}
/* Update the UTC timestamps used by the get*() functions. */
/* XXX shouldn't do this here. Should force non-`get' versions. */
bintime2timeval(&bt, &th->th_microtime);
bintime2timespec(&bt, &th->th_nanotime);
/* Now is a good time to change timecounters. */
if (th->th_counter != timecounter) {
th->th_counter = timecounter;
th->th_offset_count = ncount;
s_update = 1;
}
/*-
* Recalculate the scaling factor. We want the number of 1/2^64
* fractions of a second per period of the hardware counter, taking
* into account the th_adjustment factor which the NTP PLL/adjtime(2)
* processing provides us with.
*
* The th_adjustment is nanoseconds per second with 32 bit binary
* fraction and we want 64 bit binary fraction of second:
*
* x = a * 2^32 / 10^9 = a * 4.294967296
*
* The range of th_adjustment is +/- 5000PPM so inside a 64bit int
* we can only multiply by about 850 without overflowing, but that
* leaves suitably precise fractions for multiply before divide.
*
* Divide before multiply with a fraction of 2199/512 results in a
* systematic undercompensation of 10PPM of th_adjustment. On a
* 5000PPM adjustment this is a 0.05PPM error. This is acceptable.
*
* We happily sacrifice the lowest of the 64 bits of our result
* to the goddess of code clarity.
*
*/
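/*
 * Numerically: 2^32 / 10^9 = 4.294967296 while 2199 / 512 =
 * 4.294921875, so the approximation is low by about 1.06e-5 of its
 * value, which is where the ~10PPM undercompensation figure above
 * comes from.
 */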
if (s_update) {
scale = (uint64_t)1 << 63;
scale += (th->th_adjustment / 1024) * 2199;
scale /= th->th_counter->tc_frequency;
th->th_scale = scale * 2;
}
/*
* Now that the struct timehands is again consistent, set the new
* generation number, making sure to not make it zero. Ensure
* changes are globally visible before changing.
*/
if (++ogen == 0)
ogen = 1;
membar_producer();
th->th_generation = ogen;
/*
* Go live with the new struct timehands. Ensure changes are
* globally visible before changing.
*/
setrealuptime(th->th_microtime.tv_sec, th->th_offset.sec);
atomic_store_release(&timehands, th);
/*
* Force users of the old timehand to move on. This is
* necessary for MP systems; we need to ensure that the
* consumers will move away from the old timehand before
* we begin updating it again when we eventually wrap
* around.
*/
if (++tho->th_generation == 0)
tho->th_generation = 1;
}
/*
* RFC 2783 PPS-API implementation.
*/
int
pps_ioctl(u_long cmd, void *data, struct pps_state *pps)
{
pps_params_t *app;
pps_info_t *pipi;
#ifdef PPS_SYNC
int *epi;
#endif
KASSERT(mutex_owned(&timecounter_lock));
KASSERT(pps != NULL);
switch (cmd) {
case PPS_IOC_CREATE:
return 0;
case PPS_IOC_DESTROY:
return 0;
case PPS_IOC_SETPARAMS:
app = (pps_params_t *)data;
if (app->mode & ~pps->ppscap)
return EINVAL;
pps->ppsparam = *app;
return 0;
case PPS_IOC_GETPARAMS:
app = (pps_params_t *)data;
*app = pps->ppsparam;
app->api_version = PPS_API_VERS_1;
return 0;
case PPS_IOC_GETCAP:
*(int*)data = pps->ppscap;
return 0;
case PPS_IOC_FETCH:
pipi = (pps_info_t *)data;
pps->ppsinfo.current_mode = pps->ppsparam.mode;
*pipi = pps->ppsinfo;
return 0;
case PPS_IOC_KCBIND:
#ifdef PPS_SYNC
epi = (int *)data;
/* XXX Only root should be able to do this */
if (*epi & ~pps->ppscap)
return EINVAL;
pps->kcmode = *epi;
return 0;
#else
return EOPNOTSUPP;
#endif
default:
return EPASSTHROUGH;
}
}
void
pps_init(struct pps_state *pps)
{
KASSERT(mutex_owned(&timecounter_lock));
pps->ppscap |= PPS_TSFMT_TSPEC;
if (pps->ppscap & PPS_CAPTUREASSERT)
pps->ppscap |= PPS_OFFSETASSERT;
if (pps->ppscap & PPS_CAPTURECLEAR)
pps->ppscap |= PPS_OFFSETCLEAR;
}
/*
* capture a timestamp in the pps structure
*/
void
pps_capture(struct pps_state *pps)
{
struct timehands *th;
KASSERT(mutex_owned(&timecounter_lock));
KASSERT(pps != NULL);
th = timehands;
pps->capgen = th->th_generation;
pps->capth = th;
pps->capcount = (uint64_t)tc_delta(th) + th->th_offset_count;
if (pps->capgen != th->th_generation)
pps->capgen = 0;
}
#ifdef PPS_DEBUG
int ppsdebug = 0;
#endif
/*
* process a pps_capture()ed event
*/
void
pps_event(struct pps_state *pps, int event)
{
pps_ref_event(pps, event, NULL, PPS_REFEVNT_PPS|PPS_REFEVNT_CAPTURE);
}
/*
* extended pps api / kernel pll/fll entry point
*
* feed reference time stamps to PPS engine
*
* will simulate a PPS event and feed
* the NTP PLL/FLL if requested.
*
* the ref time stamps should arrive roughly once a
* second; they do not need to be exactly in phase with
* the UTC second, but should be close to it.  this
* relaxation of requirements allows callout driven
* timestamping mechanisms to feed the pps capture/kernel
* pll logic.
*
* calling pattern is:
* pps_capture() (for PPS_REFEVNT_{CAPTURE|CAPCUR})
* read timestamp from reference source
* pps_ref_event()
*
* supported refmodes:
* PPS_REFEVNT_CAPTURE
* use system timestamp of pps_capture()
* PPS_REFEVNT_CURRENT
* use system timestamp of this call
* PPS_REFEVNT_CAPCUR
* use average of read capture and current system time stamp
* PPS_REFEVNT_PPS
* assume timestamp on second mark - ref_ts is ignored
*
*/
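/*
 * Concretely, a driver feeding a once-per-second reference might do
 * roughly the following (with timecounter_lock held, and with its
 * reference timestamp already converted to a struct bintime ref_bt;
 * names illustrative):
 *
 *	pps_capture(pps);
 *	(read the timestamp from the reference source into ref_bt)
 *	pps_ref_event(pps, PPS_CAPTUREASSERT, &ref_bt, PPS_REFEVNT_CAPCUR);
 */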
void
pps_ref_event(struct pps_state *pps,
int event,
struct bintime *ref_ts,
int refmode
)
{
struct bintime bt; /* current time */
struct bintime btd; /* time difference */
struct bintime bt_ref; /* reference time */
struct timespec ts, *tsp, *osp;
struct timehands *th;
uint64_t tcount, acount, dcount, *pcount;
int foff, gen;
#ifdef PPS_SYNC
int fhard;
#endif
pps_seq_t *pseq;
KASSERT(mutex_owned(&timecounter_lock));
KASSERT(pps != NULL);
/* pick up current time stamp if needed */
if (refmode & (PPS_REFEVNT_CURRENT|PPS_REFEVNT_CAPCUR)) {
/* pick up current time stamp */
th = timehands;
gen = th->th_generation;
tcount = (uint64_t)tc_delta(th) + th->th_offset_count;
if (gen != th->th_generation)
gen = 0;
/* If the timecounter was wound up underneath us, bail out. */
if (pps->capgen == 0 ||
pps->capgen != pps->capth->th_generation ||
gen == 0 ||
gen != pps->capgen) {
#ifdef PPS_DEBUG
if (ppsdebug & 0x1) {
log(LOG_DEBUG,
"pps_ref_event(pps=%p, event=%d, ...): DROP (wind-up)\n",
pps, event);
}
#endif
return;
}
} else {
tcount = 0; /* keep GCC happy */
}
#ifdef PPS_DEBUG
if (ppsdebug & 0x1) {
struct timespec tmsp;
if (ref_ts == NULL) {
tmsp.tv_sec = 0;
tmsp.tv_nsec = 0;
} else {
bintime2timespec(ref_ts, &tmsp);
}
log(LOG_DEBUG,
"pps_ref_event(pps=%p, event=%d, ref_ts=%"PRIi64
".%09"PRIi32", refmode=0x%1x)\n",
pps, event, tmsp.tv_sec, (int32_t)tmsp.tv_nsec, refmode);
}
#endif
/* setup correct event references */
if (event == PPS_CAPTUREASSERT) {
tsp = &pps->ppsinfo.assert_timestamp;
osp = &pps->ppsparam.assert_offset;
foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
#ifdef PPS_SYNC
fhard = pps->kcmode & PPS_CAPTUREASSERT;
#endif
pcount = &pps->ppscount[0];
pseq = &pps->ppsinfo.assert_sequence;
} else {
tsp = &pps->ppsinfo.clear_timestamp;
osp = &pps->ppsparam.clear_offset;
foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
#ifdef PPS_SYNC
fhard = pps->kcmode & PPS_CAPTURECLEAR;
#endif
pcount = &pps->ppscount[1];
pseq = &pps->ppsinfo.clear_sequence;
}
/* determine system time stamp according to refmode */
dcount = 0; /* keep GCC happy */
switch (refmode & PPS_REFEVNT_RMASK) {
case PPS_REFEVNT_CAPTURE:
acount = pps->capcount; /* use capture timestamp */
break;
case PPS_REFEVNT_CURRENT:
acount = tcount; /* use current timestamp */
break;
case PPS_REFEVNT_CAPCUR:
/*
* calculate counter value between pps_capture() and
* pps_ref_event()
*/
dcount = tcount - pps->capcount;
acount = (dcount / 2) + pps->capcount;
break;
default: /* ignore call error silently */
return;
}
/*
* If the timecounter changed, we cannot compare the count values, so
* we have to drop the rest of the PPS-stuff until the next event.
*/
if (pps->ppstc != pps->capth->th_counter) {
pps->ppstc = pps->capth->th_counter;
pps->capcount = acount;
*pcount = acount;
pps->ppscount[2] = acount;
#ifdef PPS_DEBUG
if (ppsdebug & 0x1) {
log(LOG_DEBUG,
"pps_ref_event(pps=%p, event=%d, ...): DROP (time-counter change)\n",
pps, event);
}
#endif
return;
}
pps->capcount = acount;
/* Convert the count to a bintime. */
bt = pps->capth->th_offset;
bintime_addx(&bt, pps->capth->th_scale * (acount - pps->capth->th_offset_count));
bintime_add(&bt, &timebase.bin);
if ((refmode & PPS_REFEVNT_PPS) == 0) {
/* determine difference to reference time stamp */
bt_ref = *ref_ts;
btd = bt;
bintime_sub(&btd, &bt_ref);
/*
* simulate a PPS timestamp by dropping the fraction
* and applying the offset
*/
if (bt.frac >= (uint64_t)1<<63) /* skip to nearest second */
bt.sec++;
bt.frac = 0;
bintime_add(&bt, &btd);
} else {
/*
* create ref_ts from current time -
* we are supposed to be called on
* the second mark
*/
bt_ref = bt;
if (bt_ref.frac >= (uint64_t)1<<63) /* skip to nearest second */
bt_ref.sec++;
bt_ref.frac = 0;
}
/* convert bintime to timestamp */
bintime2timespec(&bt, &ts);
/* If the timecounter was wound up underneath us, bail out. */
if (pps->capgen != pps->capth->th_generation)
return;
/* store time stamp */
*pcount = pps->capcount;
(*pseq)++;
*tsp = ts;
/* add offset correction */
if (foff) {
timespecadd(tsp, osp, tsp);
if (tsp->tv_nsec < 0) {
tsp->tv_nsec += 1000000000;
tsp->tv_sec -= 1;
}
}
#ifdef PPS_DEBUG
if (ppsdebug & 0x2) {
struct timespec ts2;
struct timespec ts3;
bintime2timespec(&bt_ref, &ts2);
bt.sec = 0;
bt.frac = 0;
if (refmode & PPS_REFEVNT_CAPCUR) {
bintime_addx(&bt, pps->capth->th_scale * dcount);
}
bintime2timespec(&bt, &ts3);
log(LOG_DEBUG, "ref_ts=%"PRIi64".%09"PRIi32
", ts=%"PRIi64".%09"PRIi32", read latency=%"PRIi64" ns\n",
ts2.tv_sec, (int32_t)ts2.tv_nsec,
tsp->tv_sec, (int32_t)tsp->tv_nsec,
timespec2ns(&ts3));
}
#endif
#ifdef PPS_SYNC
if (fhard) {
uint64_t scale;
uint64_t div;
/*
* Feed the NTP PLL/FLL.
* The FLL wants to know how many (hardware) nanoseconds
* elapsed since the previous event (mod 1 second) thus
* we are actually looking at the frequency difference scaled
* in nsec.
* As the counter time stamps are not truly at 1Hz
* we need to scale the count by the elapsed
* reference time.
* valid sampling interval: [0.5..2[ sec
*/
/* calculate elapsed raw count */
tcount = pps->capcount - pps->ppscount[2];
pps->ppscount[2] = pps->capcount;
tcount &= pps->capth->th_counter->tc_counter_mask;
/* calculate elapsed ref time */
btd = bt_ref;
bintime_sub(&btd, &pps->ref_time);
pps->ref_time = bt_ref;
/* check that we stay below 2 sec */
if (btd.sec < 0 || btd.sec > 1)
return;
/* we want at least 0.5 sec between samples */
if (btd.sec == 0 && btd.frac < (uint64_t)1<<63)
return;
/*
* calculate cycles per period by multiplying
* the frequency with the elapsed period
* we pick a fraction of 30 bits
* ~1ns resolution for elapsed time
*/
div = (uint64_t)btd.sec << 30;
div |= (btd.frac >> 34) & (((uint64_t)1 << 30) - 1);
div *= pps->capth->th_counter->tc_frequency;
div >>= 30;
if (div == 0) /* safeguard */
return;
scale = (uint64_t)1 << 63;
scale /= div;
scale *= 2;
bt.sec = 0;
bt.frac = 0;
bintime_addx(&bt, scale * tcount);
bintime2timespec(&bt, &ts);
#ifdef PPS_DEBUG
if (ppsdebug & 0x4) {
struct timespec ts2;
int64_t df;
bintime2timespec(&bt_ref, &ts2);
df = timespec2ns(&ts);
if (df > 500000000)
df -= 1000000000;
log(LOG_DEBUG, "hardpps: ref_ts=%"PRIi64
".%09"PRIi32", ts=%"PRIi64".%09"PRIi32
", freqdiff=%"PRIi64" ns/s\n",
ts2.tv_sec, (int32_t)ts2.tv_nsec,
tsp->tv_sec, (int32_t)tsp->tv_nsec,
df);
}
#endif
hardpps(tsp, timespec2ns(&ts));
}
#endif
}
/*
* Timecounters need to be updated every so often to prevent the hardware
* counter from overflowing. Updating also recalculates the cached values
* used by the get*() family of functions, so their precision depends on
* the update frequency.
*/
static int tc_tick;
void
tc_ticktock(void)
{
static int count;
if (++count < tc_tick)
return;
count = 0;
mutex_spin_enter(&timecounter_lock);
if (__predict_false(timecounter_bad != 0)) {
/* An existing timecounter has gone bad, pick a new one. */
(void)atomic_swap_uint(&timecounter_bad, 0);
if (timecounter->tc_quality < 0) {
tc_pick();
}
}
tc_windup();
mutex_spin_exit(&timecounter_lock);
}
void
inittimecounter(void)
{
u_int p;
mutex_init(&timecounter_lock, MUTEX_DEFAULT, IPL_HIGH);
/*
* Set the initial timeout to
* max(1, <approx. number of hardclock ticks in a millisecond>).
* People should probably not use the sysctl to set the timeout
* to smaller than its initial value, since that value is the
* smallest reasonable one. If they want better timestamps they
* should use the non-"get"* functions.
*/
if (hz > 1000)
tc_tick = (hz + 500) / 1000;
else
tc_tick = 1;
p = (tc_tick * 1000000) / hz;
aprint_verbose("timecounter: Timecounters tick every %d.%03u msec\n",
p / 1000, p % 1000);
/* warm up new timecounter (again) and get rolling. */
(void)timecounter->tc_get_timecount(timecounter);
(void)timecounter->tc_get_timecount(timecounter);
}
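/*
 * For example, at hz = 100 this gives tc_tick = 1 and p = 10000, i.e.
 * "Timecounters tick every 10.000 msec"; at hz = 8000 it gives
 * tc_tick = (8000 + 500) / 1000 = 8 and p = 1000, i.e. a windup every
 * 1.000 msec.
 */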
/* $NetBSD: procfs_vnops.c,v 1.230 2024/01/17 10:19:21 hannken Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_vnops.c 8.18 (Berkeley) 5/21/95
*/
/*
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_vnops.c 8.18 (Berkeley) 5/21/95
*/
/*
* procfs vnode interface
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: procfs_vnops.c,v 1.230 2024/01/17 10:19:21 hannken Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/dirent.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/ptrace.h>
#include <sys/kauth.h>
#include <sys/exec.h>
#include <uvm/uvm_extern.h> /* for PAGE_SIZE */
#include <machine/reg.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/procfs/procfs.h>
/*
* Vnode Operations.
*
*/
static int procfs_validfile_linux(struct lwp *, struct mount *);
static int procfs_root_readdir_callback(struct proc *, void *);
static void procfs_dir(pfstype, struct lwp *, struct proc *, char **, char *,
size_t);
/*
* This is a list of the valid names in the
* process-specific sub-directories. It is
* used in procfs_lookup and procfs_readdir
*/
static const struct proc_target {
u_char pt_type;
u_char pt_namlen;
const char *pt_name;
pfstype pt_pfstype;
int (*pt_valid)(struct lwp *, struct mount *);
} proc_targets[] = {
#define N(s) sizeof(s)-1, s
/* name type validp */
{ DT_DIR, N("."), PFSproc, NULL },
{ DT_DIR, N(".."), PFSroot, NULL },
{ DT_DIR, N("fd"), PFSfd, NULL },
{ DT_DIR, N("task"), PFStask, procfs_validfile_linux },
{ DT_LNK, N("cwd"), PFScwd, NULL },
{ DT_REG, N("emul"), PFSemul, NULL },
{ DT_LNK, N("root"), PFSchroot, NULL },
{ DT_REG, N("auxv"), PFSauxv, procfs_validauxv },
{ DT_REG, N("cmdline"), PFScmdline, NULL },
{ DT_REG, N("environ"), PFSenviron, NULL },
{ DT_LNK, N("exe"), PFSexe, procfs_validfile },
{ DT_REG, N("file"), PFSfile, procfs_validfile },
{ DT_REG, N("fpregs"), PFSfpregs, procfs_validfpregs },
{ DT_REG, N("limit"), PFSlimit, NULL },
{ DT_REG, N("map"), PFSmap, procfs_validmap },
{ DT_REG, N("maps"), PFSmaps, procfs_validmap },
{ DT_REG, N("mem"), PFSmem, NULL },
{ DT_REG, N("note"), PFSnote, NULL },
{ DT_REG, N("notepg"), PFSnotepg, NULL },
{ DT_REG, N("regs"), PFSregs, procfs_validregs },
{ DT_REG, N("stat"), PFSstat, procfs_validfile_linux },
{ DT_REG, N("statm"), PFSstatm, procfs_validfile_linux },
{ DT_REG, N("status"), PFSstatus, NULL },
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODETYPE_DEFNS
#endif
#undef N
};
static const int nproc_targets = sizeof(proc_targets) / sizeof(proc_targets[0]);
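/*
* Example (editor's note): the entry { DT_REG, N("status"), PFSstatus, NULL }
* above is what makes /proc/<pid>/status appear.  procfs_lookup() matches a
* component name against pt_name, procfs_readdir() emits one dirent of type
* pt_type per entry, and a NULL pt_valid means the node is always visible,
* while e.g. procfs_validfile_linux hides "task" unless the mount uses
* Linux compatibility mode.
*/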
/*
* List of files in the root directory. Note: the validate function will
* be called with p == NULL for these entries.
*/
static const struct proc_target proc_root_targets[] = {
#define N(s) sizeof(s)-1, s
/* name type validp */
{ DT_REG, N("meminfo"), PFSmeminfo, procfs_validfile_linux },
{ DT_REG, N("cpuinfo"), PFScpuinfo, procfs_validfile_linux },
{ DT_REG, N("uptime"), PFSuptime, procfs_validfile_linux },
{ DT_REG, N("mounts"), PFSmounts, procfs_validfile_linux },
{ DT_REG, N("devices"), PFSdevices, procfs_validfile_linux },
{ DT_REG, N("stat"), PFScpustat, procfs_validfile_linux },
{ DT_REG, N("loadavg"), PFSloadavg, procfs_validfile_linux },
{ DT_REG, N("version"), PFSversion, procfs_validfile_linux },
#undef N
};
static const int nproc_root_targets =
sizeof(proc_root_targets) / sizeof(proc_root_targets[0]);
int procfs_lookup(void *);
int procfs_open(void *);
int procfs_close(void *);
int procfs_access(void *);
int procfs_getattr(void *);
int procfs_setattr(void *);
int procfs_readdir(void *);
int procfs_readlink(void *);
int procfs_inactive(void *);
int procfs_reclaim(void *);
int procfs_print(void *);
int procfs_pathconf(void *);
int procfs_getpages(void *);
static uint8_t fttodt(file_t *);
static int atoi(const char *, size_t);
/*
* procfs vnode operations.
*/
int (**procfs_vnodeop_p)(void *);
const struct vnodeopv_entry_desc procfs_vnodeop_entries[] = {
{ &vop_default_desc, vn_default_error },
{ &vop_parsepath_desc, genfs_parsepath }, /* parsepath */
{ &vop_lookup_desc, procfs_lookup }, /* lookup */
{ &vop_create_desc, genfs_eopnotsupp }, /* create */
{ &vop_mknod_desc, genfs_eopnotsupp }, /* mknod */
{ &vop_open_desc, procfs_open }, /* open */
{ &vop_close_desc, procfs_close }, /* close */
{ &vop_access_desc, procfs_access }, /* access */
{ &vop_accessx_desc, genfs_accessx }, /* accessx */
{ &vop_getattr_desc, procfs_getattr }, /* getattr */
{ &vop_setattr_desc, procfs_setattr }, /* setattr */
{ &vop_read_desc, procfs_rw }, /* read */
{ &vop_write_desc, procfs_rw }, /* write */
{ &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */
{ &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */
{ &vop_fcntl_desc, genfs_fcntl }, /* fcntl */
{ &vop_ioctl_desc, genfs_enoioctl }, /* ioctl */
{ &vop_poll_desc, genfs_poll }, /* poll */
{ &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */
{ &vop_revoke_desc, genfs_revoke }, /* revoke */
{ &vop_fsync_desc, genfs_nullop }, /* fsync */
{ &vop_seek_desc, genfs_nullop }, /* seek */
{ &vop_remove_desc, genfs_eopnotsupp }, /* remove */
{ &vop_link_desc, genfs_erofs_link }, /* link */
{ &vop_rename_desc, genfs_eopnotsupp }, /* rename */
{ &vop_mkdir_desc, genfs_eopnotsupp }, /* mkdir */
{ &vop_rmdir_desc, genfs_eopnotsupp }, /* rmdir */
{ &vop_symlink_desc, genfs_erofs_symlink }, /* symlink */
{ &vop_readdir_desc, procfs_readdir }, /* readdir */
{ &vop_readlink_desc, procfs_readlink }, /* readlink */
{ &vop_abortop_desc, genfs_abortop }, /* abortop */
{ &vop_inactive_desc, procfs_inactive }, /* inactive */
{ &vop_reclaim_desc, procfs_reclaim }, /* reclaim */
{ &vop_lock_desc, genfs_lock }, /* lock */
{ &vop_unlock_desc, genfs_unlock }, /* unlock */
{ &vop_bmap_desc, genfs_eopnotsupp }, /* bmap */
{ &vop_strategy_desc, genfs_badop }, /* strategy */
{ &vop_print_desc, procfs_print }, /* print */
{ &vop_islocked_desc, genfs_islocked }, /* islocked */
{ &vop_pathconf_desc, procfs_pathconf }, /* pathconf */
{ &vop_advlock_desc, genfs_einval }, /* advlock */
{ &vop_getpages_desc, procfs_getpages }, /* getpages */
{ &vop_putpages_desc, genfs_null_putpages }, /* putpages */
{ NULL, NULL }
};
const struct vnodeopv_desc procfs_vnodeop_opv_desc =
{ &procfs_vnodeop_p, procfs_vnodeop_entries };
/*
* set things up for doing i/o on
* the pfsnode (vp). (vp) is locked
* on entry, and should be left locked
* on exit.
*
* for procfs we don't need to do anything
* in particular for i/o. all that is done
* is to support exclusive open on process
* memory images.
*/
int
procfs_open(void *v)
{
struct vop_open_args /* {
struct vnode *a_vp;
int a_mode;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
struct lwp *l1;
struct proc *p2;
int error;
if ((error =
procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p2, ENOENT)) != 0)
return error;
l1 = curlwp; /* tracer */
#define M2K(m) (((m) & FREAD) && ((m) & FWRITE) ? \
KAUTH_REQ_PROCESS_PROCFS_RW : \
(m) & FWRITE ? KAUTH_REQ_PROCESS_PROCFS_WRITE : \
KAUTH_REQ_PROCESS_PROCFS_READ)
mutex_enter(p2->p_lock);
error = kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_PROCFS,
p2, pfs, KAUTH_ARG(M2K(ap->a_mode)), NULL);
mutex_exit(p2->p_lock);
if (error) {
procfs_proc_unlock(p2);
return (error);
}
#undef M2K
switch (pfs->pfs_type) {
case PFSmem:
if (((pfs->pfs_flags & FWRITE) && (ap->a_mode & O_EXCL)) ||
((pfs->pfs_flags & O_EXCL) && (ap->a_mode & FWRITE))) {
error = EBUSY;
break;
}
if (!proc_isunder(p2, l1)) {
error = EPERM;
break;
}
if (ap->a_mode & FWRITE)
pfs->pfs_flags = ap->a_mode & (FWRITE|O_EXCL);
break;
case PFSregs:
case PFSfpregs:
if (!proc_isunder(p2, l1)) {
error = EPERM;
break;
}
break;
default:
break;
}
procfs_proc_unlock(p2);
return (error);
}
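/*
* Illustration (editor's note): the M2K() macro above maps the open mode to
* a kauth(9) request, so FREAD|FWRITE is checked as
* KAUTH_REQ_PROCESS_PROCFS_RW, FWRITE alone as KAUTH_REQ_PROCESS_PROCFS_WRITE,
* and a plain FREAD as KAUTH_REQ_PROCESS_PROCFS_READ.
*/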
/*
* close the pfsnode (vp) after doing i/o.
* (vp) is not locked on entry or exit.
*
* nothing to do for procfs other than undo
* any exclusive open flag (see _open above).
*/
int
procfs_close(void *v)
{
struct vop_close_args /* {
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct pfsnode *pfs = VTOPFS(ap->a_vp);
switch (pfs->pfs_type) {
case PFSmem:
if ((ap->a_fflag & FWRITE) && (pfs->pfs_flags & O_EXCL))
pfs->pfs_flags &= ~(FWRITE|O_EXCL);
break;
default:
break;
}
return (0);
}
/*
* _inactive is called when the pfsnode
* is vrele'd and the reference count goes
* to zero. (vp) will be on the vnode free
* list, so to get it back vget() must be
* used.
*
* (vp) is locked on entry, but must be unlocked on exit.
*/
int
procfs_inactive(void *v)
{
struct vop_inactive_v2_args /* {
struct vnode *a_vp;
bool *a_recycle;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
mutex_enter(&proc_lock);
*ap->a_recycle = (procfs_proc_find(vp->v_mount, pfs->pfs_pid) == NULL);
mutex_exit(&proc_lock);
return (0);
}
/*
* _reclaim is called when getnewvnode()
* wants to make use of an entry on the vnode
* free list. at this time the filesystem needs
* to free any private data and remove the node
* from any private lists.
*/
int
procfs_reclaim(void *v)
{
struct vop_reclaim_v2_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
VOP_UNLOCK(vp);
/*
* To interlock with procfs_revoke_vnodes().
*/
mutex_enter(vp->v_interlock);
vp->v_data = NULL;
mutex_exit(vp->v_interlock);
procfs_hashrem(pfs);
kmem_free(pfs, sizeof(*pfs));
return 0;
}
/*
* Return POSIX pathconf information applicable to special devices.
*/
int
procfs_pathconf(void *v)
{
struct vop_pathconf_args /* {
struct vnode *a_vp;
int a_name;
register_t *a_retval;
} */ *ap = v;
switch (ap->a_name) {
case _PC_LINK_MAX:
*ap->a_retval = LINK_MAX;
return (0);
case _PC_MAX_CANON:
*ap->a_retval = MAX_CANON;
return (0);
case _PC_MAX_INPUT:
*ap->a_retval = MAX_INPUT;
return (0);
case _PC_PIPE_BUF:
*ap->a_retval = PIPE_BUF;
return (0);
case _PC_CHOWN_RESTRICTED:
*ap->a_retval = 1;
return (0);
case _PC_VDISABLE:
*ap->a_retval = _POSIX_VDISABLE;
return (0);
case _PC_SYNC_IO:
*ap->a_retval = 1;
return (0);
default:
return genfs_pathconf(ap);
}
/* NOTREACHED */
}
/*
* _print is used for debugging.
* just print a readable description
* of (vp).
*/
int
procfs_print(void *v)
{
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct pfsnode *pfs = VTOPFS(ap->a_vp);
printf("tag VT_PROCFS, type %d, pid %d, mode %x, flags %lx\n",
pfs->pfs_type, pfs->pfs_pid, pfs->pfs_mode, pfs->pfs_flags);
return 0;
}
/*
* Works out the path to the target process's current
* working directory or chroot. If the caller is in a chroot and
* can't "reach" the target's cwd or root (or some other error
* occurs), a "/" is returned for the path.
*/
static void
procfs_dir(pfstype t, struct lwp *caller, struct proc *target, char **bpp,
char *path, size_t len)
{
struct cwdinfo *cwdi;
struct vnode *vp, *rvp;
char *bp;
/*
* Lock target cwdi and take a reference to the vnode
* we are interested in to prevent it from disappearing
* before getcwd_common() below.
*/
rw_enter(&target->p_cwdi->cwdi_lock, RW_READER);
switch (t) {
case PFScwd:
vp = target->p_cwdi->cwdi_cdir;
break;
case PFSchroot:
vp = target->p_cwdi->cwdi_rdir;
break;
default:
rw_exit(&target->p_cwdi->cwdi_lock);
return;
}
if (vp != NULL)
vref(vp);
rw_exit(&target->p_cwdi->cwdi_lock);
cwdi = caller->l_proc->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
rvp = cwdi->cwdi_rdir;
bp = bpp ? *bpp : NULL;
/*
* XXX: this horrible kludge avoids locking panics when
* attempting to lookup links that point to within procfs
*/
if (vp != NULL && vp->v_tag == VT_PROCFS) {
if (bpp) {
*--bp = '/';
*bpp = bp;
}
vrele(vp);
rw_exit(&cwdi->cwdi_lock);
return;
}
if (rvp == NULL)
rvp = rootvnode;
if (vp == NULL || getcwd_common(vp, rvp, bp ? &bp : NULL, path,
len / 2, 0, caller) != 0) {
if (bpp) {
bp = *bpp;
*--bp = '/';
}
}
if (bpp)
*bpp = bp;
if (vp != NULL)
vrele(vp);
rw_exit(&cwdi->cwdi_lock);
}
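/*
* Usage sketch (editor's illustration; it mirrors what the callers in
* procfs_getattr() and procfs_readlink() below actually do):
*
*	char *path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK);
*	char *bp = path + MAXPATHLEN;
*
*	*--bp = '\0';
*	procfs_dir(PFScwd, curlwp, procp, &bp, path, MAXPATHLEN);
*	free(path, M_TEMP);
*
* On return bp points at a NUL-terminated path built backwards into the
* buffer; a lone "/" is substituted when the cwd or root is unreachable.
*/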
/*
* Invent attributes for pfsnode (vp) and store
* them in (vap).
* Directory lengths are returned as zero since
* any real length would require the genuine size
* to be computed, and nothing cares anyway.
*
* this is relatively minimal for procfs.
*/
int
procfs_getattr(void *v)
{
struct vop_getattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
struct vattr *vap = ap->a_vap;
struct proc *procp;
char *path, *bp, bf[16];
int error;
/* first check the process still exists */
switch (pfs->pfs_type) {
case PFSroot:
case PFScurproc:
case PFSself:
procp = NULL;
break;
default:
error =
procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &procp, ENOENT);
if (error != 0)
return (error);
break;
}
switch (pfs->pfs_type) {
case PFStask:
if (pfs->pfs_fd == -1) {
path = NULL;
break;
}
/*FALLTHROUGH*/
case PFScwd:
case PFSchroot:
path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK);
if (path == NULL && procp != NULL) {
procfs_proc_unlock(procp);
return (ENOMEM);
}
break;
default:
path = NULL;
break;
}
if (procp != NULL) {
mutex_enter(procp->p_lock);
error = kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_CANSEE, procp,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
mutex_exit(procp->p_lock);
if (error != 0) {
procfs_proc_unlock(procp);
if (path != NULL) free(path, M_TEMP);
return (ENOENT);
}
}
error = 0;
/* start by zeroing out the attributes */
vattr_null(vap);
/* next do all the common fields */
vap->va_type = ap->a_vp->v_type;
vap->va_mode = pfs->pfs_mode;
vap->va_fileid = pfs->pfs_fileno;
vap->va_flags = 0;
vap->va_blocksize = PAGE_SIZE;
/*
* Make all times be current TOD.
*
* It would be possible to get the process start
* time from the p_stats structure, but there's
* no "file creation" time stamp anyway, and the
* p_stats structure is not addressable if u. gets
* swapped out for that process.
*/
getnanotime(&vap->va_ctime);
vap->va_atime = vap->va_mtime = vap->va_ctime;
if (procp)
TIMEVAL_TO_TIMESPEC(&procp->p_stats->p_start,
&vap->va_birthtime);
else
getnanotime(&vap->va_birthtime);
switch (pfs->pfs_type) {
case PFSmem:
case PFSregs:
case PFSfpregs:
#if defined(__HAVE_PROCFS_MACHDEP) && defined(PROCFS_MACHDEP_PROTECT_CASES)
PROCFS_MACHDEP_PROTECT_CASES
#endif
/*
* If the process has exercised some setuid or setgid
* privilege, then rip away read/write permission so
* that only root can gain access.
*/
if (procp->p_flag & PK_SUGID)
vap->va_mode &= ~(S_IRUSR|S_IWUSR);
/* FALLTHROUGH */
case PFSstatus:
case PFSstat:
case PFSnote:
case PFSnotepg:
case PFScmdline:
case PFSenviron:
case PFSemul:
case PFSstatm:
case PFSmap:
case PFSmaps:
case PFSlimit:
case PFSauxv:
vap->va_nlink = 1;
vap->va_uid = kauth_cred_geteuid(procp->p_cred);
vap->va_gid = kauth_cred_getegid(procp->p_cred);
break;
case PFScwd:
case PFSchroot:
case PFSmeminfo:
case PFSdevices:
case PFScpuinfo:
case PFSuptime:
case PFSmounts:
case PFScpustat:
case PFSloadavg:
case PFSversion:
case PFSexe:
case PFSself:
case PFScurproc:
case PFSroot:
vap->va_nlink = 1;
vap->va_uid = vap->va_gid = 0;
break;
case PFSproc:
case PFStask:
case PFSfile:
case PFSfd:
break;
default:
panic("%s: %d/1", __func__, pfs->pfs_type);
}
/*
* now do the object specific fields
*
* The size could be set from struct reg, but it's hardly
* worth the trouble, and it puts some (potentially) machine
* dependent data into this machine-independent code. If it
* becomes important then this function should break out into
* a per-file stat function in the corresponding .c file.
*/
switch (pfs->pfs_type) {
case PFSroot:
vap->va_bytes = vap->va_size = DEV_BSIZE;
break;
case PFSself:
case PFScurproc:
vap->va_bytes = vap->va_size =
snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid);
break;
case PFStask:
if (pfs->pfs_fd != -1) {
vap->va_nlink = 1;
vap->va_uid = 0;
vap->va_gid = 0;
vap->va_bytes = vap->va_size =
snprintf(bf, sizeof(bf), "..");
break;
}
/*FALLTHROUGH*/
case PFSfd:
if (pfs->pfs_fd != -1) {
file_t *fp;
fp = fd_getfile2(procp, pfs->pfs_fd);
if (fp == NULL) {
error = EBADF;
break;
}
vap->va_nlink = 1;
vap->va_uid = kauth_cred_geteuid(fp->f_cred);
vap->va_gid = kauth_cred_getegid(fp->f_cred);
switch (fp->f_type) {
case DTYPE_VNODE:
vap->va_bytes = vap->va_size =
fp->f_vnode->v_size;
break;
default:
vap->va_bytes = vap->va_size = 0;
break;
}
closef(fp);
break;
}
/*FALLTHROUGH*/
case PFSproc:
vap->va_nlink = 2;
vap->va_uid = kauth_cred_geteuid(procp->p_cred);
vap->va_gid = kauth_cred_getegid(procp->p_cred);
vap->va_bytes = vap->va_size = DEV_BSIZE;
break;
case PFSfile:
error = EOPNOTSUPP;
break;
case PFSmem:
vap->va_bytes = vap->va_size =
ctob(procp->p_vmspace->vm_tsize +
procp->p_vmspace->vm_dsize +
procp->p_vmspace->vm_ssize);
break;
case PFSauxv:
vap->va_bytes = vap->va_size = procp->p_execsw->es_arglen;
break;
#if defined(PT_GETREGS) || defined(PT_SETREGS)
case PFSregs:
vap->va_bytes = vap->va_size = sizeof(struct reg);
break;
#endif
#if defined(PT_GETFPREGS) || defined(PT_SETFPREGS)
case PFSfpregs:
vap->va_bytes = vap->va_size = sizeof(struct fpreg);
break;
#endif
case PFSstatus:
case PFSstat:
case PFSnote:
case PFSnotepg:
case PFScmdline:
case PFSenviron:
case PFSmeminfo:
case PFSdevices:
case PFScpuinfo:
case PFSuptime:
case PFSmounts:
case PFScpustat:
case PFSloadavg:
case PFSstatm:
case PFSversion:
vap->va_bytes = vap->va_size = 0;
break;
case PFSlimit:
case PFSmap:
case PFSmaps:
/*
* Advise a larger blocksize for the map files, so that
* they may be read in one pass.
*/
vap->va_blocksize = 4 * PAGE_SIZE;
vap->va_bytes = vap->va_size = 0;
break;
case PFScwd:
case PFSchroot:
bp = path + MAXPATHLEN;
*--bp = '\0';
procfs_dir(pfs->pfs_type, curlwp, procp, &bp, path,
MAXPATHLEN);
vap->va_bytes = vap->va_size = strlen(bp);
break;
case PFSexe:
vap->va_bytes = vap->va_size = strlen(procp->p_path);
break;
case PFSemul:
vap->va_bytes = vap->va_size = strlen(procp->p_emul->e_name);
break;
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODETYPE_CASES
error = procfs_machdep_getattr(ap->a_vp, vap, procp);
break;
#endif
default:
panic("%s: %d/2", __func__, pfs->pfs_type);
}
if (procp != NULL)
procfs_proc_unlock(procp);
if (path != NULL)
free(path, M_TEMP);
return (error);
}
/*ARGSUSED*/
int
procfs_setattr(void *v)
{
/*
* just fake out attribute setting
* it's not good to generate an error
* return, otherwise things like creat()
* will fail when they try to set the
* file length to 0. worse, this means
* that echo $note > /proc/$pid/note will fail.
*/
return (0);
}
/*
* implement access checking.
*
* actually, the check for super-user is slightly
* broken since it will allow read access to write-only
* objects. this doesn't cause any particular trouble
* but does mean that the i/o entry points need to check
* that the operation really does make sense.
*/
int
procfs_access(void *v)
{
struct vop_access_args /* {
struct vnode *a_vp;
accmode_t a_accmode;
kauth_cred_t a_cred;
} */ *ap = v;
struct vattr va;
int error;
if ((error = VOP_GETATTR(ap->a_vp, &va, ap->a_cred)) != 0)
return (error);
return kauth_authorize_vnode(ap->a_cred,
KAUTH_ACCESS_ACTION(ap->a_accmode, ap->a_vp->v_type, va.va_mode),
ap->a_vp, NULL, genfs_can_access(ap->a_vp, ap->a_cred,
va.va_uid, va.va_gid, va.va_mode, NULL, ap->a_accmode));
}
/*
* lookup. this is incredibly complicated in the
* general case, however for most pseudo-filesystems
* very little needs to be done.
*
* Locking isn't hard here, just poorly documented.
*
* If we're looking up ".", just vref the parent & return it.
*
* If we're looking up "..", unlock the parent, and lock "..". If everything
* went ok, and we're on the last component and the caller requested the
* parent locked, try to re-lock the parent. We do this to prevent lock
* races.
*
* For anything else, get the needed node. Then unlock the parent if not
* the last component or not LOCKPARENT (i.e. if we wouldn't re-lock the
* parent in the .. case).
*
* We try to exit with the parent locked in error cases.
*/
int
procfs_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnode * a_dvp;
struct vnode ** a_vpp;
struct componentname * a_cnp;
} */ *ap = v;
struct componentname *cnp = ap->a_cnp;
struct vnode **vpp = ap->a_vpp;
struct vnode *dvp = ap->a_dvp;
const char *pname = cnp->cn_nameptr;
const struct proc_target *pt = NULL;
struct vnode *fvp;
pid_t pid, vnpid;
struct pfsnode *pfs;
struct proc *p = NULL;
struct lwp *plwp;
int i, error;
pfstype type;
*vpp = NULL;
if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred)) != 0)
return (error);
if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
return (EROFS);
if (cnp->cn_namelen == 1 && *pname == '.') {
*vpp = dvp;
vref(dvp);
return (0);
}
pfs = VTOPFS(dvp);
switch (pfs->pfs_type) {
case PFSroot:
/*
* Shouldn't get here with .. in the root node.
*/
if (cnp->cn_flags & ISDOTDOT)
return (EIO);
for (i = 0; i < nproc_root_targets; i++) {
pt = &proc_root_targets[i];
/*
* check for node match. proc is always NULL here,
* so call pt_valid with constant NULL lwp.
*/
if (cnp->cn_namelen == pt->pt_namlen && memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 &&
(pt->pt_valid == NULL ||
(*pt->pt_valid)(NULL, dvp->v_mount)))
break;
}
if (i != nproc_root_targets) {
error = procfs_allocvp(dvp->v_mount, vpp, 0,
pt->pt_pfstype, -1);
return (error);
}
if (CNEQ(cnp, "curproc", 7)) {
pid = curproc->p_pid;
vnpid = 0;
type = PFScurproc;
} else if (CNEQ(cnp, "self", 4)) {
pid = curproc->p_pid;
vnpid = 0;
type = PFSself;
} else {
pid = (pid_t)atoi(pname, cnp->cn_namelen);
vnpid = pid;
type = PFSproc;
}
if (procfs_proc_lock(dvp->v_mount, pid, &p, ESRCH) != 0)
break;
error = procfs_allocvp(dvp->v_mount, vpp, vnpid, type, -1);
procfs_proc_unlock(p);
return (error);
case PFSproc:
if (cnp->cn_flags & ISDOTDOT) {
error = procfs_allocvp(dvp->v_mount, vpp, 0, PFSroot,
-1);
return (error);
}
if (procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p,
ESRCH) != 0)
break;
mutex_enter(p->p_lock);
LIST_FOREACH(plwp, &p->p_lwps, l_sibling) {
if (plwp->l_stat != LSZOMB)
break;
}
/* Process is exiting if it has no LWPs or all LWPs are LSZOMB */
if (plwp == NULL) {
mutex_exit(p->p_lock);
procfs_proc_unlock(p);
return ESRCH;
}
lwp_addref(plwp);
mutex_exit(p->p_lock);
for (pt = proc_targets, i = 0; i < nproc_targets; pt++, i++) {
int found;
found = cnp->cn_namelen == pt->pt_namlen && memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 &&
(pt->pt_valid == NULL
|| (*pt->pt_valid)(plwp, dvp->v_mount));
if (found)
break;
}
lwp_delref(plwp);
if (i == nproc_targets) {
procfs_proc_unlock(p);
break;
}
if (pt->pt_pfstype == PFSfile) {
fvp = p->p_textvp;
/* We already checked that it exists. */
vref(fvp);
procfs_proc_unlock(p);
*vpp = fvp;
return (0);
}
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
pt->pt_pfstype, -1);
procfs_proc_unlock(p);
return (error);
case PFSfd: {
int fd;
file_t *fp;
if ((error = procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p,
ENOENT)) != 0)
return error;
if (cnp->cn_flags & ISDOTDOT) {
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
PFSproc, -1);
procfs_proc_unlock(p);
return (error);
}
fd = atoi(pname, cnp->cn_namelen);
fp = fd_getfile2(p, fd);
if (fp == NULL) {
procfs_proc_unlock(p);
return ENOENT;
}
fvp = fp->f_vnode;
/* Don't show directories */
if (fp->f_type == DTYPE_VNODE && fvp->v_type != VDIR &&
!procfs_proc_is_linux_compat()) {
vref(fvp);
closef(fp);
procfs_proc_unlock(p);
*vpp = fvp;
return 0;
}
closef(fp);
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
PFSfd, fd);
procfs_proc_unlock(p);
return error;
}
case PFStask: {
int xpid;
if ((error = procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p,
ENOENT)) != 0)
return error;
if (cnp->cn_flags & ISDOTDOT) {
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
PFSproc, -1);
procfs_proc_unlock(p);
return (error);
}
xpid = atoi(pname, cnp->cn_namelen);
if (xpid != pfs->pfs_pid) {
procfs_proc_unlock(p);
return ENOENT;
}
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
PFStask, 0);
procfs_proc_unlock(p);
return error;
}
default:
return (ENOTDIR);
}
return (cnp->cn_nameiop == LOOKUP ? ENOENT : EROFS);
}
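/*
* Example (editor's walk-through of the lookup above): resolving
* "/proc/123/status" first hits the PFSroot case, where "123" is neither
* "curproc" nor "self", so atoi() turns it into pid 123 and a PFSproc vnode
* is allocated; the next lookup then runs the PFSproc case, matches "status"
* in proc_targets[] and allocates a PFSstatus vnode for the same pid.
*/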
int
procfs_validfile(struct lwp *l, struct mount *mp)
{
return l != NULL && l->l_proc != NULL && l->l_proc->p_textvp != NULL;
}
static int
procfs_validfile_linux(struct lwp *l, struct mount *mp)
{
return procfs_use_linux_compat(mp) &&
(l == NULL || l->l_proc == NULL || procfs_validfile(l, mp));
}
struct procfs_root_readdir_ctx {
struct uio *uiop;
off_t *cookies;
int ncookies;
off_t off;
off_t startoff;
int error;
};
static int
procfs_root_readdir_callback(struct proc *p, void *arg)
{
struct procfs_root_readdir_ctx *ctxp = arg;
struct dirent d;
struct uio *uiop;
int error;
uiop = ctxp->uiop;
if (uiop->uio_resid < UIO_MX)
return -1; /* no space */
if (kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL) != 0)
return 0;
if (ctxp->off < ctxp->startoff) {
ctxp->off++;
return 0;
}
memset(&d, 0, UIO_MX);
d.d_reclen = UIO_MX;
d.d_fileno = PROCFS_FILENO(p->p_pid, PFSproc, -1);
d.d_namlen = snprintf(d.d_name,
UIO_MX - offsetof(struct dirent, d_name), "%ld", (long)p->p_pid);
d.d_type = DT_DIR;
mutex_exit(&proc_lock);
error = uiomove(&d, UIO_MX, uiop);
mutex_enter(&proc_lock);
if (error) {
ctxp->error = error;
return -1;
}
ctxp->ncookies++;
if (ctxp->cookies)
*(ctxp->cookies)++ = ctxp->off + 1;
ctxp->off++;
return 0;
}
/*
* readdir returns directory entries from pfsnode (vp).
*
* the strategy here with procfs is to generate a single
* directory entry at a time (struct dirent) and then
* copy that out to userland using uiomove. a more efficient,
* though more complex, implementation would try to minimize
* the number of calls to uiomove(). for procfs, this is
* hardly worth the added code complexity.
*
* this should just be done through read()
*/
int
procfs_readdir(void *v)
{
struct vop_readdir_args /* {
struct vnode *a_vp;
struct uio *a_uio;
kauth_cred_t a_cred;
int *a_eofflag;
off_t **a_cookies;
int *a_ncookies;
} */ *ap = v;
struct uio *uio = ap->a_uio;
struct dirent d;
struct pfsnode *pfs;
off_t i;
int error;
off_t *cookies = NULL;
int ncookies;
struct vnode *vp;
const struct proc_target *pt;
struct procfs_root_readdir_ctx ctx;
struct proc *p = NULL;
struct lwp *l;
int nfd;
int nc = 0;
vp = ap->a_vp;
pfs = VTOPFS(vp);
if (uio->uio_resid < UIO_MX)
return (EINVAL);
if (uio->uio_offset < 0)
return (EINVAL);
error = 0;
i = uio->uio_offset;
memset(&d, 0, UIO_MX);
d.d_reclen = UIO_MX;
ncookies = uio->uio_resid / UIO_MX;
switch (pfs->pfs_type) {
/*
* this is for the process-specific sub-directories.
* all that is needed is to copy out all the entries
* from the proc_targets[] table (top of this file).
*/
case PFSproc: {
if (i >= nproc_targets)
return 0;
if (procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH) != 0)
break;
if (ap->a_ncookies) {
ncookies = uimin(ncookies, (nproc_targets - i));
cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
*ap->a_cookies = cookies;
}
for (pt = &proc_targets[i];
uio->uio_resid >= UIO_MX && i < nproc_targets; pt++, i++) {
if (pt->pt_valid) {
/* XXXSMP LWP can disappear */
mutex_enter(p->p_lock);
l = LIST_FIRST(&p->p_lwps);
KASSERT(l != NULL);
mutex_exit(p->p_lock);
if ((*pt->pt_valid)(l, vp->v_mount) == 0)
continue;
}
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
pt->pt_pfstype, -1);
d.d_namlen = pt->pt_namlen;
memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
d.d_type = pt->pt_type;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
if (cookies)
*cookies++ = i + 1;
}
procfs_proc_unlock(p);
break;
}
case PFSfd: {
file_t *fp;
int lim;
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p,
ESRCH)) != 0)
return error;
/* XXX Should this be by file as well? */
if (kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES), NULL,
NULL) != 0) {
procfs_proc_unlock(p);
return ESRCH;
}
nfd = atomic_load_consume(&p->p_fd->fd_dt)->dt_nfiles;
lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
if (i >= lim) {
procfs_proc_unlock(p);
return 0;
}
if (ap->a_ncookies) {
ncookies = uimin(ncookies, (nfd + 2 - i));
cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
*ap->a_cookies = cookies;
}
for (; i < 2 && uio->uio_resid >= UIO_MX; i++) {
pt = &proc_targets[i];
d.d_namlen = pt->pt_namlen;
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
pt->pt_pfstype, -1);
(void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
d.d_type = pt->pt_type;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
if (cookies)
*cookies++ = i + 1;
nc++;
}
if (error)
goto out;
for (; uio->uio_resid >= UIO_MX && i < nfd; i++) {
/* check the descriptor exists */
if ((fp = fd_getfile2(p, i - 2)) == NULL)
continue;
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFSfd, i - 2);
d.d_namlen = snprintf(d.d_name, sizeof(d.d_name),
"%lld", (long long)(i - 2));
d.d_type = fttodt(fp);
closef(fp);	/* keep the reference until fttodt() has inspected fp */
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
if (cookies)
*cookies++ = i + 1;
nc++;
}
goto out;
}
case PFStask: {
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p,
ESRCH)) != 0)
return error;
nfd = 3; /* ., .., pid */
if (ap->a_ncookies) {
ncookies = uimin(ncookies, (nfd + 2 - i));
cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
*ap->a_cookies = cookies;
}
for (; i < 2 && uio->uio_resid >= UIO_MX; i++) {
pt = &proc_targets[i];
d.d_namlen = pt->pt_namlen;
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
pt->pt_pfstype, -1);
(void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
d.d_type = pt->pt_type;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
if (cookies)
*cookies++ = i + 1;
nc++;
}
if (error)
goto out;
for (; uio->uio_resid >= UIO_MX && i < nfd; i++) {
/* check the descriptor exists */
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFStask,
i - 2);
d.d_namlen = snprintf(d.d_name, sizeof(d.d_name),
"%ld", (long)pfs->pfs_pid);
d.d_type = DT_LNK;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
if (cookies)
*cookies++ = i + 1;
nc++;
}
goto out;
}
/*
* this is for the root of the procfs filesystem
* what is needed are special entries for "curproc"
* and "self" followed by an entry for each process
* on allproc.
*/
case PFSroot: {
if (ap->a_ncookies) {
/*
* XXX Potentially allocating too much space here,
* but I'm lazy. This loop needs some work.
*/
cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
*ap->a_cookies = cookies;
}
/* 0 ... 3 are static entries. */
for (; i <= 3 && uio->uio_resid >= UIO_MX; i++) {
switch (i) {
case 0: /* `.' */
case 1: /* `..' */
d.d_fileno = PROCFS_FILENO(0, PFSroot, -1);
d.d_namlen = i + 1;
memcpy(d.d_name, "..", d.d_namlen);
d.d_name[i + 1] = '\0';
d.d_type = DT_DIR;
break;
case 2:
d.d_fileno = PROCFS_FILENO(0, PFScurproc, -1);
d.d_namlen = sizeof("curproc") - 1;
memcpy(d.d_name, "curproc", sizeof("curproc"));
d.d_type = DT_LNK;
break;
case 3:
d.d_fileno = PROCFS_FILENO(0, PFSself, -1);
d.d_namlen = sizeof("self") - 1;
memcpy(d.d_name, "self", sizeof("self"));
d.d_type = DT_LNK;
break;
}
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
nc++;
if (cookies)
*cookies++ = i + 1;
}
if (error)
break;
/* 4 ... are process entries. */
ctx.uiop = uio;
ctx.error = 0;
ctx.off = 4;
ctx.startoff = i;
ctx.cookies = cookies;
ctx.ncookies = nc;
proclist_foreach_call(&allproc,
procfs_root_readdir_callback, &ctx);
cookies = ctx.cookies;
nc = ctx.ncookies;
error = ctx.error;
if (error)
break;
/* misc entries. */
if (i < ctx.off)
i = ctx.off;
if (i >= ctx.off + nproc_root_targets)
break;
error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH);
if (error)
break;
for (pt = &proc_root_targets[i - ctx.off];
uio->uio_resid >= UIO_MX &&
pt < &proc_root_targets[nproc_root_targets];
pt++, i++) {
if (pt->pt_valid &&
(*pt->pt_valid)(NULL, vp->v_mount) == 0)
continue;
if (kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY),
NULL, NULL) != 0)
continue;
d.d_fileno = PROCFS_FILENO(0, pt->pt_pfstype, -1);
d.d_namlen = pt->pt_namlen;
memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
d.d_type = pt->pt_type;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
nc++;
if (cookies)
*cookies++ = i + 1;
}
out:
KASSERT(p != NULL);
ncookies = nc;
procfs_proc_unlock(p);
break;
}
default:
error = ENOTDIR;
break;
}
if (ap->a_ncookies) {
if (error) {
if (cookies)
free(*ap->a_cookies, M_TEMP);
*ap->a_ncookies = 0;
*ap->a_cookies = NULL;
} else
*ap->a_ncookies = ncookies;
}
uio->uio_offset = i;
return (error);
}
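/*
* From userland the dirent-at-a-time strategy above is invisible; a plain
* directory scan works as usual.  Minimal sketch (editor's illustration,
* not part of the original source):
*
*	#include <dirent.h>
*	#include <stdio.h>
*
*	int
*	main(void)
*	{
*		DIR *d = opendir("/proc");
*		struct dirent *de;
*
*		if (d == NULL)
*			return 1;
*		while ((de = readdir(d)) != NULL)
*			printf("%s (type %d)\n", de->d_name, de->d_type);
*		closedir(d);
*		return 0;
*	}
*/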
/*
* readlink reads the link of `curproc' and others
*/
int
procfs_readlink(void *v)
{
struct vop_readlink_args *ap = v;
char bf[16]; /* should be enough */
char *bp = bf;
char *path = NULL;
int len = 0;
int error = 0;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
struct proc *pown = NULL;
if (pfs->pfs_fileno == PROCFS_FILENO(0, PFScurproc, -1))
len = snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid);
else if (pfs->pfs_fileno == PROCFS_FILENO(0, PFSself, -1))
len = snprintf(bf, sizeof(bf), "%s", "curproc");
else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFStask, 0))
len = snprintf(bf, sizeof(bf), "..");
else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSexe, -1)) {
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown,
ESRCH)) != 0)
return error;
bp = pown->p_path;
len = strlen(bp);
} else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFScwd, -1) ||
pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSchroot, -1)) {
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown,
ESRCH)) != 0)
return error;
path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK);
if (path == NULL) {
procfs_proc_unlock(pown);
return (ENOMEM);
}
bp = path + MAXPATHLEN;
*--bp = '\0';
procfs_dir(PROCFS_TYPE(pfs->pfs_fileno), curlwp, pown,
&bp, path, MAXPATHLEN);
len = strlen(bp);
} else {
file_t *fp;
struct vnode *vxp;
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown,
ESRCH)) != 0)
return error;
fp = fd_getfile2(pown, pfs->pfs_fd);
if (fp == NULL) {
procfs_proc_unlock(pown);
return EBADF;
}
switch (fp->f_type) {
case DTYPE_VNODE:
vxp = fp->f_vnode;
if (vxp->v_type != VDIR &&
!procfs_proc_is_linux_compat()) {
error = EINVAL;
break;
}
if ((path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK))
== NULL) {
error = ENOMEM;
break;
}
bp = path + MAXPATHLEN;
*--bp = '\0';
/*
* XXX: kludge to avoid locking against ourselves
* in getcwd()
*/
if (vxp->v_tag == VT_PROCFS) {
*--bp = '/';
} else {
rw_enter(&curproc->p_cwdi->cwdi_lock,
RW_READER);
vp = curproc->p_cwdi->cwdi_rdir;
if (vp == NULL)
vp = rootvnode;
error = getcwd_common(vxp, vp, &bp, path,
MAXPATHLEN / 2, 0, curlwp);
rw_exit(&curproc->p_cwdi->cwdi_lock);
}
if (error)
break;
len = strlen(bp);
break;
case DTYPE_MISC:
len = snprintf(bf, sizeof(bf), "%s", "[misc]");
break;
case DTYPE_KQUEUE:
len = snprintf(bf, sizeof(bf), "%s", "[kqueue]");
break;
case DTYPE_SEM:
len = snprintf(bf, sizeof(bf), "%s", "[ksem]");
break;
default:
error = EINVAL;
break;
}
closef(fp);
}
if (error == 0)
error = uiomove(bp, len, ap->a_uio);
if (pown)
procfs_proc_unlock(pown);
if (path)
free(path, M_TEMP);
return error;
}
int
procfs_getpages(void *v)
{
struct vop_getpages_args /* {
struct vnode *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ *ap = v;
if ((ap->a_flags & PGO_LOCKED) == 0)
rw_exit(ap->a_vp->v_uobj.vmobjlock);
return (EFAULT);
}
/*
* convert decimal ascii to int
*/
static int
atoi(const char *b, size_t len)
{
int p = 0;
while (len--) {
char c = *b++;
if (c < '0' || c > '9')
return -1;
p = 10 * p + (c - '0');
}
return p;
}
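/*
* Editor's note on behaviour: atoi("123", 3) returns 123, and any non-digit
* makes the whole conversion fail, e.g. atoi("12x", 3) == -1.  An empty name
* (len == 0) yields 0, and overflow of very long digit strings is not
* detected; a bogus result simply makes the subsequent pid/fd lookup fail.
*/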
/**
* convert DTYPE_XXX to corresponding DT_XXX
* matching what procfs_loadvnode() does.
*/
static uint8_t
fttodt(file_t *fp)
{
switch (fp->f_type) {
case DTYPE_VNODE:
switch (fp->f_vnode->v_type) {
case VREG: return DT_REG;
case VDIR: return DT_LNK; /* symlink */
case VBLK: return DT_BLK;
case VCHR: return DT_CHR;
case VLNK: return DT_LNK;
case VSOCK: return DT_SOCK;
case VFIFO: return DT_FIFO;
default: return DT_UNKNOWN;
}
case DTYPE_PIPE: return DT_FIFO;
case DTYPE_SOCKET: return DT_SOCK;
case DTYPE_KQUEUE: /*FALLTHROUGH*/
case DTYPE_MISC: /*FALLTHROUGH*/
case DTYPE_SEM: return DT_LNK; /* symlinks */
default: return DT_UNKNOWN;
}
}
/* $NetBSD: pmap_private.h,v 1.5 2023/10/04 20:28:06 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2001 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Frank van der Linden for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _X86_PMAP_PRIVATE_H_
#define _X86_PMAP_PRIVATE_H_
#ifndef _MACHINE_PMAP_PRIVATE_H_X86
#error Include machine/pmap_private.h, not x86/pmap_private.h.
#endif
#ifdef _KERNEL_OPT
#include "opt_svs.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kcpuset.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <machine/cpufunc.h>
#include <machine/pte.h>
#include <machine/vmparam.h>
#include <uvm/uvm_object.h>
#include <uvm/uvm_pmap.h>
struct pmap;
#define SLAREA_USER 0
#define SLAREA_PTE 1
#define SLAREA_MAIN 2
#define SLAREA_PCPU 3
#define SLAREA_DMAP 4
#define SLAREA_HYPV 5
#define SLAREA_ASAN 6
#define SLAREA_MSAN 7
#define SLAREA_KERN 8
#define SLSPACE_NAREAS 9
struct slotspace {
struct {
size_t sslot; /* start slot */
size_t nslot; /* # of slots */
bool active; /* area is active */
} area[SLSPACE_NAREAS];
};
extern struct slotspace slotspace;
#include <x86/gdt.h>
struct pcpu_entry {
uint8_t gdt[MAXGDTSIZ];
uint8_t ldt[MAX_USERLDT_SIZE];
uint8_t idt[PAGE_SIZE];
uint8_t tss[PAGE_SIZE];
uint8_t ist0[PAGE_SIZE];
uint8_t ist1[PAGE_SIZE];
uint8_t ist2[PAGE_SIZE];
uint8_t ist3[PAGE_SIZE];
uint8_t rsp0[2 * PAGE_SIZE];
} __packed;
struct pcpu_area {
#ifdef SVS
uint8_t utls[PAGE_SIZE];
#endif
uint8_t ldt[PAGE_SIZE];
struct pcpu_entry ent[MAXCPUS];
} __packed;
extern struct pcpu_area *pcpuarea;
#define PMAP_PCID_KERN 0
#define PMAP_PCID_USER 1
/*
* pmap data structures: see pmap.c for details of locking.
*/
/*
* we maintain a list of all non-kernel pmaps
*/
LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */
/*
* linked list of all non-kernel pmaps
*/
extern struct pmap_head pmaps;
extern kmutex_t pmaps_lock; /* protects pmaps */
/*
* pool_cache(9) that pmaps are allocated from
*/
extern struct pool_cache pmap_cache;
/*
* the pmap structure
*
* note that the pm_obj contains the lock pointer, the reference count,
* page list, and number of PTPs within the pmap.
*
* pm_lock is the same as the lock for vm object 0. Changes to
* the other objects may only be made if that lock has been taken
* (the other object locks are only used when uvm_pagealloc is called)
*/
struct pv_page;
struct pmap {
struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1 */
LIST_ENTRY(pmap) pm_list; /* list of all pmaps */
pd_entry_t *pm_pdir; /* VA of PD */
paddr_t pm_pdirpa[PDP_SIZE]; /* PA of PDs (read-only after create) */
struct vm_page *pm_ptphint[PTP_LEVELS-1];
/* pointer to a PTP in our pmap */
struct pmap_statistics pm_stats; /* pmap stats */
struct pv_entry *pm_pve; /* spare pv_entry */
LIST_HEAD(, pv_page) pm_pvp_part;
LIST_HEAD(, pv_page) pm_pvp_empty;
LIST_HEAD(, pv_page) pm_pvp_full;
#if !defined(__x86_64__)
vaddr_t pm_hiexec; /* highest executable mapping */
#endif /* !defined(__x86_64__) */
union descriptor *pm_ldt; /* user-set LDT */
size_t pm_ldt_len; /* XXX unused, remove */
int pm_ldt_sel; /* LDT selector */
kcpuset_t *pm_cpus; /* mask of CPUs using pmap */
kcpuset_t *pm_kernel_cpus; /* mask of CPUs using kernel part
of pmap */
kcpuset_t *pm_xen_ptp_cpus; /* mask of CPUs which have this pmap's
ptp mapped */
long pm_pctr; /* for assertions */
LIST_HEAD(,vm_page) pm_gc_ptp; /* PTPs queued for free */
/* Used by NVMM and Xen */
int (*pm_enter)(struct pmap *, vaddr_t, paddr_t, vm_prot_t, u_int);
bool (*pm_extract)(struct pmap *, vaddr_t, paddr_t *);
void (*pm_remove)(struct pmap *, vaddr_t, vaddr_t);
int (*pm_sync_pv)(struct vm_page *, vaddr_t, paddr_t, int, uint8_t *,
pt_entry_t *);
void (*pm_pp_remove_ent)(struct pmap *, struct vm_page *, pt_entry_t,
vaddr_t);
void (*pm_write_protect)(struct pmap *, vaddr_t, vaddr_t, vm_prot_t);
void (*pm_unwire)(struct pmap *, vaddr_t);
void (*pm_tlb_flush)(struct pmap *);
void *pm_data;
kmutex_t pm_lock /* locks for pm_objs */
__aligned(64); /* give lock own cache line */
krwlock_t pm_dummy_lock; /* ugly hack for abusing uvm_object */
};
/* macro to access pm_pdirpa slots */
#ifdef PAE
#define pmap_pdirpa(pmap, index) \
((pmap)->pm_pdirpa[l2tol3(index)] + l2tol2(index) * sizeof(pd_entry_t))
#else
#define pmap_pdirpa(pmap, index) \
((pmap)->pm_pdirpa[0] + (index) * sizeof(pd_entry_t))
#endif
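/*
* Worked example (editor's note): without PAE, pmap_pdirpa(pm, 5) is simply
* pm->pm_pdirpa[0] + 5 * sizeof(pd_entry_t), i.e. the physical address of
* the sixth PDE.  With PAE the L2 directory spans several pages, so l2tol3()
* selects which pm_pdirpa[] page holds the slot and l2tol2() gives the
* offset within that page.
*/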
/*
* global kernel variables
*/
/*
* PDPpaddr is the physical address of the kernel's PDP.
* - i386 non-PAE and amd64: PDPpaddr corresponds directly to the %cr3
* value associated to the kernel process, proc0.
* - i386 PAE: it still represents the PA of the kernel's PDP (L2). Due to
* the L3 PD, it cannot be considered as the equivalent of a %cr3 any more.
* - Xen: it corresponds to the PFN of the kernel's PDP.
*/
extern u_long PDPpaddr;
extern pd_entry_t pmap_pg_g; /* do we support PTE_G? */
extern pd_entry_t pmap_pg_nx; /* do we support PTE_NX? */
extern int pmap_largepages;
extern long nkptp[PTP_LEVELS];
#define pmap_valid_entry(E) ((E) & PTE_P) /* is PDE or PTE valid? */
void pmap_map_ptes(struct pmap *, struct pmap **, pd_entry_t **,
pd_entry_t * const **);
void pmap_unmap_ptes(struct pmap *, struct pmap *);
bool pmap_pdes_valid(vaddr_t, pd_entry_t * const *, pd_entry_t *,
int *lastlvl);
bool pmap_is_curpmap(struct pmap *);
void pmap_ept_transform(struct pmap *);
#ifndef __HAVE_DIRECT_MAP
void pmap_vpage_cpu_init(struct cpu_info *);
#endif
vaddr_t slotspace_rand(int, size_t, size_t, size_t, vaddr_t);
vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
typedef enum tlbwhy {
TLBSHOOT_REMOVE_ALL,
TLBSHOOT_KENTER,
TLBSHOOT_KREMOVE,
TLBSHOOT_FREE_PTP,
TLBSHOOT_REMOVE_PTE,
TLBSHOOT_SYNC_PV,
TLBSHOOT_WRITE_PROTECT,
TLBSHOOT_ENTER,
TLBSHOOT_NVMM,
TLBSHOOT_BUS_DMA,
TLBSHOOT_BUS_SPACE,
TLBSHOOT__MAX,
} tlbwhy_t;
void pmap_tlb_init(void);
void pmap_tlb_cpu_init(struct cpu_info *);
void pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, tlbwhy_t);
void pmap_tlb_shootnow(void);
void pmap_tlb_intr(void);
/*
* inline functions
*/
/*
* pmap_update_pg: flush one page from the TLB (or flush the whole thing
* if hardware doesn't support one-page flushing)
*/
__inline static void __unused
pmap_update_pg(vaddr_t va)
{
invlpg(va);
}
/*
* various address inlines
*
* vtopte: return a pointer to the PTE mapping a VA, works only for
* user and PT addresses
*
* kvtopte: return a pointer to the PTE mapping a kernel VA
*/
#include <lib/libkern/libkern.h>
static __inline pt_entry_t * __unused
vtopte(vaddr_t va)
{
KASSERT(va < VM_MIN_KERNEL_ADDRESS);
return (PTE_BASE + pl1_i(va));
}
static __inline pt_entry_t * __unused
kvtopte(vaddr_t va)
{
pd_entry_t *pde;
KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
pde = L2_BASE + pl2_i(va);
if (*pde & PTE_PS)
return ((pt_entry_t *)pde);
return (PTE_BASE + pl1_i(va));
}
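/*
* Usage sketch (editor's illustration, assuming an ordinary 4KB kernel
* mapping):
*
*	pt_entry_t pte = *kvtopte(va);
*
*	if (pmap_valid_entry(pte))
*		pa = (pte & PTE_FRAME) | (va & PAGE_MASK);
*
* For a superpage, kvtopte() returns the PDE itself (note the PTE_PS check
* above), so the frame bits then describe a large page, not a 4KB one.
*/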
#ifdef XENPV
#include <sys/bitops.h>
#define XPTE_MASK L1_FRAME
/* Selects the index of a PTE in (A)PTE_BASE */
#define XPTE_SHIFT (L1_SHIFT - ilog2(sizeof(pt_entry_t)))
/* PTE access inline functions */
/*
* Get the machine address of the pointed pte
* We use the hardware MMU to get the value, so this works only for levels 1-3
*/
static __inline paddr_t
xpmap_ptetomach(pt_entry_t *pte)
{
pt_entry_t *up_pte;
vaddr_t va = (vaddr_t) pte;
va = ((va & XPTE_MASK) >> XPTE_SHIFT) | (vaddr_t) PTE_BASE;
up_pte = (pt_entry_t *) va;
return (paddr_t) (((*up_pte) & PTE_FRAME) + (((vaddr_t) pte) & (~PTE_FRAME & ~VA_SIGN_MASK)));
}
/* Xen helpers to change bits of a pte */
#define XPMAP_UPDATE_DIRECT 1 /* Update direct map entry flags too */
paddr_t vtomach(vaddr_t);
#define vtomfn(va) (vtomach(va) >> PAGE_SHIFT)
#endif /* XENPV */
#ifdef __HAVE_PCPU_AREA
extern struct pcpu_area *pcpuarea;
#define PDIR_SLOT_PCPU 510
#define PMAP_PCPU_BASE (VA_SIGN_NEG((PDIR_SLOT_PCPU * NBPD_L4)))
#endif
void svs_quad_copy(void *, void *, long);
#ifdef _KERNEL_OPT
#include "opt_efi.h"
#endif
#ifdef EFI_RUNTIME
void * pmap_activate_sync(struct pmap *);
void pmap_deactivate_sync(struct pmap *, void *);
bool pmap_is_user(struct pmap *);
#else
static inline bool
pmap_is_user(struct pmap *pmap)
{
KASSERT(pmap != pmap_kernel());
return true;
}
#endif
#endif /* _X86_PMAP_PRIVATE_H_ */
/* $NetBSD: kern_mutex_obj.c,v 1.15 2023/10/02 21:03:55 ad Exp $ */
/*-
* Copyright (c) 2008, 2019, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_mutex_obj.c,v 1.15 2023/10/02 21:03:55 ad Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/mutex.h>
#include <sys/kmem.h>
/* Mutex cache */
#define MUTEX_OBJ_MAGIC 0x5aa3c85d
struct kmutexobj {
kmutex_t mo_lock;
u_int mo_magic;
u_int mo_refcnt;
uint8_t mo_pad[COHERENCY_UNIT - sizeof(kmutex_t) -
sizeof(u_int) * 2];
};
/*
* mutex_obj_alloc:
*
* Allocate a single lock object, waiting for memory if needed.
*/
kmutex_t *
mutex_obj_alloc(kmutex_type_t type, int ipl)
{
struct kmutexobj *mo;
mo = kmem_intr_alloc(sizeof(*mo), KM_SLEEP);
KASSERT(ALIGNED_POINTER(mo, coherency_unit));
_mutex_init(&mo->mo_lock, type, ipl,
(uintptr_t)__builtin_return_address(0));
mo->mo_magic = MUTEX_OBJ_MAGIC;
mo->mo_refcnt = 1;
return (kmutex_t *)mo;
}
/*
* mutex_obj_tryalloc:
*
* Allocate a single lock object, failing if no memory available.
*/
kmutex_t *
mutex_obj_tryalloc(kmutex_type_t type, int ipl)
{
struct kmutexobj *mo;
mo = kmem_intr_alloc(sizeof(*mo), KM_NOSLEEP);
KASSERT(ALIGNED_POINTER(mo, coherency_unit));
if (__predict_true(mo != NULL)) {
_mutex_init(&mo->mo_lock, type, ipl,
(uintptr_t)__builtin_return_address(0));
mo->mo_magic = MUTEX_OBJ_MAGIC;
mo->mo_refcnt = 1;
}
return (kmutex_t *)mo;
}
/*
* mutex_obj_hold:
*
* Add a single reference to a lock object. A reference to the object
* must already be held, and must be held across this call.
*/
void
mutex_obj_hold(kmutex_t *lock)
{
struct kmutexobj *mo = (struct kmutexobj *)lock;
KASSERTMSG(mo->mo_magic == MUTEX_OBJ_MAGIC,
"%s: lock %p: mo->mo_magic (%#x) != MUTEX_OBJ_MAGIC (%#x)",
__func__, mo, mo->mo_magic, MUTEX_OBJ_MAGIC);
KASSERTMSG(mo->mo_refcnt > 0,
"%s: lock %p: mo->mo_refcnt (%#x) == 0",
__func__, mo, mo->mo_refcnt);
atomic_inc_uint(&mo->mo_refcnt);
}
/*
* mutex_obj_free:
*
* Drop a reference from a lock object. If the last reference is being
* dropped, free the object and return true. Otherwise, return false.
*/
bool
mutex_obj_free(kmutex_t *lock)
{
struct kmutexobj *mo = (struct kmutexobj *)lock;
KASSERTMSG(mo->mo_magic == MUTEX_OBJ_MAGIC,
"%s: lock %p: mo->mo_magic (%#x) != MUTEX_OBJ_MAGIC (%#x)",
__func__, mo, mo->mo_magic, MUTEX_OBJ_MAGIC);
KASSERTMSG(mo->mo_refcnt > 0,
"%s: lock %p: mo->mo_refcnt (%#x) == 0",
__func__, mo, mo->mo_refcnt);
membar_release();
if (atomic_dec_uint_nv(&mo->mo_refcnt) > 0) {
return false;
}
membar_acquire();
mutex_destroy(&mo->mo_lock);
kmem_intr_free(mo, sizeof(*mo));
return true;
}
/*
* mutex_obj_refcnt:
*
* Return the reference count on a lock object.
*/
u_int
mutex_obj_refcnt(kmutex_t *lock)
{
struct kmutexobj *mo = (struct kmutexobj *)lock;
return mo->mo_refcnt;
}
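/*
* Typical lifecycle (editor's sketch, not part of the original source):
*
*	kmutex_t *lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
*
*	mutex_obj_hold(lock);		(a second user takes its own reference)
*	mutex_enter(lock);		(used like any other kmutex_t)
*	mutex_exit(lock);
*	last = mutex_obj_free(lock);	(drops one reference)
*
* Each holder calls mutex_obj_free() exactly once; only the caller that gets
* "true" back knows the mutex has been destroyed and freed.
*/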
/* $NetBSD: if_media_80.c,v 1.5 2022/08/03 01:38:51 riastradh Exp $ */
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1997
* Jonathan Stone and Jason R. Thorpe. All rights reserved.
*
* This software is derived from information provided by Matt Thomas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Jonathan Stone
* and Jason R. Thorpe for the NetBSD Project.
* 4. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_media_80.c,v 1.5 2022/08/03 01:38:51 riastradh Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/syscallargs.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/compat_stub.h>
#include <net/if.h>
#include <net/if_media.h>
#include <compat/sys/sockio.h>
#include <compat/common/compat_mod.h>
static void
ifmword_n2o(int *oldwd, int *newwd)
{
if (IFM_SUBTYPE(*newwd) > IFM_OTHER)
*oldwd = (*newwd & ~(_IFM_ETH_XTMASK | IFM_TMASK)) | IFM_OTHER;
else
*oldwd = *newwd;
}
/*ARGSUSED*/
static int
compat_ifmediareq_pre(struct ifreq *ifr, u_long *cmd, bool *do_post)
{
struct ifmediareq *ifmr = (struct ifmediareq *)ifr;
switch (*cmd) {
case SIOCSIFMEDIA_80:
*cmd = SIOCSIFMEDIA; /* Convert to new one */
if ((IFM_TYPE(ifr->ifr_media) == IFM_ETHER) &&
IFM_SUBTYPE(ifr->ifr_media) > IFM_OTHER) {
/* Clear unused bits so we do not switch to the wrong media */
ifr->ifr_media &= ~_IFM_ETH_XTMASK;
}
return 0;
case SIOCGIFMEDIA_80:
*cmd = SIOCGIFMEDIA; /* Convert to new one */
if (ifmr->ifm_count != 0) {
/*
* Tell the upper layer to try to convert each ifmedia
* entry in the post process.
*/
*do_post = true;
}
return 0;
default:
return 0;
}
}
/*ARGSUSED*/
static int
compat_ifmediareq_post(struct ifreq *ifr, u_long cmd)
{
struct ifmediareq *ifmr = (struct ifmediareq *)ifr;
size_t minwords;
size_t count;
int error, *kptr;
switch (cmd) {
case SIOCSIFMEDIA:
return 0;
case SIOCGIFMEDIA:
if (ifmr->ifm_count < 0)
return EINVAL;
/*
* ifmr->ifm_count was already adjusted in ifmedia_ioctl(), so
* it is safe to trust ifm_count here.
*/
minwords = ifmr->ifm_count;
kptr = malloc(minwords * sizeof(*kptr), M_TEMP,
M_WAITOK|M_ZERO);
if (kptr == NULL)
return ENOMEM;
/*
* Convert ifm_current and ifm_active.
* It's not required to convert ifm_mask.
*/
ifmword_n2o(&ifmr->ifm_current, &ifmr->ifm_current);
ifmword_n2o(&ifmr->ifm_active, &ifmr->ifm_active);
/* Convert ifm_ulist array */
for (count = 0; count < minwords; count++) {
int oldmwd;
error = ufetch_int(&ifmr->ifm_ulist[count], &oldmwd);
if (error != 0)
goto out;
ifmword_n2o(&kptr[count], &oldmwd);
}
/* Copy to userland in old format */
error = copyout(kptr, ifmr->ifm_ulist,
minwords * sizeof(*kptr));
out:
free(kptr, M_TEMP);
return error;
default:
return 0;
}
}
void
ifmedia_80_init(void)
{
MODULE_HOOK_SET(ifmedia_80_pre_hook, compat_ifmediareq_pre);
MODULE_HOOK_SET(ifmedia_80_post_hook, compat_ifmediareq_post);
}
void
ifmedia_80_fini(void)
{
MODULE_HOOK_UNSET(ifmedia_80_post_hook);
MODULE_HOOK_UNSET(ifmedia_80_pre_hook);
}
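/*
 * Illustrative sketch (not compiled): the effect of ifmword_n2o() on a
 * single media word.  A new-style Ethernet subtype beyond IFM_OTHER has
 * no representation in the old ABI, so it is reported as IFM_OTHER with
 * the extended type bits stripped; any other word passes through
 * unchanged.  The value of "newwd" is hypothetical.
 */
#if 0
	int newwd = 0;	/* some new-style IFM_ETHER media word */
	int oldwd;

	ifmword_n2o(&oldwd, &newwd);
	/*
	 * If IFM_SUBTYPE(newwd) > IFM_OTHER, then
	 *   oldwd == ((newwd & ~(_IFM_ETH_XTMASK | IFM_TMASK)) | IFM_OTHER);
	 * otherwise oldwd == newwd.
	 */
#endif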
/* $NetBSD: ufs_rename.c,v 1.14 2021/10/20 03:08:19 thorpej Exp $ */
/*-
* Copyright (c) 2012 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* UFS Rename
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_rename.c,v 1.14 2021/10/20 03:08:19 thorpej Exp $");
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/vnode.h>
#include <sys/vnode_if.h>
#include <sys/wapbl.h>
#include <miscfs/genfs/genfs.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <ufs/ufs/ufsmount.h>
/*
* Forward declarations
*/
static int ufs_sane_rename(struct vnode *, struct componentname *,
struct vnode *, struct componentname *,
kauth_cred_t, bool);
static bool ufs_rename_ulr_overlap_p(const struct ufs_lookup_results *,
const struct ufs_lookup_results *);
static int ufs_rename_recalculate_fulr(struct vnode *,
struct ufs_lookup_results *, const struct ufs_lookup_results *,
const struct componentname *);
static int ufs_direct_namlen(const struct direct *, const struct vnode *);
static int ufs_read_dotdot(struct vnode *, kauth_cred_t, ino_t *);
static int ufs_dirbuf_dotdot_namlen(const struct dirtemplate *,
const struct vnode *);
static const struct genfs_rename_ops ufs_genfs_rename_ops;
/*
* ufs_sane_rename: The hairiest vop, with the saner API.
*
* Arguments:
*
* . fdvp (from directory vnode),
* . fcnp (from component name),
* . tdvp (to directory vnode),
* . tcnp (to component name),
* . cred (credentials structure), and
* . posixly_correct (flag for behaviour if target & source link same file).
*
* fdvp and tdvp may be the same, and must be referenced and unlocked.
*/
static int
ufs_sane_rename(
struct vnode *fdvp, struct componentname *fcnp,
struct vnode *tdvp, struct componentname *tcnp,
kauth_cred_t cred, bool posixly_correct)
{
struct ufs_lookup_results fulr, tulr;
return genfs_sane_rename(&ufs_genfs_rename_ops,
fdvp, fcnp, &fulr, tdvp, tcnp, &tulr,
cred, posixly_correct);
}
/*
* ufs_rename: The hairiest vop, with the insanest API. Defer to
* genfs_insane_rename immediately.
*/
int
ufs_rename(void *v)
{
return genfs_insane_rename(v, &ufs_sane_rename);
}
/*
* ufs_gro_directory_empty_p: Return true if the directory vp is
* empty. dvp is its parent.
*
* vp and dvp must be locked and referenced.
*/
bool
ufs_gro_directory_empty_p(struct mount *mp, kauth_cred_t cred,
struct vnode *vp, struct vnode *dvp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != NULL);
KASSERT(vp != dvp);
KASSERT(vp->v_mount == mp);
KASSERT(dvp->v_mount == mp);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
return ufs_dirempty(VTOI(vp), VTOI(dvp)->i_number, cred);
}
/*
* ufs_gro_rename_check_possible: Check whether a rename is possible
* independent of credentials.
*/
int
ufs_gro_rename_check_possible(struct mount *mp,
struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == mp);
KASSERT(fvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
KASSERT((tvp == NULL) || (tvp->v_mount == mp));
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
return genfs_ufslike_rename_check_possible(
VTOI(fdvp)->i_flags, VTOI(fvp)->i_flags,
VTOI(tdvp)->i_flags, (tvp? VTOI(tvp)->i_flags : 0),
(tvp != NULL),
IMMUTABLE, APPEND);
}
/*
* ufs_gro_rename_check_permitted: Check whether a rename is permitted
* given our credentials.
*/
int
ufs_gro_rename_check_permitted(struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == mp);
KASSERT(fvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
KASSERT((tvp == NULL) || (tvp->v_mount == mp));
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
return genfs_ufslike_rename_check_permitted(cred,
fdvp, VTOI(fdvp)->i_mode, VTOI(fdvp)->i_uid,
fvp, VTOI(fvp)->i_uid,
tdvp, VTOI(tdvp)->i_mode, VTOI(tdvp)->i_uid,
tvp, (tvp? VTOI(tvp)->i_uid : 0));
}
/*
* ufs_gro_remove_check_possible: Check whether a remove is possible
* independent of credentials.
*/
int
ufs_gro_remove_check_possible(struct mount *mp,
struct vnode *dvp, struct vnode *vp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(dvp->v_mount == mp);
KASSERT(vp->v_mount == mp);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
return genfs_ufslike_remove_check_possible(
VTOI(dvp)->i_flags, VTOI(vp)->i_flags,
IMMUTABLE, APPEND);
}
/*
* ufs_gro_remove_check_permitted: Check whether a remove is permitted
* given our credentials.
*/
int
ufs_gro_remove_check_permitted(struct mount *mp, kauth_cred_t cred,
struct vnode *dvp, struct vnode *vp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(dvp->v_mount == mp);
KASSERT(vp->v_mount == mp);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
return genfs_ufslike_remove_check_permitted(cred,
dvp, VTOI(dvp)->i_mode, VTOI(dvp)->i_uid, vp, VTOI(vp)->i_uid);
}
/*
* A virgin directory (no blushing please).
*
* XXX Copypasta from ufs_vnops.c. Kill!
*/
static const struct dirtemplate mastertemplate = {
0, 12, DT_DIR, 1, ".",
0, UFS_DIRBLKSIZ - 12, DT_DIR, 2, ".."
};
/*
* ufs_gro_rename: Actually perform the rename operation.
*/
int
ufs_gro_rename(struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct componentname *fcnp,
void *fde, struct vnode *fvp,
struct vnode *tdvp, struct componentname *tcnp,
void *tde, struct vnode *tvp, nlink_t *tvp_nlinkp)
{
struct ufs_lookup_results *fulr = fde;
struct ufs_lookup_results *tulr = tde;
bool directory_p, reparent_p;
struct direct *newdir;
int error;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fulr != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
KASSERT(tulr != NULL);
KASSERT(fulr != tulr);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_mount == mp);
KASSERT(fvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
KASSERT((tvp == NULL) || (tvp->v_mount == mp));
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
/*
* We shall need to temporarily bump the link count, so make
* sure there is room to do so.
*/
if ((nlink_t)VTOI(fvp)->i_nlink >= LINK_MAX)
return EMLINK;
directory_p = (fvp->v_type == VDIR);
KASSERT(directory_p == ((VTOI(fvp)->i_mode & IFMT) == IFDIR));
KASSERT((tvp == NULL) || (directory_p == (tvp->v_type == VDIR)));
KASSERT((tvp == NULL) ||
(directory_p == ((VTOI(tvp)->i_mode & IFMT) == IFDIR)));
reparent_p = (fdvp != tdvp);
KASSERT(reparent_p == (VTOI(fdvp)->i_number != VTOI(tdvp)->i_number));
/*
* Commence hacking of the data on disk.
*/
error = UFS_WAPBL_BEGIN(mp);
if (error)
goto ihateyou;
/*
* 1) Bump link count while we're moving stuff
* around. If we crash somewhere before
* completing our work, the link count
* may be wrong, but correctable.
*/
KASSERT((nlink_t)VTOI(fvp)->i_nlink < LINK_MAX);
VTOI(fvp)->i_nlink++;
DIP_ASSIGN(VTOI(fvp), nlink, VTOI(fvp)->i_nlink);
VTOI(fvp)->i_flag |= IN_CHANGE;
error = UFS_UPDATE(fvp, NULL, NULL, UPDATE_DIROP);
if (error)
goto whymustithurtsomuch;
/*
* 2) If target doesn't exist, link the target
* to the source and unlink the source.
* Otherwise, rewrite the target directory
* entry to reference the source inode and
* expunge the original entry's existence.
*/
if (tvp == NULL) {
/*
* Account for ".." in new directory.
* When source and destination have the same
* parent we don't fool with the link count.
*/
if (directory_p && reparent_p) {
if ((nlink_t)VTOI(tdvp)->i_nlink >= LINK_MAX) {
error = EMLINK;
goto whymustithurtsomuch;
}
KASSERT((nlink_t)VTOI(tdvp)->i_nlink < LINK_MAX);
VTOI(tdvp)->i_nlink++;
DIP_ASSIGN(VTOI(tdvp), nlink, VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_flag |= IN_CHANGE;
error = UFS_UPDATE(tdvp, NULL, NULL, UPDATE_DIROP);
if (error) {
/*
* Link count update didn't take --
* back out the in-memory link count.
*/
KASSERT(0 < VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_nlink--;
DIP_ASSIGN(VTOI(tdvp), nlink,
VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_flag |= IN_CHANGE;
goto whymustithurtsomuch;
}
}
newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
ufs_makedirentry(VTOI(fvp), tcnp, newdir);
error = ufs_direnter(tdvp, tulr, NULL, newdir, tcnp, NULL);
pool_cache_put(ufs_direct_cache, newdir);
if (error) {
if (directory_p && reparent_p) {
/*
* Directory update didn't take, but
* the link count update did -- back
* out the in-memory link count and the
* on-disk link count.
*/
KASSERT(0 < VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_nlink--;
DIP_ASSIGN(VTOI(tdvp), nlink,
VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_flag |= IN_CHANGE;
(void)UFS_UPDATE(tdvp, NULL, NULL,
UPDATE_WAIT | UPDATE_DIROP);
}
goto whymustithurtsomuch;
}
} else {
if (directory_p)
/* XXX WTF? Why purge here? Why not purge others? */
cache_purge(tdvp);
/*
* Make the target directory's entry for tcnp point at
* the source node.
*
* XXX ufs_dirrewrite decrements tvp's link count, but
* doesn't touch the link count of the new inode. Go
* figure.
*/
error = ufs_dirrewrite(VTOI(tdvp), tulr->ulr_offset,
VTOI(tvp), VTOI(fvp)->i_number, IFTODT(VTOI(fvp)->i_mode),
((directory_p && reparent_p) ? reparent_p : directory_p),
IN_CHANGE | IN_UPDATE);
if (error)
goto whymustithurtsomuch;
/*
* If the source and target are directories, and the
* target is in the same directory as the source,
* decrement the link count of the common parent
* directory, since we are removing the target from
* that directory.
*/
if (directory_p && !reparent_p) {
KASSERT(fdvp == tdvp);
/* XXX check, don't kassert */
KASSERT(0 < VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_nlink--;
DIP_ASSIGN(VTOI(tdvp), nlink, VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(tdvp, NULL, NULL, 0);
}
if (directory_p) {
/*
* XXX I don't understand the following comment
* from ufs_rename -- in particular, the part
* about `there may be other hard links'.
*
* Truncate inode. The only stuff left in the directory
* is "." and "..". The "." reference is inconsequential
* since we are quashing it. We have removed the "."
* reference and the reference in the parent directory,
* but there may be other hard links.
*
* XXX The ufs_dirempty call earlier does
* not guarantee anything about nlink.
*/
if (VTOI(tvp)->i_nlink != 1)
ufs_dirbad(VTOI(tvp), (doff_t)0,
"hard-linked directory");
VTOI(tvp)->i_nlink = 0;
DIP_ASSIGN(VTOI(tvp), nlink, 0);
(void) UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC, cred);
}
}
/*
* If the source is a directory with a new parent, the link
* count of the old parent directory must be decremented and
* ".." set to point to the new parent.
*
* XXX ufs_dirrewrite updates the link count of fdvp, but not
* the link count of fvp or the link count of tdvp. Go figure.
*/
if (directory_p && reparent_p) {
error = ufs_dirrewrite(VTOI(fvp), mastertemplate.dot_reclen,
VTOI(fdvp), VTOI(tdvp)->i_number, DT_DIR, 0, IN_CHANGE);
#if 0 /* XXX This branch was not in ufs_rename! */
if (error)
goto whymustithurtsomuch;
#endif
/* XXX WTF? Why purge here? Why not purge others? */
cache_purge(fdvp);
}
/*
* 3) Unlink the source.
*/
/*
* ufs_direnter may compact the directory in the process of
* inserting a new entry. That may invalidate fulr, which we
* need in order to remove the old entry. In that case, we
* need to recalculate what fulr should be.
*/
if (!reparent_p && (tvp == NULL) &&
ufs_rename_ulr_overlap_p(fulr, tulr)) {
error = ufs_rename_recalculate_fulr(fdvp, fulr, tulr, fcnp);
#if 0 /* XXX */
if (error) /* XXX Try to back out changes? */
goto whymustithurtsomuch;
#endif
}
/*
* XXX 0 means !isrmdir. But can't this be an rmdir?
* XXX Well, turns out that argument to ufs_dirremove is ignored...
* XXX And it turns out ufs_dirremove updates the link count of fvp.
* XXX But it doesn't update the link count of fdvp. Go figure.
* XXX fdvp's link count is updated in ufs_dirrewrite instead.
* XXX Actually, sometimes it doesn't update fvp's link count.
* XXX I hate the world.
*/
error = ufs_dirremove(fdvp, fulr, VTOI(fvp), fcnp->cn_flags, 0);
if (error)
#if 0 /* XXX */
goto whymustithurtsomuch;
#endif
goto arghmybrainhurts;
if (tvp != NULL) {
*tvp_nlinkp = VTOI(tvp)->i_nlink;
}
#if 0 /* XXX */
genfs_rename_cache_purge(fdvp, fvp, tdvp, tvp);
#endif
goto arghmybrainhurts;
whymustithurtsomuch:
KASSERT(0 < VTOI(fvp)->i_nlink);
VTOI(fvp)->i_nlink--;
DIP_ASSIGN(VTOI(fvp), nlink, VTOI(fvp)->i_nlink);
VTOI(fvp)->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(fvp, NULL, NULL, 0);
arghmybrainhurts:
UFS_WAPBL_END(mp);
ihateyou:
return error;
}
/*
* ufs_rename_ulr_overlap_p: True iff tulr overlaps with fulr so that
* entering a directory entry at tulr may move fulr.
*/
static bool
ufs_rename_ulr_overlap_p(const struct ufs_lookup_results *fulr,
const struct ufs_lookup_results *tulr)
{
doff_t from_prev_start, from_prev_end, to_start, to_end;
KASSERT(fulr != NULL);
KASSERT(tulr != NULL);
KASSERT(fulr != tulr);
/*
* fulr is from a DELETE lookup, so fulr->ulr_count is the size
* of the preceding entry (d_reclen).
*/
from_prev_end = fulr->ulr_offset;
KASSERT(fulr->ulr_count <= from_prev_end);
from_prev_start = (from_prev_end - fulr->ulr_count);
/*
* tulr is from a RENAME lookup, so tulr->ulr_count is the size
* of the free space for an entry that we are about to fill.
*/
to_start = tulr->ulr_offset;
KASSERT(tulr->ulr_count < (UFS_MAXDIRSIZE - to_start));
to_end = (to_start + tulr->ulr_count);
return
(((to_start <= from_prev_start) && (from_prev_start < to_end)) ||
((to_start <= from_prev_end) && (from_prev_end < to_end)));
}
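/*
 * A concrete (hypothetical) example of the overlap test above: if the
 * entry to be removed is at fulr->ulr_offset == 512 and the entry
 * preceding it has d_reclen == 24, the preceding entry occupies
 * [488, 512).  A target slot with tulr->ulr_offset == 480 and
 * tulr->ulr_count == 64 covers [480, 544), which contains 488, so the
 * test returns true and fulr must be recalculated after ufs_direnter
 * compacts the block.  A target slot covering [1024, 1088) contains
 * neither 488 nor 512, so the test returns false.
 */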
/*
* ufs_rename_recalculate_fulr: If we have just entered a directory into
* dvp at tulr, and we were about to remove one at fulr for an entry
* named fcnp, fulr may be invalid. So, if necessary, recalculate it.
*/
static int
ufs_rename_recalculate_fulr(struct vnode *dvp,
struct ufs_lookup_results *fulr, const struct ufs_lookup_results *tulr,
const struct componentname *fcnp)
{
struct mount *mp;
struct ufsmount *ump;
int needswap;
/* XXX int is a silly type for this; blame ufsmount::um_dirblksiz. */
int dirblksiz;
doff_t search_start, search_end;
doff_t offset; /* Offset of entry we're examining. */
struct buf *bp; /* I/O block we're examining. */
char *dirbuf; /* Pointer into directory at search_start. */
struct direct *ep; /* Pointer to the entry we're examining. */
/* XXX direct::d_reclen is 16-bit;
* ufs_lookup_results::ulr_reclen is 32-bit. Blah. */
uint32_t reclen; /* Length of the entry we're examining. */
uint32_t prev_reclen; /* Length of the preceding entry. */
int error;
KASSERT(dvp != NULL);
KASSERT(dvp->v_mount != NULL);
KASSERT(VTOI(dvp) != NULL);
KASSERT(fulr != NULL);
KASSERT(tulr != NULL);
KASSERT(fulr != tulr);
KASSERT(ufs_rename_ulr_overlap_p(fulr, tulr));
mp = dvp->v_mount;
ump = VFSTOUFS(mp);
KASSERT(ump != NULL);
KASSERT(ump == VTOI(dvp)->i_ump);
needswap = UFS_MPNEEDSWAP(ump);
dirblksiz = ump->um_dirblksiz;
KASSERT(0 < dirblksiz);
KASSERT((dirblksiz & (dirblksiz - 1)) == 0);
/* A directory block may not span across multiple I/O blocks. */
KASSERT(dirblksiz <= mp->mnt_stat.f_iosize);
/* Find the bounds of the search. */
search_start = tulr->ulr_offset;
KASSERT(fulr->ulr_reclen < (UFS_MAXDIRSIZE - fulr->ulr_offset));
search_end = (fulr->ulr_offset + fulr->ulr_reclen);
/* Compaction must happen only within a directory block. (*) */
KASSERT(search_start <= search_end);
KASSERT((search_end - (search_start &~ (dirblksiz - 1))) <= dirblksiz);
dirbuf = NULL;
bp = NULL;
error = ufs_blkatoff(dvp, (off_t)search_start, &dirbuf, &bp, false);
if (error)
return error;
KASSERT(dirbuf != NULL);
KASSERT(bp != NULL);
/*
* Guarantee we sha'n't go past the end of the buffer we got.
* dirbuf is bp->b_data + (search_start & (iosize - 1)), and
* the valid range is [bp->b_data, bp->b_data + bp->b_bcount).
*/
KASSERT((search_end - search_start) <=
(bp->b_bcount - (search_start & (mp->mnt_stat.f_iosize - 1))));
prev_reclen = fulr->ulr_count;
offset = search_start;
/*
* Search from search_start to search_end for the entry matching
* fcnp, which must be there because we found it before and, at
* most, it can only have moved to an earlier offset.
*/
for (;;) {
KASSERT(search_start <= offset);
KASSERT(offset < search_end);
/*
* Examine the directory entry at offset.
*/
ep = (struct direct *)(dirbuf + (offset - search_start));
reclen = ufs_rw16(ep->d_reclen, needswap);
if (ep->d_ino == 0)
goto next; /* Entry is unused. */
if (ufs_rw32(ep->d_ino, needswap) == UFS_WINO)
goto next; /* Entry is whiteout. */
if (fcnp->cn_namelen != ufs_direct_namlen(ep, dvp))
goto next; /* Wrong name length. */
if (memcmp(ep->d_name, fcnp->cn_nameptr, fcnp->cn_namelen))
goto next; /* Wrong name. */
/* Got it! */
break;
next:
if (! ((reclen < search_end) &&
(offset < (search_end - reclen)))) {
brelse(bp, 0);
return EIO; /* XXX Panic? What? */
}
/* We may not move past the search end. */
KASSERT(reclen < search_end);
KASSERT(offset < (search_end - reclen));
/*
* We may not move across a directory block boundary;
* see (*) above.
*/
KASSERT((offset &~ (dirblksiz - 1)) ==
((offset + reclen) &~ (dirblksiz - 1)));
prev_reclen = reclen;
offset += reclen;
}
/*
* Found the entry. Record where.
*/
fulr->ulr_offset = offset;
fulr->ulr_reclen = reclen;
/*
* Record the preceding record length, but not if we're at the
* start of a directory block.
*/
fulr->ulr_count = ((offset & (dirblksiz - 1))? prev_reclen : 0);
brelse(bp, 0);
return 0;
}
/*
* ufs_direct_namlen: Return the namlen of the directory entry ep from
* the directory vp.
*/
static int /* XXX int? uint8_t? */
ufs_direct_namlen(const struct direct *ep, const struct vnode *vp)
{
bool swap;
KASSERT(ep != NULL);
KASSERT(vp != NULL);
KASSERT(VTOI(vp) != NULL);
KASSERT(VTOI(vp)->i_ump != NULL);
#if (BYTE_ORDER == LITTLE_ENDIAN)
swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) == 0);
#else
swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) != 0);
#endif
return ((FSFMT(vp) && swap)? ep->d_type : ep->d_namlen);
}
/*
* ufs_gro_remove: Rename an object over another link to itself,
* effectively removing just the original link.
*/
int
ufs_gro_remove(struct mount *mp, kauth_cred_t cred,
struct vnode *dvp, struct componentname *cnp, void *de, struct vnode *vp,
nlink_t *tvp_nlinkp)
{
struct ufs_lookup_results *ulr = de;
int error;
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(cnp != NULL);
KASSERT(ulr != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_mount == mp);
KASSERT(vp->v_mount == mp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(cnp->cn_nameiop == DELETE);
error = UFS_WAPBL_BEGIN(mp);
if (error)
goto out;
/* XXX ufs_dirremove decrements vp's link count for us. */
error = ufs_dirremove(dvp, ulr, VTOI(vp), cnp->cn_flags, 0);
UFS_WAPBL_END(mp);
*tvp_nlinkp = VTOI(vp)->i_nlink;
out:
return error;
}
/*
* ufs_gro_lookup: Look up and save the lookup results.
*/
int
ufs_gro_lookup(struct mount *mp, struct vnode *dvp,
struct componentname *cnp, void *de_ret, struct vnode **vp_ret)
{
struct ufs_lookup_results *ulr_ret = de_ret;
struct vnode *vp = NULL;
int error;
(void)mp;
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(cnp != NULL);
KASSERT(ulr_ret != NULL);
KASSERT(vp_ret != NULL);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
/* Kludge cargo-culted from dholland's ufs_rename. */
cnp->cn_flags &=~ MODMASK;
cnp->cn_flags |= (LOCKPARENT | LOCKLEAF);
error = relookup(dvp, &vp, cnp, 0 /* dummy */);
if ((error == 0) && (vp == NULL)) {
error = ENOENT;
goto out;
} else if (error) {
return error;
}
/*
* Thanks to VFS insanity, relookup locks vp, which screws us
* in various ways.
*/
KASSERT(vp != NULL);
VOP_UNLOCK(vp);
out: *ulr_ret = VTOI(dvp)->i_crap;
*vp_ret = vp;
return error;
}
/*
* ufs_rmdired_p: Check whether the directory vp has been rmdired.
*
* vp must be locked and referenced.
*/
static bool
ufs_rmdired_p(struct vnode *vp)
{
KASSERT(vp != NULL);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(vp->v_type == VDIR);
/* XXX Is this correct? */
return (VTOI(vp)->i_size == 0);
}
/*
* ufs_read_dotdot: Store in *ino_ret the inode number of the parent
* of the directory vp.
*/
static int
ufs_read_dotdot(struct vnode *vp, kauth_cred_t cred, ino_t *ino_ret)
{
struct dirtemplate dirbuf;
int error;
KASSERT(vp != NULL);
KASSERT(ino_ret != NULL);
KASSERT(vp->v_type == VDIR);
error = ufs_bufio(UIO_READ, vp, &dirbuf, sizeof dirbuf, (off_t)0,
IO_NODELOCKED, cred, NULL, NULL);
if (error)
return error;
if (ufs_dirbuf_dotdot_namlen(&dirbuf, vp) != 2 ||
dirbuf.dotdot_name[0] != '.' ||
dirbuf.dotdot_name[1] != '.')
/* XXX Panic? Print warning? */
return ENOTDIR;
*ino_ret = ufs_rw32(dirbuf.dotdot_ino,
UFS_MPNEEDSWAP(VTOI(vp)->i_ump));
return 0;
}
/*
* ufs_dirbuf_dotdot_namlen: Return the namlen of the directory buffer
* dirbuf that came from the directory vp. Swap byte order if
* necessary.
*/
static int /* XXX int? uint8_t? */
ufs_dirbuf_dotdot_namlen(const struct dirtemplate *dirbuf,
const struct vnode *vp)
{
bool swap;
KASSERT(dirbuf != NULL);
KASSERT(vp != NULL);
KASSERT(VTOI(vp) != NULL);
KASSERT(VTOI(vp)->i_ump != NULL);
#if (BYTE_ORDER == LITTLE_ENDIAN)
swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) == 0);
#else
swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) != 0);
#endif
return ((FSFMT(vp) && swap)?
dirbuf->dotdot_type : dirbuf->dotdot_namlen);
}
/*
* ufs_gro_genealogy: Analyze the genealogy of the source and target
* directories.
*/
int
ufs_gro_genealogy(struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct vnode *tdvp,
struct vnode **intermediate_node_ret)
{
struct vnode *vp, *dvp;
ino_t dotdot_ino = 0; /* XXX: gcc */
int error;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != tdvp);
KASSERT(intermediate_node_ret != NULL);
KASSERT(fdvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
/*
* We need to provisionally lock tdvp to keep rmdir from
* deleting it -- or any ancestor -- at an inopportune moment.
*/
error = ufs_gro_lock_directory(mp, tdvp);
if (error)
return error;
vp = tdvp;
vref(vp);
for (;;) {
KASSERT(vp != NULL);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(vp->v_mount == mp);
KASSERT(vp->v_type == VDIR);
KASSERT(!ufs_rmdired_p(vp));
/* Did we hit the root without finding fdvp? */
if (VTOI(vp)->i_number == UFS_ROOTINO) {
vput(vp);
*intermediate_node_ret = NULL;
return 0;
}
error = ufs_read_dotdot(vp, cred, &dotdot_ino);
if (error) {
vput(vp);
return error;
}
/* Did we find that fdvp is an ancestor of tdvp? */
if (VTOI(fdvp)->i_number == dotdot_ino) {
/* Unlock vp, but keep it referenced. */
VOP_UNLOCK(vp);
*intermediate_node_ret = vp;
return 0;
}
/* Neither -- keep ascending the family tree. */
error = vcache_get(mp, &dotdot_ino, sizeof(dotdot_ino), &dvp);
vput(vp);
if (error)
return error;
error = vn_lock(dvp, LK_EXCLUSIVE);
if (error) {
vrele(dvp);
return error;
}
KASSERT(dvp != NULL);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
vp = dvp;
if (vp->v_type != VDIR) {
/*
* XXX Panic? Print a warning? Can this
* happen if we lose the race I suspect to
* exist above, and the `..' inode number has
* been recycled?
*/
vput(vp);
return ENOTDIR;
}
if (ufs_rmdired_p(vp)) {
vput(vp);
return ENOENT;
}
}
}
/*
* ufs_gro_lock_directory: Lock the directory vp, but fail if it has
* been rmdir'd.
*/
int
ufs_gro_lock_directory(struct mount *mp, struct vnode *vp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(vp != NULL);
KASSERT(vp->v_mount == mp);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (ufs_rmdired_p(vp)) {
VOP_UNLOCK(vp);
return ENOENT;
}
return 0;
}
static const struct genfs_rename_ops ufs_genfs_rename_ops = {
.gro_directory_empty_p = ufs_gro_directory_empty_p,
.gro_rename_check_possible = ufs_gro_rename_check_possible,
.gro_rename_check_permitted = ufs_gro_rename_check_permitted,
.gro_remove_check_possible = ufs_gro_remove_check_possible,
.gro_remove_check_permitted = ufs_gro_remove_check_permitted,
.gro_rename = ufs_gro_rename,
.gro_remove = ufs_gro_remove,
.gro_lookup = ufs_gro_lookup,
.gro_genealogy = ufs_gro_genealogy,
.gro_lock_directory = ufs_gro_lock_directory,
};
/* $NetBSD: uvm_page_status.c,v 1.6 2020/08/14 09:06:15 chs Exp $ */
/*-
* Copyright (c)2011 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_page_status.c,v 1.6 2020/08/14 09:06:15 chs Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm.h>
/*
* page dirtiness status tracking
*
* separated from uvm_page.c mainly for rump
*/
/*
* these constants are chosen to match so that we can convert between
* them quickly.
*/
__CTASSERT(UVM_PAGE_STATUS_UNKNOWN == 0);
__CTASSERT(UVM_PAGE_STATUS_DIRTY == PG_DIRTY);
__CTASSERT(UVM_PAGE_STATUS_CLEAN == PG_CLEAN);
/*
* uvm_pagegetdirty: return the dirtiness status (one of UVM_PAGE_STATUS_
* values) of the page.
*
* called with the owner locked.
*/
unsigned int
uvm_pagegetdirty(struct vm_page *pg)
{
struct uvm_object * const uobj __diagused = pg->uobject;
KASSERT((~pg->flags & (PG_CLEAN|PG_DIRTY)) != 0);
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(uobj == NULL ||
((pg->flags & PG_CLEAN) == 0) == uvm_obj_page_dirty_p(pg));
return pg->flags & (PG_CLEAN|PG_DIRTY);
}
/*
* uvm_pagemarkdirty: set the dirtiness status (one of UVM_PAGE_STATUS_ values)
* of the page.
*
* called with the owner locked.
*
* update the radix tree tag for object-owned page.
*
* if new status is UVM_PAGE_STATUS_UNKNOWN, clear pmap-level dirty bit
* so that later uvm_pagecheckdirty() can notice modifications on the page.
*/
void
uvm_pagemarkdirty(struct vm_page *pg, unsigned int newstatus)
{
struct uvm_object * const uobj = pg->uobject;
const unsigned int oldstatus = uvm_pagegetdirty(pg);
enum cpu_count base;
KASSERT((~newstatus & (PG_CLEAN|PG_DIRTY)) != 0);
KASSERT((newstatus & ~(PG_CLEAN|PG_DIRTY)) == 0);
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(uobj == NULL ||
((pg->flags & PG_CLEAN) == 0) == uvm_obj_page_dirty_p(pg));
if (oldstatus == newstatus) {
return;
}
/*
* set UVM_PAGE_DIRTY_TAG tag unless known CLEAN so that putpages can
* find possibly-dirty pages quickly.
*/
if (uobj != NULL) {
if (newstatus == UVM_PAGE_STATUS_CLEAN) {
uvm_obj_page_clear_dirty(pg);
} else if (oldstatus == UVM_PAGE_STATUS_CLEAN) {
/*
* on first dirty page, mark the object dirty.
* for vnodes this inserts to the syncer worklist.
*/
if (uvm_obj_clean_p(uobj) &&
uobj->pgops->pgo_markdirty != NULL) {
(*uobj->pgops->pgo_markdirty)(uobj);
}
uvm_obj_page_set_dirty(pg);
}
}
if (newstatus == UVM_PAGE_STATUS_UNKNOWN) {
/*
* start relying on pmap-level dirtiness tracking.
*/
pmap_clear_modify(pg);
}
pg->flags &= ~(PG_CLEAN|PG_DIRTY);
pg->flags |= newstatus;
KASSERT(uobj == NULL || ((pg->flags & PG_CLEAN) == 0) ==
uvm_obj_page_dirty_p(pg));
if ((pg->flags & PG_STAT) != 0) {
if ((pg->flags & PG_SWAPBACKED) != 0) {
base = CPU_COUNT_ANONUNKNOWN;
} else {
base = CPU_COUNT_FILEUNKNOWN;
}
kpreempt_disable();
CPU_COUNT(base + oldstatus, -1);
CPU_COUNT(base + newstatus, +1);
kpreempt_enable();
}
}
/*
* uvm_pagecheckdirty: check if page is dirty, and remove its dirty bit.
*
* called with the owner locked.
*
* returns whether the page was dirty.
*
* if protected is true, mark the page CLEAN. otherwise, mark the page UNKNOWN.
* ("mark" in the sense of uvm_pagemarkdirty().)
*/
bool
uvm_pagecheckdirty(struct vm_page *pg, bool pgprotected)
{
const unsigned int oldstatus = uvm_pagegetdirty(pg);
bool modified;
KASSERT(uvm_page_owner_locked_p(pg, true));
/*
* if pgprotected is true, mark the page CLEAN.
* otherwise mark the page UNKNOWN unless it's CLEAN.
*
* possible transitions:
*
* CLEAN -> CLEAN , modified = false
* UNKNOWN -> UNKNOWN, modified = true
* UNKNOWN -> UNKNOWN, modified = false
* UNKNOWN -> CLEAN , modified = true
* UNKNOWN -> CLEAN , modified = false
* DIRTY -> UNKNOWN, modified = true
* DIRTY -> CLEAN , modified = true
*
* pmap_clear_modify is necessary if either of
* oldstatus or newstatus is UVM_PAGE_STATUS_UNKNOWN.
*/
if (oldstatus == UVM_PAGE_STATUS_CLEAN) {
modified = false;
} else {
const unsigned int newstatus = pgprotected ?
UVM_PAGE_STATUS_CLEAN : UVM_PAGE_STATUS_UNKNOWN;
if (oldstatus == UVM_PAGE_STATUS_DIRTY) {
modified = true;
if (newstatus == UVM_PAGE_STATUS_UNKNOWN) {
pmap_clear_modify(pg);
}
} else {
KASSERT(oldstatus == UVM_PAGE_STATUS_UNKNOWN);
modified = pmap_clear_modify(pg);
}
uvm_pagemarkdirty(pg, newstatus);
}
return modified;
}
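/*
 * Illustrative sketch (not compiled): how a pageout path might use
 * uvm_pagecheckdirty().  The page is write-protected first, so passing
 * pgprotected == true allows the page to be marked CLEAN; a true return
 * means the page was modified and still needs to be written out.  The
 * surrounding locking and bookkeeping are omitted and hypothetical.
 */
#if 0
	bool need_write;

	pmap_page_protect(pg, VM_PROT_READ);	/* block further writes */
	need_write = uvm_pagecheckdirty(pg, true);
	if (need_write) {
		/* queue the page for I/O */
	}
#endif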
/* $NetBSD: sys_descrip.c,v 1.48 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
*/
/*
* System calls on descriptors.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_descrip.c,v 1.48 2023/07/10 02:31:55 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/pool.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <uvm/uvm_readahead.h>
/*
* Duplicate a file descriptor.
*/
int
sys_dup(struct lwp *l, const struct sys_dup_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
} */
int error, newfd, oldfd;
file_t *fp;
oldfd = SCARG(uap, fd);
if ((fp = fd_getfile(oldfd)) == NULL) {
return EBADF;
}
error = fd_dup(fp, 0, &newfd, false);
fd_putfile(oldfd);
*retval = newfd;
return error;
}
/*
* Duplicate a file descriptor to a particular value.
*/
int
dodup(struct lwp *l, int from, int to, int flags, register_t *retval)
{
int error;
file_t *fp;
if ((fp = fd_getfile(from)) == NULL)
return EBADF;
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
fd_putfile(from);
if ((u_int)to >= curproc->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
(u_int)to >= maxfiles)
error = EBADF;
else if (from == to)
error = 0;
else
error = fd_dup2(fp, to, flags);
closef(fp);
*retval = to;
return error;
}
int
sys_dup3(struct lwp *l, const struct sys_dup3_args *uap, register_t *retval)
{
/* {
syscallarg(int) from;
syscallarg(int) to;
syscallarg(int) flags;
} */
return dodup(l, SCARG(uap, from), SCARG(uap, to), SCARG(uap, flags),
retval);
}
int
sys_dup2(struct lwp *l, const struct sys_dup2_args *uap, register_t *retval)
{
/* {
syscallarg(int) from;
syscallarg(int) to;
} */
return dodup(l, SCARG(uap, from), SCARG(uap, to), 0, retval);
}
/*
* fcntl call which is being passed to the file's fs.
*/
static int
fcntl_forfs(int fd, file_t *fp, int cmd, void *arg)
{
int error;
u_int size;
void *data, *memp;
#define STK_PARAMS 128
char stkbuf[STK_PARAMS];
if ((fp->f_flag & (FREAD | FWRITE)) == 0)
return (EBADF);
/*
* Interpret high order word to find amount of data to be
* copied to/from the user's address space.
*/
size = (size_t)F_PARAM_LEN(cmd);
if (size > F_PARAM_MAX)
return (EINVAL);
memp = NULL;
if (size > sizeof(stkbuf)) {
memp = kmem_alloc(size, KM_SLEEP);
data = memp;
} else
data = stkbuf;
if (cmd & F_FSIN) {
if (size) {
error = copyin(arg, data, size);
if (error) {
if (memp)
kmem_free(memp, size);
return (error);
}
} else
*(void **)data = arg;
} else if ((cmd & F_FSOUT) != 0 && size != 0) {
/*
* Zero the buffer so the user always
* gets back something deterministic.
*/
memset(data, 0, size);
} else if (cmd & F_FSVOID)
*(void **)data = arg;
error = (*fp->f_ops->fo_fcntl)(fp, cmd, data);
/*
* Copy any data to user, size was
* already set and checked above.
*/
if (error == 0 && (cmd & F_FSOUT) && size)
error = copyout(data, arg, size);
if (memp)
kmem_free(memp, size);
return (error);
}
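/*
 * Illustrative sketch (not compiled): how fcntl_forfs() above decodes a
 * file-system-private command word.  The direction bits and the
 * parameter length are carried in the command itself; "cmd" is a
 * hypothetical F_FSCTL-style command.
 */
#if 0
	u_int size = F_PARAM_LEN(cmd);		/* bytes of argument data */
	bool copies_in = (cmd & F_FSIN) != 0;	/* copyin before fo_fcntl */
	bool copies_out = (cmd & F_FSOUT) != 0;	/* copyout after fo_fcntl */
	/* F_FSVOID commands pass the user pointer through unchanged. */
#endif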
int
do_fcntl_lock(int fd, int cmd, struct flock *fl)
{
struct file *fp = NULL;
proc_t *p;
int (*fo_advlock)(struct file *, void *, int, struct flock *, int);
int error, flg;
if ((fp = fd_getfile(fd)) == NULL) {
error = EBADF;
goto out;
}
if ((fo_advlock = fp->f_ops->fo_advlock) == NULL) {
error = EINVAL;
goto out;
}
flg = F_POSIX;
p = curproc;
switch (cmd) {
case F_SETLKW:
flg |= F_WAIT;
/* Fall into F_SETLK */
/* FALLTHROUGH */
case F_SETLK:
switch (fl->l_type) {
case F_RDLCK:
if ((fp->f_flag & FREAD) == 0) {
error = EBADF;
break;
}
if ((p->p_flag & PK_ADVLOCK) == 0) {
mutex_enter(p->p_lock);
p->p_flag |= PK_ADVLOCK;
mutex_exit(p->p_lock);
}
error = (*fo_advlock)(fp, p, F_SETLK, fl, flg);
break;
case F_WRLCK:
if ((fp->f_flag & FWRITE) == 0) {
error = EBADF;
break;
}
if ((p->p_flag & PK_ADVLOCK) == 0) {
mutex_enter(p->p_lock);
p->p_flag |= PK_ADVLOCK;
mutex_exit(p->p_lock);
}
error = (*fo_advlock)(fp, p, F_SETLK, fl, flg);
break;
case F_UNLCK:
error = (*fo_advlock)(fp, p, F_UNLCK, fl, F_POSIX);
break;
default:
error = EINVAL;
break;
}
break;
case F_GETLK:
if (fl->l_type != F_RDLCK &&
fl->l_type != F_WRLCK &&
fl->l_type != F_UNLCK) {
error = EINVAL;
break;
}
error = (*fo_advlock)(fp, p, F_GETLK, fl, F_POSIX);
break;
default:
error = EINVAL;
break;
}
out: if (fp)
fd_putfile(fd);
return error;
}
/*
* The file control system call.
*/
int
sys_fcntl(struct lwp *l, const struct sys_fcntl_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(int) cmd;
syscallarg(void *) arg;
} */
int fd, i, tmp, error, cmd, newmin;
filedesc_t *fdp;
fdtab_t *dt;
file_t *fp;
char *kpath;
struct flock fl;
bool cloexec = false;
fd = SCARG(uap, fd);
cmd = SCARG(uap, cmd);
fdp = l->l_fd;
error = 0;
switch (cmd) {
case F_CLOSEM:
if (fd < 0)
return EBADF;
while ((i = fdp->fd_lastfile) >= fd) {
if (fd_getfile(i) == NULL) {
/* Another thread has updated. */
continue;
}
fd_close(i);
}
return 0;
case F_MAXFD:
*retval = fdp->fd_lastfile;
return 0;
case F_SETLKW:
case F_SETLK:
case F_GETLK:
error = copyin(SCARG(uap, arg), &fl, sizeof(fl));
if (error)
return error;
error = do_fcntl_lock(fd, cmd, &fl);
if (cmd == F_GETLK && error == 0)
error = copyout(&fl, SCARG(uap, arg), sizeof(fl));
return error;
default:
/* Handled below */
break;
}
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if ((cmd & F_FSCTL)) {
error = fcntl_forfs(fd, fp, cmd, SCARG(uap, arg));
fd_putfile(fd);
return error;
}
switch (cmd) {
case F_DUPFD_CLOEXEC:
cloexec = true;
/*FALLTHROUGH*/
case F_DUPFD:
newmin = (long)SCARG(uap, arg);
if ((u_int)newmin >=
l->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
(u_int)newmin >= maxfiles) {
fd_putfile(fd);
return EINVAL;
}
error = fd_dup(fp, newmin, &i, cloexec);
*retval = i;
break;
case F_GETFD:
dt = atomic_load_consume(&fdp->fd_dt);
*retval = dt->dt_ff[fd]->ff_exclose;
break;
case F_SETFD:
fd_set_exclose(l, fd,
((long)SCARG(uap, arg) & FD_CLOEXEC) != 0);
break;
case F_GETNOSIGPIPE:
*retval = (fp->f_flag & FNOSIGPIPE) != 0;
break;
case F_SETNOSIGPIPE:
if (SCARG(uap, arg))
atomic_or_uint(&fp->f_flag, FNOSIGPIPE);
else
atomic_and_uint(&fp->f_flag, ~FNOSIGPIPE);
*retval = 0;
break;
case F_GETFL:
*retval = OFLAGS(fp->f_flag);
break;
case F_SETFL:
/* XXX not guaranteed to be atomic. */
tmp = FFLAGS((long)SCARG(uap, arg)) & FCNTLFLAGS;
error = (*fp->f_ops->fo_fcntl)(fp, F_SETFL, &tmp);
if (error)
break;
i = tmp ^ fp->f_flag;
if (i & FNONBLOCK) {
int flgs = tmp & FNONBLOCK;
error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, &flgs);
if (error) {
(*fp->f_ops->fo_fcntl)(fp, F_SETFL,
&fp->f_flag);
break;
}
}
if (i & FASYNC) {
int flgs = tmp & FASYNC;
error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, &flgs);
if (error) {
if (i & FNONBLOCK) {
tmp = fp->f_flag & FNONBLOCK;
(void)(*fp->f_ops->fo_ioctl)(fp,
FIONBIO, &tmp);
}
(*fp->f_ops->fo_fcntl)(fp, F_SETFL,
&fp->f_flag);
break;
}
}
fp->f_flag = (fp->f_flag & ~FCNTLFLAGS) | tmp;
break;
case F_GETOWN:
error = (*fp->f_ops->fo_ioctl)(fp, FIOGETOWN, &tmp);
*retval = tmp;
break;
case F_SETOWN:
tmp = (int)(uintptr_t) SCARG(uap, arg);
error = (*fp->f_ops->fo_ioctl)(fp, FIOSETOWN, &tmp);
break;
case F_GETPATH:
kpath = PNBUF_GET();
/* vnodes need extra context, so are handled separately */
if (fp->f_type == DTYPE_VNODE)
error = vnode_to_path(kpath, MAXPATHLEN, fp->f_vnode,
l, l->l_proc);
else
error = (*fp->f_ops->fo_fcntl)(fp, F_GETPATH, kpath);
if (error == 0)
error = copyoutstr(kpath, SCARG(uap, arg), MAXPATHLEN,
NULL);
PNBUF_PUT(kpath);
break;
case F_ADD_SEALS:
tmp = (int)(uintptr_t) SCARG(uap, arg);
error = (*fp->f_ops->fo_fcntl)(fp, F_ADD_SEALS, &tmp);
break;
case F_GET_SEALS:
error = (*fp->f_ops->fo_fcntl)(fp, F_GET_SEALS, &tmp);
*retval = tmp;
break;
default:
error = EINVAL;
}
fd_putfile(fd);
return (error);
}
/*
* Close a file descriptor.
*/
int
sys_close(struct lwp *l, const struct sys_close_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
} */
int error;
int fd = SCARG(uap, fd);
if (fd_getfile(fd) == NULL) {
return EBADF;
}
error = fd_close(fd);
if (error == ERESTART) {
#ifdef DIAGNOSTIC
printf("%s[%d]: close(%d) returned ERESTART\n",
l->l_proc->p_comm, (int)l->l_proc->p_pid, fd);
#endif
error = EINTR;
}
return error;
}
/*
* Return status information about a file descriptor.
* Common function for compat code.
*/
int
do_sys_fstat(int fd, struct stat *sb)
{
file_t *fp;
int error;
if ((fp = fd_getfile(fd)) == NULL) {
return EBADF;
}
error = (*fp->f_ops->fo_stat)(fp, sb);
fd_putfile(fd);
return error;
}
/*
* Return status information about a file descriptor.
*/
int
sys___fstat50(struct lwp *l, const struct sys___fstat50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(struct stat *) sb;
} */
struct stat sb;
int error;
error = do_sys_fstat(SCARG(uap, fd), &sb);
if (error == 0) {
error = copyout(&sb, SCARG(uap, sb), sizeof(sb));
}
return error;
}
/*
* Return pathconf information about a file descriptor.
*/
int
sys_fpathconf(struct lwp *l, const struct sys_fpathconf_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(int) name;
} */
int fd, name, error;
file_t *fp;
fd = SCARG(uap, fd);
name = SCARG(uap, name);
error = 0;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if (fp->f_ops->fo_fpathconf == NULL)
error = EOPNOTSUPP;
else
error = (*fp->f_ops->fo_fpathconf)(fp, name, retval);
fd_putfile(fd);
return error;
}
/*
* Apply an advisory lock on a file descriptor.
*
* Just attempt to get a record lock of the requested type on
* the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
*/
/* ARGSUSED */
int
sys_flock(struct lwp *l, const struct sys_flock_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(int) how;
} */
int fd, how, error;
struct file *fp = NULL;
int (*fo_advlock)(struct file *, void *, int, struct flock *, int);
struct flock lf;
fd = SCARG(uap, fd);
how = SCARG(uap, how);
if ((fp = fd_getfile(fd)) == NULL) {
error = EBADF;
goto out;
}
if ((fo_advlock = fp->f_ops->fo_advlock) == NULL) {
KASSERT((atomic_load_relaxed(&fp->f_flag) & FHASLOCK) == 0);
error = EOPNOTSUPP;
goto out;
}
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
switch (how & ~LOCK_NB) {
case LOCK_UN:
lf.l_type = F_UNLCK;
atomic_and_uint(&fp->f_flag, ~FHASLOCK);
error = (*fo_advlock)(fp, fp, F_UNLCK, &lf, F_FLOCK);
goto out;
case LOCK_EX:
lf.l_type = F_WRLCK;
break;
case LOCK_SH:
lf.l_type = F_RDLCK;
break;
default:
error = EINVAL;
goto out;
}
atomic_or_uint(&fp->f_flag, FHASLOCK);
if (how & LOCK_NB) {
error = (*fo_advlock)(fp, fp, F_SETLK, &lf, F_FLOCK);
} else {
error = (*fo_advlock)(fp, fp, F_SETLK, &lf, F_FLOCK|F_WAIT);
}
out: if (fp)
fd_putfile(fd);
return error;
}
int
do_posix_fadvise(int fd, off_t offset, off_t len, int advice)
{
file_t *fp;
int error;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if (fp->f_ops->fo_posix_fadvise == NULL) {
error = EOPNOTSUPP;
} else {
error = (*fp->f_ops->fo_posix_fadvise)(fp, offset, len,
advice);
}
fd_putfile(fd);
return error;
}
int
sys___posix_fadvise50(struct lwp *l,
const struct sys___posix_fadvise50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(int) pad;
syscallarg(off_t) offset;
syscallarg(off_t) len;
syscallarg(int) advice;
} */
*retval = do_posix_fadvise(SCARG(uap, fd), SCARG(uap, offset),
SCARG(uap, len), SCARG(uap, advice));
return 0;
}
int
sys_pipe(struct lwp *l, const void *v, register_t *retval)
{
int fd[2], error;
if ((error = pipe1(l, fd, 0)) != 0)
return error;
retval[0] = fd[0];
retval[1] = fd[1];
return 0;
}
int
sys_pipe2(struct lwp *l, const struct sys_pipe2_args *uap, register_t *retval)
{
/* {
syscallarg(int[2]) fildes;
syscallarg(int) flags;
} */
int fd[2], error;
if ((error = pipe1(l, fd, SCARG(uap, flags))) != 0)
return error;
if ((error = copyout(fd, SCARG(uap, fildes), sizeof(fd))) != 0)
return error;
retval[0] = 0;
return 0;
}
/* $NetBSD: layer_subr.c,v 1.39 2022/04/10 09:50:46 andvar Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_subr.c,v 1.11 1992/05/30 10:05:43 jsp Exp
* @(#)null_subr.c 8.7 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: layer_subr.c,v 1.39 2022/04/10 09:50:46 andvar Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/kmem.h>
#include <miscfs/genfs/layer.h>
#include <miscfs/genfs/layer_extern.h>
#ifdef LAYERFS_DIAGNOSTIC
int layerfs_debug = 1;
#endif
/*
* layer cache:
* Each cache entry holds a reference to the lower vnode
* along with a pointer to the alias vnode. When an
* entry is added the lower vnode is VREF'd. When the
* alias is removed the lower vnode is vrele'd.
*/
void
layerfs_init(void)
{
/* Nothing. */
}
void
layerfs_done(void)
{
/* Nothing. */
}
/*
* layer_node_create: try to find an existing layerfs vnode referring to it,
* otherwise make a new vnode which contains a reference to the lower vnode.
*/
int
layer_node_create(struct mount *mp, struct vnode *lowervp, struct vnode **nvpp)
{
int error;
struct vnode *aliasvp;
error = vcache_get(mp, &lowervp, sizeof(lowervp), &aliasvp);
if (error)
return error;
/*
* Now that we acquired a reference on the upper vnode, release one
* on the lower node. The existence of the layer_node retains one
* reference to the lower node.
*/
vrele(lowervp);
KASSERT(vrefcnt(lowervp) > 0);
#ifdef LAYERFS_DIAGNOSTIC
if (layerfs_debug)
vprint("layer_node_create: alias", aliasvp);
#endif
*nvpp = aliasvp;
return 0;
}
#ifdef LAYERFS_DIAGNOSTIC
struct vnode *
layer_checkvp(struct vnode *vp, const char *fil, int lno)
{
struct layer_node *a = VTOLAYER(vp);
#ifdef notyet
/*
* Can't do this check because vop_reclaim runs
* with a funny vop vector.
*
* WRS - no it doesn't...
*/
if (vp->v_op != layer_vnodeop_p) {
printf ("layer_checkvp: on non-layer-node\n");
#ifdef notyet
while (layer_checkvp_barrier) /*WAIT*/ ;
#endif
panic("layer_checkvp");
};
#endif
if (a->layer_lowervp == NULL) {
/* Should never happen */
int i; u_long *p;
printf("vp = %p, ZERO ptr\n", vp);
for (p = (u_long *) a, i = 0; i < 8; i++)
printf(" %lx", p[i]);
printf("\n");
/* wait for debugger */
panic("layer_checkvp");
}
if (vrefcnt(a->layer_lowervp) < 1) {
int i; u_long *p;
printf("vp = %p, unref'ed lowervp\n", vp);
for (p = (u_long *) a, i = 0; i < 8; i++)
printf(" %lx", p[i]);
printf("\n");
/* wait for debugger */
panic ("layer with unref'ed lowervp");
};
#ifdef notnow
printf("layer %p/%d -> %p/%d [%s, %d]\n",
LAYERTOV(a), vrefcnt(LAYERTOV(a)),
a->layer_lowervp, vrefcnt(a->layer_lowervp),
fil, lno);
#endif
return a->layer_lowervp;
}
#endif
/* $NetBSD: if_ether.h,v 1.91 2024/02/05 21:46:06 andvar Exp $ */
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_ether.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NET_IF_ETHER_H_
#define _NET_IF_ETHER_H_
#ifdef _KERNEL
#ifdef _KERNEL_OPT
#include "opt_mbuftrace.h"
#endif
#include <sys/mbuf.h>
#endif
#ifndef _STANDALONE
#include <net/if.h>
#endif
/*
* Some basic Ethernet constants.
*/
#define ETHER_ADDR_LEN 6 /* length of an Ethernet address */
#define ETHER_TYPE_LEN 2 /* length of the Ethernet type field */
#define ETHER_CRC_LEN 4 /* length of the Ethernet CRC */
#define ETHER_HDR_LEN ((ETHER_ADDR_LEN * 2) + ETHER_TYPE_LEN)
#define ETHER_MIN_LEN 64 /* minimum frame length, including CRC */
#define ETHER_MAX_LEN 1518 /* maximum frame length, including CRC */
#define ETHER_MAX_LEN_JUMBO 9018 /* maximum jumbo frame len, including CRC */
/*
* Some Ethernet extensions.
*/
#define ETHER_VLAN_ENCAP_LEN 4 /* length of 802.1Q VLAN encapsulation */
#define EVL_VLANOFTAG(tag) ((tag) & 4095) /* VLAN ID */
#define EVL_PRIOFTAG(tag) (((tag) >> 13) & 7) /* Priority */
#define EVL_CFIOFTAG(tag) (((tag) >> 12) & 1) /* CFI */
#define ETHER_PPPOE_ENCAP_LEN 8 /* length of PPPoE encapsulation */
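/*
 * Worked example (illustrative, not part of the original header): for a
 * tag control value of 0x6005, EVL_PRIOFTAG() yields priority 3 (bits
 * 15-13), EVL_CFIOFTAG() yields 0 (bit 12), and EVL_VLANOFTAG() yields
 * VLAN ID 5 (bits 11-0).
 */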
/*
* Mbuf adjust factor to force 32-bit alignment of IP header.
* Drivers should do m_adj(m, ETHER_ALIGN) when setting up a
* receive so the upper layers get the IP header properly aligned
* past the 14-byte Ethernet header.
*/
#define ETHER_ALIGN 2 /* driver adjust for IP hdr alignment */
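/*
 * Illustrative receive-setup sketch (assumed driver code, not part of this
 * header). Trimming ETHER_ALIGN bytes from the front of a fresh cluster
 * before programming it into the hardware leaves the 14-byte Ethernet
 * header 2-byte aligned, so the IP header that follows lands on a 32-bit
 * boundary:
 *
 *	m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		MCLGET(m, M_DONTWAIT);
 *		m_adj(m, ETHER_ALIGN);
 *	}
 */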
/*
* Ethernet address - 6 octets
* this is only used by the ethers(3) functions.
*/
struct ether_addr {
uint8_t ether_addr_octet[ETHER_ADDR_LEN];
};
/*
* Structure of a 10Mb/s Ethernet header.
*/
struct ether_header {
uint8_t ether_dhost[ETHER_ADDR_LEN];
uint8_t ether_shost[ETHER_ADDR_LEN];
uint16_t ether_type;
};
#include <net/ethertypes.h>
#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */
#define ETHER_IS_LOCAL(addr) (*(addr) & 0x02) /* is address local? */
#define ETHERMTU_JUMBO (ETHER_MAX_LEN_JUMBO - ETHER_HDR_LEN - ETHER_CRC_LEN)
#define ETHERMTU (ETHER_MAX_LEN - ETHER_HDR_LEN - ETHER_CRC_LEN)
#define ETHERMIN (ETHER_MIN_LEN - ETHER_HDR_LEN - ETHER_CRC_LEN)
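/*
 * Worked values (not in the original header): with the constants above,
 * ETHERMTU = 1518 - 14 - 4 = 1500, ETHERMTU_JUMBO = 9018 - 14 - 4 = 9000,
 * and ETHERMIN = 64 - 14 - 4 = 46 bytes of payload.
 */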
/*
* Compute the maximum frame size based on ethertype (i.e. possible
* encapsulation) and whether or not an FCS is present.
*/
#define ETHER_MAX_FRAME(ifp, etype, hasfcs) \
((ifp)->if_mtu + ETHER_HDR_LEN + \
((hasfcs) ? ETHER_CRC_LEN : 0) + \
(((etype) == ETHERTYPE_VLAN) ? ETHER_VLAN_ENCAP_LEN : 0) + \
(((etype) == ETHERTYPE_PPPOE) ? ETHER_PPPOE_ENCAP_LEN : 0))
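/*
 * Worked example (illustrative, not from the original header): on an
 * interface with if_mtu 1500, ETHER_MAX_FRAME(ifp, ETHERTYPE_IP, 1) is
 * 1500 + 14 + 4 = 1518, while ETHER_MAX_FRAME(ifp, ETHERTYPE_VLAN, 1) is
 * 1518 + 4 = 1522 to leave room for the 802.1Q tag.
 */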
/*
* Ethernet CRC32 polynomials (big- and little-endian versions).
*/
#define ETHER_CRC_POLY_LE 0xedb88320
#define ETHER_CRC_POLY_BE 0x04c11db6
#ifndef _STANDALONE
/*
* Ethernet-specific mbuf flags.
*/
#define M_HASFCS M_LINK0 /* FCS included at end of frame */
#define M_PROMISC M_LINK1 /* this packet is not for us */
#ifdef _KERNEL
/*
* Macro to map an IP multicast address to an Ethernet multicast address.
* The high-order 25 bits of the Ethernet address are statically assigned,
* and the low-order 23 bits are taken from the low end of the IP address.
*/
#define ETHER_MAP_IP_MULTICAST(ipaddr, enaddr) \
/* const struct in_addr *ipaddr; */ \
/* uint8_t enaddr[ETHER_ADDR_LEN]; */ \
do { \
(enaddr)[0] = 0x01; \
(enaddr)[1] = 0x00; \
(enaddr)[2] = 0x5e; \
(enaddr)[3] = ((const uint8_t *)ipaddr)[1] & 0x7f; \
(enaddr)[4] = ((const uint8_t *)ipaddr)[2]; \
(enaddr)[5] = ((const uint8_t *)ipaddr)[3]; \
} while (/*CONSTCOND*/0)
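/*
 * Example mapping (illustrative, not from the original header): the IPv4
 * multicast group 224.0.0.251 (0xe00000fb) maps to the Ethernet address
 * 01:00:5e:00:00:fb, since only the low-order 23 bits of the IP address
 * are copied into the fixed 01:00:5e prefix.
 */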
/*
* Macro to map an IP6 multicast address to an Ethernet multicast address.
* The high-order 16 bits of the Ethernet address are statically assigned,
* and the low-order 32 bits are taken from the low end of the IP6 address.
*/
#define ETHER_MAP_IPV6_MULTICAST(ip6addr, enaddr) \
/* struct in6_addr *ip6addr; */ \
/* uint8_t enaddr[ETHER_ADDR_LEN]; */ \
{ \
(enaddr)[0] = 0x33; \
(enaddr)[1] = 0x33; \
(enaddr)[2] = ((const uint8_t *)ip6addr)[12]; \
(enaddr)[3] = ((const uint8_t *)ip6addr)[13]; \
(enaddr)[4] = ((const uint8_t *)ip6addr)[14]; \
(enaddr)[5] = ((const uint8_t *)ip6addr)[15]; \
}
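/*
 * Example mapping (illustrative, not from the original header): the IPv6
 * all-nodes group ff02::1 maps to 33:33:00:00:00:01, i.e. the last four
 * octets of the IPv6 address appended to the fixed 33:33 prefix.
 */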
#endif
struct mii_data;
struct ethercom;
typedef int (*ether_cb_t)(struct ethercom *);
typedef int (*ether_vlancb_t)(struct ethercom *, uint16_t, bool);
/*
* Structure shared between the ethernet driver modules and
* the multicast list code. For example, each ec_softc or il_softc
* begins with this structure.
*/
struct ethercom {
struct ifnet ec_if; /* network-visible interface */
LIST_HEAD(, ether_multi) ec_multiaddrs; /* list of ether multicast
addrs */
int ec_multicnt; /* length of ec_multiaddrs
list */
int ec_capabilities; /* capabilities, provided by
driver */
int ec_capenable; /* tells hardware which
capabilities to enable */
int ec_nvlans; /* # VLANs on this interface */
SIMPLEQ_HEAD(, vlanid_list) ec_vids; /* list of VLAN IDs */
/* The device handle for the MII bus child device. */
struct mii_data *ec_mii;
struct ifmedia *ec_ifmedia;
/*
* Called after a change to ec_if.if_flags. Returns
* ENETRESET if the device should be reinitialized with
* ec_if.if_init, 0 on success, not 0 on failure.
*/
ether_cb_t ec_ifflags_cb;
/*
* Called whenever a vlan interface is configured or unconfigured.
* Args include the vlan tag and a flag indicating whether the tag is
* being added or removed.
*/
ether_vlancb_t ec_vlan_cb;
/* Hooks called at the beginning of detach of this interface */
khook_list_t *ec_ifdetach_hooks;
kmutex_t *ec_lock;
/* Flags used only by the kernel */
int ec_flags;
#ifdef MBUFTRACE
struct mowner ec_rx_mowner; /* mbufs received */
struct mowner ec_tx_mowner; /* mbufs transmitted */
#endif
};
#define ETHERCAP_VLAN_MTU 0x00000001 /* VLAN-compatible MTU */
#define ETHERCAP_VLAN_HWTAGGING 0x00000002 /* hardware VLAN tag support */
#define ETHERCAP_JUMBO_MTU 0x00000004 /* 9000 byte MTU supported */
#define ETHERCAP_VLAN_HWFILTER 0x00000008 /* iface hw can filter vlan tag */
#define ETHERCAP_EEE 0x00000010 /* Energy Efficiency Ethernet */
#define ETHERCAP_MASK 0x0000001f
#define ECCAPBITS \
"\020" \
"\1VLAN_MTU" \
"\2VLAN_HWTAGGING" \
"\3JUMBO_MTU" \
"\4VLAN_HWFILTER" \
"\5EEE"
/* ioctl() for Ethernet capabilities */
struct eccapreq {
char eccr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
int eccr_capabilities; /* supported capabilities */
int eccr_capenable; /* capabilities enabled */
};
/* sysctl for Ethernet multicast addresses */
struct ether_multi_sysctl {
u_int enm_refcount;
uint8_t enm_addrlo[ETHER_ADDR_LEN];
uint8_t enm_addrhi[ETHER_ADDR_LEN];
};
#ifdef _KERNEL
/*
* Flags for ec_flags
*/
/* Store IFF_ALLMULTI in ec_flags instead of if_flags to avoid data races. */
#define ETHER_F_ALLMULTI __BIT(0)
extern const uint8_t etherbroadcastaddr[ETHER_ADDR_LEN];
extern const uint8_t ethermulticastaddr_slowprotocols[ETHER_ADDR_LEN];
extern const uint8_t ether_ipmulticast_min[ETHER_ADDR_LEN];
extern const uint8_t ether_ipmulticast_max[ETHER_ADDR_LEN];
void ether_set_ifflags_cb(struct ethercom *, ether_cb_t);
void ether_set_vlan_cb(struct ethercom *, ether_vlancb_t);
int ether_ioctl(struct ifnet *, u_long, void *);
int ether_addmulti(const struct sockaddr *, struct ethercom *);
int ether_delmulti(const struct sockaddr *, struct ethercom *);
int ether_multiaddr(const struct sockaddr *, uint8_t[ETHER_ADDR_LEN],
uint8_t[ETHER_ADDR_LEN]);
void ether_input(struct ifnet *, struct mbuf *);
/*
* Ethernet multicast address structure. There is one of these for each
* multicast address or range of multicast addresses that we are supposed
* to listen to on a particular interface. They are kept in a linked list,
* rooted in the interface's ethercom structure.
*/
struct ether_multi {
uint8_t enm_addrlo[ETHER_ADDR_LEN]; /* low or only address of range */
uint8_t enm_addrhi[ETHER_ADDR_LEN]; /* high or only address of range */
u_int enm_refcount; /* no. claims to this addr/range */
LIST_ENTRY(ether_multi) enm_list;
};
/*
* Structure used by macros below to remember position when stepping through
* all of the ether_multi records.
*/
struct ether_multistep {
struct ether_multi *e_enm;
};
/*
* lookup the ether_multi record for a given range of Ethernet
* multicast addresses connected to a given ethercom structure.
* If no matching record is found, NULL is returned.
*/
static __inline struct ether_multi *
ether_lookup_multi(const uint8_t *addrlo, const uint8_t *addrhi,
const struct ethercom *ec)
{
struct ether_multi *enm;
LIST_FOREACH(enm, &ec->ec_multiaddrs, enm_list) {
if (memcmp(enm->enm_addrlo, addrlo, ETHER_ADDR_LEN) != 0)
continue;
if (memcmp(enm->enm_addrhi, addrhi, ETHER_ADDR_LEN) != 0)
continue;
break;
}
return enm;
}
/*
* step through all of the ether_multi records, one at a time.
* The current position is remembered in "step", which the caller must
* provide. ether_first_multi(), below, must be called to initialize "step"
* and get the first record. Both functions return a NULL when there
* are no remaining records.
*/
static __inline struct ether_multi *
ether_next_multi(struct ether_multistep *step)
{
struct ether_multi *enm;
enm = step->e_enm;
if (enm != NULL)
step->e_enm = LIST_NEXT(enm, enm_list);
return enm;
}
#define ETHER_NEXT_MULTI(step, enm) \
/* struct ether_multistep step; */ \
/* struct ether_multi *enm; */ \
(enm) = ether_next_multi(&(step))
static __inline struct ether_multi *
ether_first_multi(struct ether_multistep *step, const struct ethercom *ec)
{
step->e_enm = LIST_FIRST(&ec->ec_multiaddrs);
return ether_next_multi(step);
}
#define ETHER_FIRST_MULTI(step, ec, enm) \
/* struct ether_multistep step; */ \
/* struct ethercom *ec; */ \
/* struct ether_multi *enm; */ \
(enm) = ether_first_multi(&(step), (ec))
#define ETHER_LOCK(ec) mutex_enter((ec)->ec_lock)
#define ETHER_UNLOCK(ec) mutex_exit((ec)->ec_lock)
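/*
 * Illustrative walk of the multicast list (assumed driver code, not part
 * of this header); the list is protected by ec_lock:
 *
 *	struct ether_multi *enm;
 *	struct ether_multistep step;
 *
 *	ETHER_LOCK(ec);
 *	ETHER_FIRST_MULTI(step, ec, enm);
 *	while (enm != NULL) {
 *		program enm->enm_addrlo .. enm->enm_addrhi into the
 *		hardware filter here
 *		ETHER_NEXT_MULTI(step, enm);
 *	}
 *	ETHER_UNLOCK(ec);
 */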
/*
* Ethernet 802.1Q VLAN structures.
*/
/* for ethercom */
struct vlanid_list {
uint16_t vid;
SIMPLEQ_ENTRY(vlanid_list) vid_list;
};
/* add VLAN tag to input/received packet */
static __inline void
vlan_set_tag(struct mbuf *m, uint16_t vlantag)
{
/* VLAN tag contains priority, CFI and VLAN ID */
KASSERT((m->m_flags & M_PKTHDR) != 0);
m->m_pkthdr.ether_vtag = vlantag;
m->m_flags |= M_VLANTAG;
return;
}
/* extract VLAN ID value from a VLAN tag */
static __inline uint16_t
vlan_get_tag(struct mbuf *m)
{
KASSERT((m->m_flags & M_PKTHDR) != 0);
KASSERT(m->m_flags & M_VLANTAG);
return m->m_pkthdr.ether_vtag;
}
static __inline bool
vlan_has_tag(struct mbuf *m)
{
return (m->m_flags & M_VLANTAG) != 0;
}
static __inline bool
vlan_is_hwtag_enabled(struct ifnet *_ifp)
{
struct ethercom *ec = (void *)_ifp;
if (ec->ec_capenable & ETHERCAP_VLAN_HWTAGGING)
return true;
return false;
}
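/*
 * Illustrative receive-path sketch (assumed driver code, not part of this
 * header): a driver whose hardware strips the 802.1Q tag records it in the
 * packet header before handing the mbuf to ether_input():
 *
 *	if (rx_descriptor_carried_a_tag)
 *		vlan_set_tag(m, hardware_supplied_tag);
 *
 * vlan_has_tag()/vlan_get_tag() can then be used on the input path.
 */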
/* test if any VLAN is configured for this interface */
#define VLAN_ATTACHED(ec) ((ec)->ec_nvlans > 0)
void etherinit(void);
void ether_ifattach(struct ifnet *, const uint8_t *);
void ether_ifdetach(struct ifnet *);
int ether_mediachange(struct ifnet *);
void ether_mediastatus(struct ifnet *, struct ifmediareq *);
void * ether_ifdetachhook_establish(struct ifnet *,
void (*)(void *), void *arg);
void ether_ifdetachhook_disestablish(struct ifnet *,
void *, kmutex_t *);
char *ether_sprintf(const uint8_t *);
char *ether_snprintf(char *, size_t, const uint8_t *);
uint32_t ether_crc32_le(const uint8_t *, size_t);
uint32_t ether_crc32_be(const uint8_t *, size_t);
int ether_aton_r(u_char *, size_t, const char *);
int ether_enable_vlan_mtu(struct ifnet *);
int ether_disable_vlan_mtu(struct ifnet *);
int ether_add_vlantag(struct ifnet *, uint16_t, bool *);
int ether_del_vlantag(struct ifnet *, uint16_t);
int ether_inject_vlantag(struct mbuf **, uint16_t, uint16_t);
struct mbuf *
ether_strip_vlantag(struct mbuf *);
#else
/*
* Prototype ethers(3) functions.
*/
#include <sys/cdefs.h>
__BEGIN_DECLS
char * ether_ntoa(const struct ether_addr *);
struct ether_addr *
ether_aton(const char *);
int ether_ntohost(char *, const struct ether_addr *);
int ether_hostton(const char *, struct ether_addr *);
int ether_line(const char *, struct ether_addr *, char *);
__END_DECLS
#endif
#endif /* _STANDALONE */
#endif /* !_NET_IF_ETHER_H_ */
/* $NetBSD: uipc_socket.c,v 1.309 2024/02/11 13:01:29 jdolecek Exp $ */
/*
* Copyright (c) 2002, 2007, 2008, 2009, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2004 The FreeBSD Foundation
* Copyright (c) 2004 Robert Watson
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95
*/
/*
* Socket operation routines.
*
* These routines are called by the routines in sys_socket.c or from a
* system process, and implement the semantics of socket operations by
* switching out to the protocol specific routines.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.309 2024/02/11 13:01:29 jdolecek Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_sock_counters.h"
#include "opt_sosend_loan.h"
#include "opt_mbuftrace.h"
#include "opt_somaxkva.h"
#include "opt_multiprocessor.h" /* XXX */
#include "opt_sctp.h"
#include "opt_pipe.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/uidinfo.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/kthread.h>
#include <sys/compat_stub.h>
#include <compat/sys/time.h>
#include <compat/sys/socket.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_loan.h>
#include <uvm/uvm_page.h>
#ifdef SCTP
#include <netinet/sctp_route.h>
#endif
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
extern const struct fileops socketops;
static int sooptions;
extern int somaxconn; /* patchable (XXX sysctl) */
int somaxconn = SOMAXCONN;
kmutex_t *softnet_lock;
#ifdef SOSEND_COUNTERS
#include <sys/device.h>
static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "sosend", "loan big");
static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "sosend", "copy big");
static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "sosend", "copy small");
static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "sosend", "kva limit");
#define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++
EVCNT_ATTACH_STATIC(sosend_loan_big);
EVCNT_ATTACH_STATIC(sosend_copy_big);
EVCNT_ATTACH_STATIC(sosend_copy_small);
EVCNT_ATTACH_STATIC(sosend_kvalimit);
#else
#define SOSEND_COUNTER_INCR(ev) /* nothing */
#endif /* SOSEND_COUNTERS */
#if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR)
int sock_loan_thresh = -1;
#else
int sock_loan_thresh = 4096;
#endif
static kmutex_t so_pendfree_lock;
static struct mbuf *so_pendfree = NULL;
#ifndef SOMAXKVA
#define SOMAXKVA (16 * 1024 * 1024)
#endif
int somaxkva = SOMAXKVA;
static int socurkva;
static kcondvar_t socurkva_cv;
#ifndef SOFIXEDBUF
#define SOFIXEDBUF true
#endif
bool sofixedbuf = SOFIXEDBUF;
static kauth_listener_t socket_listener;
#define SOCK_LOAN_CHUNK 65536
static void sopendfree_thread(void *);
static kcondvar_t pendfree_thread_cv;
static lwp_t *sopendfree_lwp;
static void sysctl_kern_socket_setup(void);
static struct sysctllog *socket_sysctllog;
static vsize_t
sokvareserve(struct socket *so, vsize_t len)
{
int error;
mutex_enter(&so_pendfree_lock);
while (socurkva + len > somaxkva) {
SOSEND_COUNTER_INCR(&sosend_kvalimit);
error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock);
if (error) {
len = 0;
break;
}
}
socurkva += len;
mutex_exit(&so_pendfree_lock);
return len;
}
static void
sokvaunreserve(vsize_t len)
{
mutex_enter(&so_pendfree_lock);
socurkva -= len;
cv_broadcast(&socurkva_cv);
mutex_exit(&so_pendfree_lock);
}
/*
* sokvaalloc: allocate kva for loan.
*/
vaddr_t
sokvaalloc(vaddr_t sva, vsize_t len, struct socket *so)
{
vaddr_t lva;
if (sokvareserve(so, len) == 0)
return 0;
lva = uvm_km_alloc(kernel_map, len, atop(sva) & uvmexp.colormask,
UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA);
if (lva == 0) {
sokvaunreserve(len);
return 0;
}
return lva;
}
/*
* sokvafree: free kva for loan.
*/
void
sokvafree(vaddr_t sva, vsize_t len)
{
uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY);
sokvaunreserve(len);
}
static void
sodoloanfree(struct vm_page **pgs, void *buf, size_t size)
{
vaddr_t sva, eva;
vsize_t len;
int npgs;
KASSERT(pgs != NULL);
eva = round_page((vaddr_t) buf + size);
sva = trunc_page((vaddr_t) buf);
len = eva - sva;
npgs = len >> PAGE_SHIFT;
pmap_kremove(sva, len);
pmap_update(pmap_kernel());
uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
sokvafree(sva, len);
}
/*
* sopendfree_thread: free mbufs on "pendfree" list. Unlock and relock
* so_pendfree_lock when freeing mbufs.
*/
static void
sopendfree_thread(void *v)
{
struct mbuf *m, *next;
size_t rv;
mutex_enter(&so_pendfree_lock);
for (;;) {
rv = 0;
while (so_pendfree != NULL) {
m = so_pendfree;
so_pendfree = NULL;
mutex_exit(&so_pendfree_lock);
for (; m != NULL; m = next) {
next = m->m_next;
KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) ==
0);
KASSERT(m->m_ext.ext_refcnt == 0);
rv += m->m_ext.ext_size;
sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf,
m->m_ext.ext_size);
pool_cache_put(mb_cache, m);
}
mutex_enter(&so_pendfree_lock);
}
if (rv)
cv_broadcast(&socurkva_cv);
cv_wait(&pendfree_thread_cv, &so_pendfree_lock);
}
panic("sopendfree_thread");
/* NOTREACHED */
}
void
soloanfree(struct mbuf *m, void *buf, size_t size, void *arg)
{
KASSERT(m != NULL);
/*
* postpone freeing mbuf.
*
* we can't do it in interrupt context
* because we need to put kva back to kernel_map.
*/
mutex_enter(&so_pendfree_lock);
m->m_next = so_pendfree;
so_pendfree = m;
cv_signal(&pendfree_thread_cv);
mutex_exit(&so_pendfree_lock);
}
static long
sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
{
struct iovec *iov = uio->uio_iov;
vaddr_t sva, eva;
vsize_t len;
vaddr_t lva;
int npgs, error;
vaddr_t va;
int i;
if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
return 0;
if (iov->iov_len < (size_t) space)
space = iov->iov_len;
if (space > SOCK_LOAN_CHUNK)
space = SOCK_LOAN_CHUNK;
eva = round_page((vaddr_t) iov->iov_base + space);
sva = trunc_page((vaddr_t) iov->iov_base);
len = eva - sva;
npgs = len >> PAGE_SHIFT;
KASSERT(npgs <= M_EXT_MAXPAGES);
lva = sokvaalloc(sva, len, so);
if (lva == 0)
return 0;
error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len,
m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
if (error) {
sokvafree(lva, len);
return 0;
}
for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
VM_PROT_READ, 0);
pmap_update(pmap_kernel());
lva += (vaddr_t) iov->iov_base & PAGE_MASK;
MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so);
m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
uio->uio_resid -= space;
/* uio_offset not updated, not set/used for write(2) */
uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space;
uio->uio_iov->iov_len -= space;
if (uio->uio_iov->iov_len == 0) {
uio->uio_iov++;
uio->uio_iovcnt--;
}
return space;
}
static int
socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
enum kauth_network_req req;
result = KAUTH_RESULT_DEFER;
req = (enum kauth_network_req)(uintptr_t)arg0;
if ((action != KAUTH_NETWORK_SOCKET) &&
(action != KAUTH_NETWORK_BIND))
return result;
switch (req) {
case KAUTH_REQ_NETWORK_BIND_PORT:
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_NETWORK_SOCKET_DROP: {
/* Normal users can only drop their own connections. */
struct socket *so = (struct socket *)arg1;
if (so->so_cred && proc_uidmatch(cred, so->so_cred) == 0)
result = KAUTH_RESULT_ALLOW;
break;
}
case KAUTH_REQ_NETWORK_SOCKET_OPEN:
/* We allow "raw" routing/bluetooth sockets to anyone. */
switch ((u_long)arg1) {
case PF_ROUTE:
case PF_OROUTE:
case PF_BLUETOOTH:
case PF_CAN:
result = KAUTH_RESULT_ALLOW;
break;
default:
/* Privileged, let secmodel handle this. */
if ((u_long)arg2 == SOCK_RAW)
break;
result = KAUTH_RESULT_ALLOW;
break;
}
break;
case KAUTH_REQ_NETWORK_SOCKET_CANSEE:
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return result;
}
void
soinit(void)
{
sysctl_kern_socket_setup();
#ifdef SCTP
/* Update the SCTP function hooks if necessary */
vec_sctp_add_ip_address = sctp_add_ip_address;
vec_sctp_delete_ip_address = sctp_delete_ip_address;
#endif
mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM);
softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
cv_init(&socurkva_cv, "sokva");
cv_init(&pendfree_thread_cv, "sopendfr");
soinit2();
/* Set the initial adjusted socket buffer size. */
if (sb_max_set(sb_max))
panic("bad initial sb_max value: %lu", sb_max);
socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
socket_listener_cb, NULL);
}
void
soinit1(void)
{
int error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
sopendfree_thread, NULL, &sopendfree_lwp, "sopendfree");
if (error)
panic("soinit1 %d", error);
}
/*
* socreate: create a new socket of the specified type and the protocol.
*
* => Caller may specify another socket for lock sharing (must not be held).
* => Returns the new socket without lock held.
*/
int
socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l,
struct socket *lockso)
{
const struct protosw *prp;
struct socket *so;
uid_t uid;
int error;
kmutex_t *lock;
error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type),
KAUTH_ARG(proto));
if (error != 0)
return error;
if (proto)
prp = pffindproto(dom, proto, type);
else
prp = pffindtype(dom, type);
if (prp == NULL) {
/* no support for domain */
if (pffinddomain(dom) == 0)
return EAFNOSUPPORT;
/* no support for socket type */
if (proto == 0 && type != 0)
return EPROTOTYPE;
return EPROTONOSUPPORT;
}
if (prp->pr_usrreqs == NULL)
return EPROTONOSUPPORT;
if (prp->pr_type != type)
return EPROTOTYPE;
so = soget(true);
so->so_type = type;
so->so_proto = prp;
so->so_send = sosend;
so->so_receive = soreceive;
so->so_options = sooptions;
#ifdef MBUFTRACE
so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
so->so_mowner = &prp->pr_domain->dom_mowner;
#endif
uid = kauth_cred_geteuid(l->l_cred);
so->so_uidinfo = uid_find(uid);
so->so_egid = kauth_cred_getegid(l->l_cred);
so->so_cpid = l->l_proc->p_pid;
/*
* Lock assigned and taken during PCB attach, unless we share
* the lock with another socket, e.g. socketpair(2) case.
*/
if (lockso) {
/*
* lockso->so_lock should be stable at this point, so
* no need for atomic_load_*.
*/
lock = lockso->so_lock;
so->so_lock = lock;
mutex_obj_hold(lock);
mutex_enter(lock);
}
/* Attach the PCB (returns with the socket lock held). */
error = (*prp->pr_usrreqs->pr_attach)(so, proto);
KASSERT(solocked(so));
if (error) {
KASSERT(so->so_pcb == NULL);
so->so_state |= SS_NOFDREF;
sofree(so);
return error;
}
so->so_cred = kauth_cred_hold(l->l_cred);
sounlock(so);
*aso = so;
return 0;
}
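/*
 * Illustrative in-kernel use (a sketch under assumptions, not part of this
 * file): a kernel component creating a UDP socket with no lock sharing
 * might do
 *
 *	struct socket *so;
 *	error = socreate(AF_INET, &so, SOCK_DGRAM, 0, curlwp, NULL);
 *
 * On success the socket is returned unlocked; soclose() releases it.
 */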
/*
* fsocreate: create a socket and a file descriptor associated with it.
* Returns the allocated file structure in *fpp, but the descriptor
* is not visible yet for the process.
* Caller is responsible for calling fd_affix() for the returned *fpp once
* its socket initialization is finished successfully, or fd_abort() if its
* initialization fails.
*
* => On success, write file descriptor to *fdout and *fpp and return zero.
* => On failure, return non-zero; *fdout and *fpp will be undefined.
*/
int
fsocreate(int domain, struct socket **sop, int type, int proto, int *fdout,
file_t **fpp, struct socket *lockso)
{
lwp_t *l = curlwp;
int error, fd, flags;
struct socket *so;
file_t *fp;
flags = type & SOCK_FLAGS_MASK;
type &= ~SOCK_FLAGS_MASK;
error = socreate(domain, &so, type, proto, l, lockso);
if (error) {
return error;
}
if ((error = fd_allocfile(&fp, &fd)) != 0) {
soclose(so);
return error;
}
fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0);
fp->f_flag = FREAD|FWRITE|((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0)|
((flags & SOCK_NOSIGPIPE) ? FNOSIGPIPE : 0);
fp->f_type = DTYPE_SOCKET;
fp->f_ops = &socketops;
if (flags & SOCK_NONBLOCK) {
so->so_state |= SS_NBIO;
}
fp->f_socket = so;
if (sop != NULL) {
*sop = so;
}
*fdout = fd;
*fpp = fp;
return error;
}
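/*
 * Illustrative caller pattern (a sketch, not part of this file): per the
 * comment above, the descriptor only becomes visible once the caller
 * commits it with fd_affix(), or is released with fd_abort() on failure:
 *
 *	error = fsocreate(domain, NULL, type, proto, &fd, &fp, NULL);
 *	if (error == 0) {
 *		... any further setup on fp and its socket ...
 *		fd_affix(curproc, fp, fd);
 *	}
 */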
int
sofamily(const struct socket *so)
{
const struct protosw *pr;
const struct domain *dom;
if ((pr = so->so_proto) == NULL)
return AF_UNSPEC;
if ((dom = pr->pr_domain) == NULL)
return AF_UNSPEC;
return dom->dom_family;
}
int
sobind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
int error;
solock(so);
if (nam->sa_family != so->so_proto->pr_domain->dom_family) {
sounlock(so);
return EAFNOSUPPORT;
}
error = (*so->so_proto->pr_usrreqs->pr_bind)(so, nam, l);
sounlock(so);
return error;
}
int
solisten(struct socket *so, int backlog, struct lwp *l)
{
int error;
short oldopt, oldqlimit;
solock(so);
if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
SS_ISDISCONNECTING)) != 0) {
sounlock(so);
return EINVAL;
}
oldopt = so->so_options;
oldqlimit = so->so_qlimit;
if (TAILQ_EMPTY(&so->so_q)) so->so_options |= SO_ACCEPTCONN;
if (backlog < 0)
backlog = 0;
so->so_qlimit = uimin(backlog, somaxconn);
error = (*so->so_proto->pr_usrreqs->pr_listen)(so, l);
if (error != 0) {
so->so_options = oldopt;
so->so_qlimit = oldqlimit;
sounlock(so);
return error;
}
sounlock(so);
return 0;
}
void
sofree(struct socket *so)
{
u_int refs;
KASSERT(solocked(so));
if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
sounlock(so);
return;
}
if (so->so_head) {
/*
* We must not decommission a socket that's on the accept(2)
* queue. If we do, then accept(2) may hang after select(2)
* indicated that the listening socket was ready.
*/
if (!soqremque(so, 0)) {
sounlock(so);
return;
}
}
if (so->so_rcv.sb_hiwat)
(void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0,
RLIM_INFINITY);
if (so->so_snd.sb_hiwat)
(void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0,
RLIM_INFINITY);
sbrelease(&so->so_snd, so);
KASSERT(!cv_has_waiters(&so->so_cv));
KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
sorflush(so);
refs = so->so_aborting; /* XXX */
/* Remove accept filter if one is present. */
if (so->so_accf != NULL) (void)accept_filt_clear(so);
sounlock(so);
if (refs == 0) /* XXX */ soput(so);
}
/*
* soclose: close a socket on last file table reference removal.
* Initiate disconnect if connected. Free socket when disconnect complete.
*/
int
soclose(struct socket *so)
{
struct socket *so2;
int error = 0;
solock(so);
if (so->so_options & SO_ACCEPTCONN) {
for (;;) {
if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
KASSERT(solocked2(so, so2));
(void) soqremque(so2, 0);
/* soabort drops the lock. */
(void) soabort(so2);
solock(so);
continue;
}
if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
KASSERT(solocked2(so, so2));
(void) soqremque(so2, 1);
/* soabort drops the lock. */
(void) soabort(so2);
solock(so);
continue;
}
break;
}
}
if (so->so_pcb == NULL)
goto discard;
if (so->so_state & SS_ISCONNECTED) {
if ((so->so_state & SS_ISDISCONNECTING) == 0) {
error = sodisconnect(so);
if (error)
goto drop;
}
if (so->so_options & SO_LINGER) {
if ((so->so_state & (SS_ISDISCONNECTING|SS_NBIO)) ==
(SS_ISDISCONNECTING|SS_NBIO))
goto drop;
while (so->so_state & SS_ISCONNECTED) {
error = sowait(so, true, so->so_linger * hz);
if (error)
break;
}
}
}
drop:
if (so->so_pcb) {
KASSERT(solocked(so));
(*so->so_proto->pr_usrreqs->pr_detach)(so);
}
discard:
KASSERT((so->so_state & SS_NOFDREF) == 0);
kauth_cred_free(so->so_cred);
so->so_cred = NULL;
so->so_state |= SS_NOFDREF;
sofree(so);
return error;
}
/*
* Must be called with the socket locked. Will return with it unlocked.
*/
int
soabort(struct socket *so)
{
u_int refs;
int error;
KASSERT(solocked(so));
KASSERT(so->so_head == NULL);
so->so_aborting++; /* XXX */
error = (*so->so_proto->pr_usrreqs->pr_abort)(so);
refs = --so->so_aborting; /* XXX */
if (error || (refs == 0)) {
sofree(so);
} else {
sounlock(so);
}
return error;
}
int
soaccept(struct socket *so, struct sockaddr *nam)
{
int error;
KASSERT(solocked(so));
KASSERT((so->so_state & SS_NOFDREF) != 0);
so->so_state &= ~SS_NOFDREF;
if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
(so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
error = (*so->so_proto->pr_usrreqs->pr_accept)(so, nam);
else
error = ECONNABORTED;
return error;
}
int
soconnect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
int error;
KASSERT(solocked(so));
if (so->so_options & SO_ACCEPTCONN)
return EOPNOTSUPP;
/*
* If protocol is connection-based, can only connect once.
* Otherwise, if connected, try to disconnect first.
* This allows user to disconnect by connecting to, e.g.,
* a null address.
*/
if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
(error = sodisconnect(so)))) {
error = EISCONN;
} else {
if (nam->sa_family != so->so_proto->pr_domain->dom_family) {
return EAFNOSUPPORT;
}
error = (*so->so_proto->pr_usrreqs->pr_connect)(so, nam, l);
}
return error;
}
int
soconnect2(struct socket *so1, struct socket *so2)
{
KASSERT(solocked2(so1, so2));
return (*so1->so_proto->pr_usrreqs->pr_connect2)(so1, so2);
}
int
sodisconnect(struct socket *so)
{
int error;
KASSERT(solocked(so));
if ((so->so_state & SS_ISCONNECTED) == 0) {
error = ENOTCONN;
} else if (so->so_state & SS_ISDISCONNECTING) {
error = EALREADY;
} else {
error = (*so->so_proto->pr_usrreqs->pr_disconnect)(so);
}
return error;
}
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
* Send on a socket.
* If send must go all at once and message is larger than
* send buffering, then hard error.
* Lock against other senders.
* If must go all at once and not enough room now, then
* inform user that this would block and do nothing.
* Otherwise, if nonblocking, send as much as possible.
* The data to be sent is described by "uio" if nonzero,
* otherwise by the mbuf chain "top" (which must be null
* if uio is not). Data provided in mbuf chain must be small
* enough to send all at once.
*
* Returns nonzero on error, timeout or signal; callers
* must check for short counts if EINTR/ERESTART are returned.
* Data and control buffers are freed on return.
*/
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
struct mbuf *top, struct mbuf *control, int flags, struct lwp *l)
{
struct mbuf **mp, *m;
long space, len, resid, clen, mlen;
int error, s, dontroute, atomic;
short wakeup_state = 0;
clen = 0;
/*
* solock() provides atomicity of access. splsoftnet() prevents
* protocol processing soft interrupts from interrupting us and
* blocking (expensive).
*/
s = splsoftnet();
solock(so);
atomic = sosendallatonce(so) || top;
if (uio)
resid = uio->uio_resid;
else
resid = top->m_pkthdr.len;
/*
* In theory resid should be unsigned.
* However, space must be signed, as it might be less than 0
* if we over-committed, and we must use a signed comparison
* of space and resid. On the other hand, a negative resid
* causes us to loop sending 0-length segments to the protocol.
*/
if (resid < 0) {
error = EINVAL;
goto out;
}
dontroute =
(flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
(so->so_proto->pr_flags & PR_ATOMIC);
l->l_ru.ru_msgsnd++;
if (control) clen = control->m_len;
restart:
if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
goto out;
do {
if (so->so_state & SS_CANTSENDMORE) {
error = EPIPE;
goto release;
}
if (so->so_error) {
error = so->so_error;
if ((flags & MSG_PEEK) == 0) so->so_error = 0;
goto release;
}
if ((so->so_state & SS_ISCONNECTED) == 0) {
if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
if (resid || clen == 0) {
error = ENOTCONN;
goto release;
}
} else if (addr == NULL) {
error = EDESTADDRREQ;
goto release;
}
}
space = sbspace(&so->so_snd);
if (flags & MSG_OOB)
space += 1024;
if ((atomic && resid > so->so_snd.sb_hiwat) ||
clen > so->so_snd.sb_hiwat) {
error = EMSGSIZE;
goto release;
}
if (space < resid + clen &&
(atomic || space < so->so_snd.sb_lowat || space < clen)) {
if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
error = EWOULDBLOCK;
goto release;
}
sbunlock(&so->so_snd);
if (wakeup_state & SS_RESTARTSYS) {
error = ERESTART;
goto out;
}
error = sbwait(&so->so_snd);
if (error)
goto out;
wakeup_state = so->so_state;
goto restart;
}
wakeup_state = 0;
mp = &top;
space -= clen;
do {
if (uio == NULL) {
/*
* Data is prepackaged in "top".
*/
resid = 0;
if (flags & MSG_EOR)
top->m_flags |= M_EOR;
} else do {
sounlock(so);
splx(s);
if (top == NULL) {
m = m_gethdr(M_WAIT, MT_DATA);
mlen = MHLEN;
m->m_pkthdr.len = 0;
m_reset_rcvif(m);
} else {
m = m_get(M_WAIT, MT_DATA);
mlen = MLEN;
}
MCLAIM(m, so->so_snd.sb_mowner);
if (sock_loan_thresh >= 0 && uio->uio_iov->iov_len >= sock_loan_thresh &&
space >= sock_loan_thresh &&
(len = sosend_loan(so, uio, m,
space)) != 0) {
SOSEND_COUNTER_INCR(&sosend_loan_big);
space -= len;
goto have_data;
}
if (resid >= MINCLSIZE && space >= MCLBYTES) {
SOSEND_COUNTER_INCR(&sosend_copy_big);
m_clget(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0)
goto nopages;
mlen = MCLBYTES;
if (atomic && top == 0) {
len = lmin(MCLBYTES - max_hdr,
resid);
m->m_data += max_hdr;
} else
len = lmin(MCLBYTES, resid);
space -= len;
} else {
nopages:
SOSEND_COUNTER_INCR(&sosend_copy_small);
len = lmin(lmin(mlen, resid), space);
space -= len;
/*
* For datagram protocols, leave room
* for protocol headers in first mbuf.
*/
if (atomic && top == 0 && len < mlen) m_align(m, len);
}
error = uiomove(mtod(m, void *), (int)len, uio);
have_data:
resid = uio->uio_resid;
m->m_len = len;
*mp = m;
top->m_pkthdr.len += len;
s = splsoftnet();
solock(so);
if (error != 0)
goto release;
mp = &m->m_next;
if (resid <= 0) {
if (flags & MSG_EOR)
top->m_flags |= M_EOR;
break;
}
} while (space > 0 && atomic);
if (so->so_state & SS_CANTSENDMORE) {
error = EPIPE;
goto release;
}
if (dontroute)
so->so_options |= SO_DONTROUTE;
if (resid > 0)
so->so_state |= SS_MORETOCOME;
if (flags & MSG_OOB) {
error = (*so->so_proto->pr_usrreqs->pr_sendoob)(
so, top, control);
} else {
error = (*so->so_proto->pr_usrreqs->pr_send)(so,
top, addr, control, l);
}
if (dontroute)
so->so_options &= ~SO_DONTROUTE;
if (resid > 0)
so->so_state &= ~SS_MORETOCOME;
clen = 0;
control = NULL;
top = NULL;
mp = &top;
if (error != 0)
goto release;
} while (resid && space > 0);
} while (resid);
release:
sbunlock(&so->so_snd);
out:
sounlock(so);
splx(s);
if (top)
m_freem(top);
if (control)
m_freem(control);
return error;
}
/*
* Following replacement or removal of the first mbuf on the first
* mbuf chain of a socket buffer, push necessary state changes back
* into the socket buffer so that other consumers see the values
* consistently. 'nextrecord' is the caller's locally stored value of
* the original value of sb->sb_mb->m_nextpkt which must be restored
* when the lead mbuf changes. NOTE: 'nextrecord' may be NULL.
*/
static void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{
KASSERT(solocked(sb->sb_so));
/*
* First, update for the new value of nextrecord. If necessary,
* make it the first record.
*/
if (sb->sb_mb != NULL)
sb->sb_mb->m_nextpkt = nextrecord;
else
sb->sb_mb = nextrecord;
/*
* Now update any dependent socket buffer fields to reflect
* the new state. This is an inline of SB_EMPTY_FIXUP, with
* the addition of a second clause that takes care of the
* case where sb_mb has been updated, but remains the last
* record.
*/
if (sb->sb_mb == NULL) {
sb->sb_mbtail = NULL;
sb->sb_lastrecord = NULL;
} else if (sb->sb_mb->m_nextpkt == NULL)
sb->sb_lastrecord = sb->sb_mb;
}
/*
* Implement receive operations on a socket.
*
* We depend on the way that records are added to the sockbuf by sbappend*. In
* particular, each record (mbufs linked through m_next) must begin with an
* address if the protocol so specifies, followed by an optional mbuf or mbufs
* containing ancillary data, and then zero or more mbufs of data.
*
* In order to avoid blocking network interrupts for the entire time here, we
* splx() while doing the actual copy to user space. Although the sockbuf is
* locked, new data may still be appended, and thus we must maintain
* consistency of the sockbuf during that time.
*
* The caller may receive the data as a single mbuf chain by supplying an mbuf
* **mp0 for use in returning the chain. The uio is then used only for the
* count in uio_resid.
*/
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
struct lwp *l = curlwp;
struct mbuf *m, **mp, *mt;
size_t len, offset, moff, orig_resid;
int atomic, flags, error, s, type;
const struct protosw *pr;
struct mbuf *nextrecord;
int mbuf_removed = 0;
const struct domain *dom;
short wakeup_state = 0;
pr = so->so_proto;
atomic = pr->pr_flags & PR_ATOMIC;
dom = pr->pr_domain;
mp = mp0;
type = 0;
orig_resid = uio->uio_resid;
if (paddr != NULL)
*paddr = NULL;
if (controlp != NULL)
*controlp = NULL;
if (flagsp != NULL)
flags = *flagsp &~ MSG_EOR;
else
flags = 0;
if (flags & MSG_OOB) {
m = m_get(M_WAIT, MT_DATA);
solock(so);
error = (*pr->pr_usrreqs->pr_recvoob)(so, m, flags & MSG_PEEK);
sounlock(so);
if (error)
goto bad;
do {
error = uiomove(mtod(m, void *),
MIN(uio->uio_resid, m->m_len), uio);
m = m_free(m);
} while (uio->uio_resid > 0 && error == 0 && m);
bad:
if (m != NULL) m_freem(m);
return error;
}
if (mp != NULL) *mp = NULL;
/*
* solock() provides atomicity of access. splsoftnet() prevents
* protocol processing soft interrupts from interrupting us and
* blocking (expensive).
*/
s = splsoftnet();
solock(so);
restart:
if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) {
sounlock(so);
splx(s);
return error;
}
m = so->so_rcv.sb_mb;
/*
* If we have less data than requested, block awaiting more
* (subject to any timeout) if:
* 1. the current count is less than the low water mark,
* 2. MSG_WAITALL is set, and it is possible to do the entire
* receive operation at once if we block (resid <= hiwat), or
* 3. MSG_DONTWAIT is not set.
* If MSG_WAITALL is set but resid is larger than the receive buffer,
* we have to do the receive in sections, and thus risk returning
* a short count if a timeout or signal occurs after we start.
*/
if (m == NULL ||
((flags & MSG_DONTWAIT) == 0 &&
so->so_rcv.sb_cc < uio->uio_resid &&
(so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
((flags & MSG_WAITALL) &&
uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
m->m_nextpkt == NULL && !atomic)) {
#ifdef DIAGNOSTIC
if (m == NULL && so->so_rcv.sb_cc) panic("receive 1");
#endif
if (so->so_error || so->so_rerror) {
u_short *e;
if (m != NULL)
goto dontblock;
e = so->so_error ? &so->so_error : &so->so_rerror;
error = *e;
if ((flags & MSG_PEEK) == 0) *e = 0;
goto release;
}
if (so->so_state & SS_CANTRCVMORE) {
if (m != NULL)
goto dontblock;
else
goto release;
}
for (; m != NULL; m = m->m_next)
if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
m = so->so_rcv.sb_mb;
goto dontblock;
}
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
(so->so_proto->pr_flags & PR_CONNREQUIRED)) {
error = ENOTCONN;
goto release;
}
if (uio->uio_resid == 0)
goto release;
if ((so->so_state & SS_NBIO) ||
(flags & (MSG_DONTWAIT|MSG_NBIO))) {
error = EWOULDBLOCK;
goto release;
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
sbunlock(&so->so_rcv);
if (wakeup_state & SS_RESTARTSYS)
error = ERESTART;
else
error = sbwait(&so->so_rcv);
if (error != 0) {
sounlock(so);
splx(s);
return error;
}
wakeup_state = so->so_state;
goto restart;
}
dontblock:
/*
* On entry here, m points to the first record of the socket buffer.
* From this point onward, we maintain 'nextrecord' as a cache of the
* pointer to the next record in the socket buffer. We must keep the
* various socket buffer pointers and local stack versions of the
* pointers in sync, pushing out modifications before dropping the
* socket lock, and re-reading them when picking it up.
*
* Otherwise, we will race with the network stack appending new data
* or records onto the socket buffer by using inconsistent/stale
* versions of the field, possibly resulting in socket buffer
* corruption.
*
* By holding the high-level sblock(), we prevent simultaneous
* readers from pulling off the front of the socket buffer.
*/
if (l != NULL)
l->l_ru.ru_msgrcv++;
KASSERT(m == so->so_rcv.sb_mb);
SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
nextrecord = m->m_nextpkt;
if (pr->pr_flags & PR_ADDR) {
KASSERT(m->m_type == MT_SONAME);
orig_resid = 0;
if (flags & MSG_PEEK) {
if (paddr) *paddr = m_copym(m, 0, m->m_len, M_DONTWAIT);
m = m->m_next;
} else {
sbfree(&so->so_rcv, m);
mbuf_removed = 1;
if (paddr != NULL) {
*paddr = m;
so->so_rcv.sb_mb = m->m_next;
m->m_next = NULL;
m = so->so_rcv.sb_mb;
} else {
m = so->so_rcv.sb_mb = m_free(m);
}
sbsync(&so->so_rcv, nextrecord);
}
}
if (pr->pr_flags & PR_ADDR_OPT) {
/*
* For SCTP we may be getting a whole message OR a partial
* delivery.
*/
if (m->m_type == MT_SONAME) {
orig_resid = 0;
if (flags & MSG_PEEK) {
if (paddr) *paddr = m_copym(m, 0, m->m_len, M_DONTWAIT);
m = m->m_next;
} else {
sbfree(&so->so_rcv, m);
mbuf_removed = 1;
if (paddr) {
*paddr = m;
so->so_rcv.sb_mb = m->m_next;
m->m_next = 0;
m = so->so_rcv.sb_mb;
} else {
m = so->so_rcv.sb_mb = m_free(m);
}
sbsync(&so->so_rcv, nextrecord);
}
}
}
/*
* Process one or more MT_CONTROL mbufs present before any data mbufs
* in the first mbuf chain on the socket buffer. If MSG_PEEK, we
* just copy the data; if !MSG_PEEK, we call into the protocol to
* perform externalization (or freeing if controlp == NULL).
*/
if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) {
struct mbuf *cm = NULL, *cmn;
struct mbuf **cme = &cm;
do {
if (flags & MSG_PEEK) {
if (controlp != NULL) {
*controlp = m_copym(m, 0, m->m_len, M_DONTWAIT);
controlp = (*controlp == NULL ? NULL :
&(*controlp)->m_next);
}
m = m->m_next;
} else {
sbfree(&so->so_rcv, m);
so->so_rcv.sb_mb = m->m_next;
m->m_next = NULL;
*cme = m;
cme = &(*cme)->m_next;
m = so->so_rcv.sb_mb;
}
} while (m != NULL && m->m_type == MT_CONTROL);
if ((flags & MSG_PEEK) == 0)
sbsync(&so->so_rcv, nextrecord);
for (; cm != NULL; cm = cmn) {
cmn = cm->m_next;
cm->m_next = NULL;
type = mtod(cm, struct cmsghdr *)->cmsg_type;
if (controlp != NULL) {
if (dom->dom_externalize != NULL &&
type == SCM_RIGHTS) {
sounlock(so);
splx(s);
error = (*dom->dom_externalize)(cm, l,
(flags & MSG_CMSG_CLOEXEC) ?
O_CLOEXEC : 0);
s = splsoftnet();
solock(so);
}
*controlp = cm;
while (*controlp != NULL) controlp = &(*controlp)->m_next;
} else {
/*
* Dispose of any SCM_RIGHTS message that went
* through the read path rather than recv.
*/
if (dom->dom_dispose != NULL &&
type == SCM_RIGHTS) {
sounlock(so);
(*dom->dom_dispose)(cm);
solock(so);
}
m_freem(cm);
}
}
if (m != NULL)
nextrecord = so->so_rcv.sb_mb->m_nextpkt;
else
nextrecord = so->so_rcv.sb_mb;
orig_resid = 0;
}
/* If m is non-NULL, we have some data to read. */
if (__predict_true(m != NULL)) {
type = m->m_type;
if (type == MT_OOBDATA)
flags |= MSG_OOB;
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
moff = 0;
offset = 0;
while (m != NULL && uio->uio_resid > 0 && error == 0) {
/*
* If the type of mbuf has changed, end the receive
* operation and do a short read.
*/
if (m->m_type == MT_OOBDATA) {
if (type != MT_OOBDATA)
break;
} else if (type == MT_OOBDATA) {
break;
} else if (m->m_type == MT_CONTROL) {
break;
}
#ifdef DIAGNOSTIC
else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
panic("%s: m_type=%d", __func__, m->m_type);
}
#endif
so->so_state &= ~SS_RCVATMARK;
wakeup_state = 0;
len = uio->uio_resid;
if (so->so_oobmark && len > so->so_oobmark - offset)
len = so->so_oobmark - offset;
if (len > m->m_len - moff)
len = m->m_len - moff;
/*
* If mp is set, just pass back the mbufs.
* Otherwise copy them out via the uio, then free.
* Sockbuf must be consistent here (points to current mbuf,
* it points to next record) when we drop priority;
* we must note any additions to the sockbuf when we
* block interrupts again.
*/
if (mp == NULL) {
SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
sounlock(so);
splx(s);
error = uiomove(mtod(m, char *) + moff, len, uio);
s = splsoftnet();
solock(so);
if (error != 0) {
/*
* If any part of the record has been removed
* (such as the MT_SONAME mbuf, which will
* happen when PR_ADDR, and thus also
* PR_ATOMIC, is set), then drop the entire
* record to maintain the atomicity of the
* receive operation.
*
* This avoids a later panic("receive 1a")
* when compiled with DIAGNOSTIC.
*/
if (m && mbuf_removed && atomic) (void) sbdroprecord(&so->so_rcv);
goto release;
}
} else {
uio->uio_resid -= len;
}
if (len == m->m_len - moff) {
if (m->m_flags & M_EOR)
flags |= MSG_EOR;
#ifdef SCTP
if (m->m_flags & M_NOTIFICATION)
flags |= MSG_NOTIFICATION;
#endif
if (flags & MSG_PEEK) {
m = m->m_next;
moff = 0;
} else {
nextrecord = m->m_nextpkt;
sbfree(&so->so_rcv, m);
if (mp) {
*mp = m;
mp = &m->m_next;
so->so_rcv.sb_mb = m = m->m_next;
*mp = NULL;
} else {
m = so->so_rcv.sb_mb = m_free(m);
}
/*
* If m != NULL, we also know that
* so->so_rcv.sb_mb != NULL.
*/
KASSERT(so->so_rcv.sb_mb == m);
if (m) {
m->m_nextpkt = nextrecord;
if (nextrecord == NULL) so->so_rcv.sb_lastrecord = m;
} else {
so->so_rcv.sb_mb = nextrecord;
SB_EMPTY_FIXUP(&so->so_rcv);
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
}
} else if (flags & MSG_PEEK) {
moff += len;
} else {
if (mp != NULL) {
mt = m_copym(m, 0, len, M_NOWAIT);
if (__predict_false(mt == NULL)) {
sounlock(so);
mt = m_copym(m, 0, len, M_WAIT);
solock(so);
}
*mp = mt;
}
m->m_data += len;
m->m_len -= len;
so->so_rcv.sb_cc -= len;
}
if (so->so_oobmark) {
if ((flags & MSG_PEEK) == 0) {
so->so_oobmark -= len;
if (so->so_oobmark == 0) {
so->so_state |= SS_RCVATMARK;
break;
}
} else {
offset += len;
if (offset == so->so_oobmark)
break;
}
} else {
so->so_state &= ~SS_POLLRDBAND;
}
if (flags & MSG_EOR)
break;
/*
* If the MSG_WAITALL flag is set (for non-atomic socket),
* we must not quit until "uio->uio_resid == 0" or an error
* termination. If a signal/timeout occurs, return
* with a short count but without error.
* Keep sockbuf locked against other readers.
*/
while (flags & MSG_WAITALL && m == NULL &&
uio->uio_resid > 0 && !sosendallatonce(so) && !nextrecord) {
if (so->so_error || so->so_rerror ||
so->so_state & SS_CANTRCVMORE)
break;
/*
* If we are peeking and the socket receive buffer is
* full, stop since we can't get more data to peek at.
*/
if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
break;
/*
* If we've drained the socket buffer, tell the
* protocol in case it needs to do something to
* get it filled again.
*/
if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb) (*pr->pr_usrreqs->pr_rcvd)(so, flags, l);
SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
if (wakeup_state & SS_RESTARTSYS)
error = ERESTART;
else
error = sbwait(&so->so_rcv);
if (error != 0) {
sbunlock(&so->so_rcv);
sounlock(so);
splx(s);
return 0;
}
if ((m = so->so_rcv.sb_mb) != NULL)
nextrecord = m->m_nextpkt;
wakeup_state = so->so_state;
}
}
if (m && atomic) {
flags |= MSG_TRUNC;
if ((flags & MSG_PEEK) == 0) (void) sbdroprecord(&so->so_rcv);
}
if ((flags & MSG_PEEK) == 0) {
if (m == NULL) {
/*
* First part is an inline SB_EMPTY_FIXUP(). Second
* part makes sure sb_lastrecord is up-to-date if
* there is still data in the socket buffer.
*/
so->so_rcv.sb_mb = nextrecord;
if (so->so_rcv.sb_mb == NULL) {
so->so_rcv.sb_mbtail = NULL;
so->so_rcv.sb_lastrecord = NULL;
} else if (nextrecord->m_nextpkt == NULL)
so->so_rcv.sb_lastrecord = nextrecord;
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) (*pr->pr_usrreqs->pr_rcvd)(so, flags, l);
}
if (orig_resid == uio->uio_resid && orig_resid &&
(flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
sbunlock(&so->so_rcv);
goto restart;
}
if (flagsp != NULL) *flagsp |= flags;
release:
sbunlock(&so->so_rcv);
sounlock(so);
splx(s);
return error;
}
int
soshutdown(struct socket *so, int how)
{
const struct protosw *pr;
int error;
KASSERT(solocked(so));
pr = so->so_proto;
if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
return EINVAL;
if (how == SHUT_RD || how == SHUT_RDWR) {
sorflush(so);
error = 0;
}
if (how == SHUT_WR || how == SHUT_RDWR)
error = (*pr->pr_usrreqs->pr_shutdown)(so);
return error;
}
void
sorestart(struct socket *so)
{
/*
* An application has called close() on an fd on which another
* of its threads has called a socket system call.
* Mark this and wake everyone up, and code that would block again
* instead returns ERESTART.
* On system call re-entry the fd is validated and EBADF returned.
* Any other fd will block again on the 2nd syscall.
*/
solock(so);
so->so_state |= SS_RESTARTSYS;
cv_broadcast(&so->so_cv);
cv_broadcast(&so->so_snd.sb_cv);
cv_broadcast(&so->so_rcv.sb_cv);
sounlock(so);
}
void
sorflush(struct socket *so)
{
struct sockbuf *sb, asb;
const struct protosw *pr;
KASSERT(solocked(so));
sb = &so->so_rcv;
pr = so->so_proto;
socantrcvmore(so);
sb->sb_flags |= SB_NOINTR;
(void )sblock(sb, M_WAITOK);
sbunlock(sb);
asb = *sb;
/*
* Clear most of the sockbuf structure, but leave some of the
* fields valid.
*/
memset(&sb->sb_startzero, 0,
sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) {
sounlock(so);
(*pr->pr_domain->dom_dispose)(asb.sb_mb);
solock(so);
}
sbrelease(&asb, so);
}
/*
* internal set SOL_SOCKET options
*/
static int
sosetopt1(struct socket *so, const struct sockopt *sopt)
{
int error, opt;
int optval = 0; /* XXX: gcc */
struct linger l;
struct timeval tv;
opt = sopt->sopt_name;
switch (opt) {
case SO_ACCEPTFILTER:
error = accept_filt_setopt(so, sopt);
KASSERT(solocked(so));
break;
case SO_LINGER:
error = sockopt_get(sopt, &l, sizeof(l));
solock(so);
if (error)
break;
if (l.l_linger < 0 || l.l_linger > USHRT_MAX ||
l.l_linger > (INT_MAX / hz)) {
error = EDOM;
break;
}
so->so_linger = l.l_linger;
if (l.l_onoff)
so->so_options |= SO_LINGER;
else
so->so_options &= ~SO_LINGER;
break;
case SO_DEBUG:
case SO_KEEPALIVE:
case SO_DONTROUTE:
case SO_USELOOPBACK:
case SO_BROADCAST:
case SO_REUSEADDR:
case SO_REUSEPORT:
case SO_OOBINLINE:
case SO_TIMESTAMP:
case SO_NOSIGPIPE:
case SO_RERROR:
error = sockopt_getint(sopt, &optval);
solock(so);
if (error)
break;
if (optval)
so->so_options |= opt;
else
so->so_options &= ~opt;
break;
case SO_SNDBUF:
case SO_RCVBUF:
case SO_SNDLOWAT:
case SO_RCVLOWAT:
error = sockopt_getint(sopt, &optval);
solock(so);
if (error)
break;
/*
* Values < 1 make no sense for any of these
* options, so disallow them.
*/
if (optval < 1) {
error = EINVAL;
break;
}
switch (opt) {
case SO_SNDBUF:
if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) {
error = ENOBUFS;
break;
}
if (sofixedbuf) so->so_snd.sb_flags &= ~SB_AUTOSIZE;
break;
case SO_RCVBUF:
if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) {
error = ENOBUFS;
break;
}
if (sofixedbuf) so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
break;
/*
* Make sure the low-water is never greater than
* the high-water.
*/
case SO_SNDLOWAT:
if (optval > so->so_snd.sb_hiwat) optval = so->so_snd.sb_hiwat;
so->so_snd.sb_lowat = optval;
break;
case SO_RCVLOWAT:
if (optval > so->so_rcv.sb_hiwat) optval = so->so_rcv.sb_hiwat;
so->so_rcv.sb_lowat = optval;
break;
}
break;
case SO_SNDTIMEO:
case SO_RCVTIMEO:
solock(so);
error = sockopt_get(sopt, &tv, sizeof(tv));
if (error)
break;
if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
error = EDOM;
break;
}
if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) {
error = EDOM;
break;
}
optval = tv.tv_sec * hz + tv.tv_usec / tick;
if (optval == 0 && tv.tv_usec != 0)
optval = 1;
switch (opt) {
case SO_SNDTIMEO:
so->so_snd.sb_timeo = optval;
break;
case SO_RCVTIMEO:
so->so_rcv.sb_timeo = optval;
break;
}
break;
default:
MODULE_HOOK_CALL(uipc_socket_50_setopt1_hook,
(opt, so, sopt), enosys(), error);
if (error == ENOSYS || error == EPASSTHROUGH) {
solock(so);
error = ENOPROTOOPT;
}
break;
}
KASSERT(solocked(so));
return error;
}
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
int error, prerr;
if (sopt->sopt_level == SOL_SOCKET) {
error = sosetopt1(so, sopt);
KASSERT(solocked(so));
} else {
error = ENOPROTOOPT;
solock(so);
}
if ((error == 0 || error == ENOPROTOOPT) &&
so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) {
/* give the protocol stack a shot */
prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt);
if (prerr == 0)
error = 0;
else if (prerr != ENOPROTOOPT)
error = prerr;
}
sounlock(so);
return error;
}
/*
* so_setsockopt() is a wrapper providing a sockopt structure for sosetopt()
*/
int
so_setsockopt(struct lwp *l, struct socket *so, int level, int name,
const void *val, size_t valsize)
{
struct sockopt sopt;
int error;
KASSERT(valsize == 0 || val != NULL);
sockopt_init(&sopt, level, name, valsize);
sockopt_set(&sopt, val, valsize);
error = sosetopt(so, &sopt);
sockopt_destroy(&sopt);
return error;
}
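/*
 * Illustrative sketch (not part of the original source): a kernel
 * caller that wants to enable SO_KEEPALIVE on a socket it already
 * holds could use the wrapper above roughly as follows; "l" and "so"
 * are assumed to come from the caller's context.
 *
 *	int one = 1;
 *	error = so_setsockopt(l, so, SOL_SOCKET, SO_KEEPALIVE,
 *	    &one, sizeof(one));
 *
 * so_setsockopt() builds the temporary struct sockopt, hands it to
 * sosetopt() (which takes and releases the socket lock), and then
 * destroys the sockopt storage.
 */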
/*
* internal get SOL_SOCKET options
*/
static int
sogetopt1(struct socket *so, struct sockopt *sopt)
{
int error, optval, opt;
struct linger l;
struct timeval tv;
switch ((opt = sopt->sopt_name)) {
case SO_ACCEPTFILTER:
error = accept_filt_getopt(so, sopt);
break;
case SO_LINGER:
l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0;
l.l_linger = so->so_linger;
error = sockopt_set(sopt, &l, sizeof(l));
break;
case SO_USELOOPBACK:
case SO_DONTROUTE:
case SO_DEBUG:
case SO_KEEPALIVE:
case SO_REUSEADDR:
case SO_REUSEPORT:
case SO_BROADCAST:
case SO_OOBINLINE:
case SO_TIMESTAMP:
case SO_NOSIGPIPE:
case SO_RERROR:
case SO_ACCEPTCONN:
error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0);
break;
case SO_TYPE:
error = sockopt_setint(sopt, so->so_type);
break;
case SO_ERROR:
if (so->so_error == 0) {
so->so_error = so->so_rerror;
so->so_rerror = 0;
}
error = sockopt_setint(sopt, so->so_error);
so->so_error = 0;
break;
case SO_SNDBUF:
error = sockopt_setint(sopt, so->so_snd.sb_hiwat);
break;
case SO_RCVBUF:
error = sockopt_setint(sopt, so->so_rcv.sb_hiwat);
break;
case SO_SNDLOWAT:
error = sockopt_setint(sopt, so->so_snd.sb_lowat);
break;
case SO_RCVLOWAT:
error = sockopt_setint(sopt, so->so_rcv.sb_lowat);
break;
case SO_SNDTIMEO:
case SO_RCVTIMEO:
optval = (opt == SO_SNDTIMEO ?
so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
memset(&tv, 0, sizeof(tv));
tv.tv_sec = optval / hz;
tv.tv_usec = (optval % hz) * tick;
error = sockopt_set(sopt, &tv, sizeof(tv));
break;
case SO_OVERFLOWED:
error = sockopt_setint(sopt, so->so_rcv.sb_overflowed);
break;
default:
MODULE_HOOK_CALL(uipc_socket_50_getopt1_hook,
(opt, so, sopt), enosys(), error);
if (error)
error = ENOPROTOOPT;
break;
}
return error;
}
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
int error;
solock(so);
if (sopt->sopt_level != SOL_SOCKET) {
if (so->so_proto && so->so_proto->pr_ctloutput) {
error = ((*so->so_proto->pr_ctloutput)
(PRCO_GETOPT, so, sopt));
} else
error = (ENOPROTOOPT);
} else {
error = sogetopt1(so, sopt);
}
sounlock(so);
return error;
}
/*
* alloc sockopt data buffer
* - will be released at destroy
*/
static int
sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag)
{
void *data;
KASSERT(sopt->sopt_size == 0);
if (len > sizeof(sopt->sopt_buf)) {
data = kmem_zalloc(len, kmflag);
if (data == NULL)
return ENOMEM;
sopt->sopt_data = data;
} else
sopt->sopt_data = sopt->sopt_buf;
sopt->sopt_size = len;
return 0;
}
/*
* initialise sockopt storage
* - MAY sleep during allocation
*/
void
sockopt_init(struct sockopt *sopt, int level, int name, size_t size)
{
memset(sopt, 0, sizeof(*sopt));
sopt->sopt_level = level;
sopt->sopt_name = name;
(void)sockopt_alloc(sopt, size, KM_SLEEP);
}
/*
* destroy sockopt storage
* - will release any held memory references
*/
void
sockopt_destroy(struct sockopt *sopt)
{
if (sopt->sopt_data != sopt->sopt_buf)
kmem_free(sopt->sopt_data, sopt->sopt_size);
memset(sopt, 0, sizeof(*sopt));
}
/*
* set sockopt value
* - value is copied into sockopt
* - memory is allocated when necessary, will not sleep
*/
int
sockopt_set(struct sockopt *sopt, const void *buf, size_t len)
{
int error;
if (sopt->sopt_size == 0) {
error = sockopt_alloc(sopt, len, KM_NOSLEEP);
if (error)
return error;
}
sopt->sopt_retsize = MIN(sopt->sopt_size, len);
if (sopt->sopt_retsize > 0) {
memcpy(sopt->sopt_data, buf, sopt->sopt_retsize);
}
return 0;
}
/*
* common case of set sockopt integer value
*/
int
sockopt_setint(struct sockopt *sopt, int val)
{
return sockopt_set(sopt, &val, sizeof(int));
}
/*
* get sockopt value
* - correct size must be given
*/
int
sockopt_get(const struct sockopt *sopt, void *buf, size_t len)
{
if (sopt->sopt_size != len)
return EINVAL;
memcpy(buf, sopt->sopt_data, len);
return 0;
}
/*
* common case of get sockopt integer value
*/
int
sockopt_getint(const struct sockopt *sopt, int *valp)
{
return sockopt_get(sopt, valp, sizeof(int));
}
/*
* set sockopt value from mbuf
* - ONLY for legacy code
* - mbuf is released by sockopt
* - will not sleep
*/
int
sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m)
{
size_t len;
int error;
len = m_length(m);
if (sopt->sopt_size == 0) {
error = sockopt_alloc(sopt, len, KM_NOSLEEP);
if (error)
return error;
}
sopt->sopt_retsize = MIN(sopt->sopt_size, len);
m_copydata(m, 0, sopt->sopt_retsize, sopt->sopt_data);
m_freem(m);
return 0;
}
/*
* get sockopt value into mbuf
* - ONLY for legacy code
* - mbuf to be released by the caller
* - will not sleep
*/
struct mbuf *
sockopt_getmbuf(const struct sockopt *sopt)
{
struct mbuf *m;
if (sopt->sopt_size > MCLBYTES)
return NULL;
m = m_get(M_DONTWAIT, MT_SOOPTS);
if (m == NULL)
return NULL;
if (sopt->sopt_size > MLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return NULL;
}
}
memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size);
m->m_len = sopt->sopt_size;
return m;
}
void
sohasoutofband(struct socket *so)
{
so->so_state |= SS_POLLRDBAND;
fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT);
}
static void
filt_sordetach(struct knote *kn)
{
struct socket *so;
so = ((file_t *)kn->kn_obj)->f_socket;
solock(so);
if (selremove_knote(&so->so_rcv.sb_sel, kn))
so->so_rcv.sb_flags &= ~SB_KNOTE;
sounlock(so);
}
/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
struct socket *so;
int rv;
so = ((file_t *)kn->kn_obj)->f_socket;
if (hint != NOTE_SUBMIT)
solock(so);
kn->kn_data = so->so_rcv.sb_cc;
if (so->so_state & SS_CANTRCVMORE) {
knote_set_eof(kn, 0);
kn->kn_fflags = so->so_error;
rv = 1;
} else if (so->so_error || so->so_rerror)
rv = 1;
else if (kn->kn_sfflags & NOTE_LOWAT)
rv = (kn->kn_data >= kn->kn_sdata);
else
rv = (kn->kn_data >= so->so_rcv.sb_lowat);
if (hint != NOTE_SUBMIT)
sounlock(so);
return rv;
}
static void
filt_sowdetach(struct knote *kn)
{
struct socket *so;
so = ((file_t *)kn->kn_obj)->f_socket;
solock(so);
if (selremove_knote(&so->so_snd.sb_sel, kn))
so->so_snd.sb_flags &= ~SB_KNOTE;
sounlock(so);
}
/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
struct socket *so;
int rv;
so = ((file_t *)kn->kn_obj)->f_socket;
if (hint != NOTE_SUBMIT)
solock(so);
kn->kn_data = sbspace(&so->so_snd);
if (so->so_state & SS_CANTSENDMORE) {
knote_set_eof(kn, 0);
kn->kn_fflags = so->so_error;
rv = 1;
} else if (so->so_error)
rv = 1;
else if (((so->so_state & SS_ISCONNECTED) == 0) &&
(so->so_proto->pr_flags & PR_CONNREQUIRED))
rv = 0;
else if (kn->kn_sfflags & NOTE_LOWAT)
rv = (kn->kn_data >= kn->kn_sdata);
else
rv = (kn->kn_data >= so->so_snd.sb_lowat);
if (hint != NOTE_SUBMIT)
sounlock(so);
return rv;
}
static int
filt_soempty(struct knote *kn, long hint)
{
struct socket *so;
int rv;
so = ((file_t *)kn->kn_obj)->f_socket;
if (hint != NOTE_SUBMIT)
solock(so);
rv = (kn->kn_data = sbused(&so->so_snd)) == 0 ||
(so->so_options & SO_ACCEPTCONN) != 0;
if (hint != NOTE_SUBMIT)
sounlock(so);
return rv;
}
/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
struct socket *so;
int rv;
so = ((file_t *)kn->kn_obj)->f_socket;
/*
* Set kn_data to number of incoming connections, not
* counting partial (incomplete) connections.
*/
if (hint != NOTE_SUBMIT)
solock(so);
kn->kn_data = so->so_qlen;
rv = (kn->kn_data > 0);
if (hint != NOTE_SUBMIT)
sounlock(so);
return rv;
}
static const struct filterops solisten_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_sordetach,
.f_event = filt_solisten,
};
static const struct filterops soread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_sordetach,
.f_event = filt_soread,
};
static const struct filterops sowrite_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_sowdetach,
.f_event = filt_sowrite,
};
static const struct filterops soempty_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_sowdetach,
.f_event = filt_soempty,
};
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
struct socket *so;
struct sockbuf *sb;
so = ((file_t *)kn->kn_obj)->f_socket;
solock(so);
switch (kn->kn_filter) {
case EVFILT_READ:
if (so->so_options & SO_ACCEPTCONN)
kn->kn_fop = &solisten_filtops;
else
kn->kn_fop = &soread_filtops;
sb = &so->so_rcv;
break;
case EVFILT_WRITE:
kn->kn_fop = &sowrite_filtops;
sb = &so->so_snd;
#ifdef PIPE_SOCKETPAIR
if (so->so_state & SS_ISAPIPE) {
/* Other end of pipe has been closed. */
if (so->so_state & SS_ISDISCONNECTED) {
sounlock(so);
return EBADF;
}
}
#endif
break;
case EVFILT_EMPTY:
kn->kn_fop = &soempty_filtops;
sb = &so->so_snd;
break;
default:
sounlock(so);
return EINVAL;
}
selrecord_knote(&sb->sb_sel, kn);
sb->sb_flags |= SB_KNOTE;
sounlock(so);
return 0;
}
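/*
 * Illustrative userland sketch (not part of the original source): the
 * filter operations above are what back an EVFILT_READ registration on
 * a socket descriptor, e.g. ("sock_fd" names the caller's socket):
 *
 *	struct kevent ev;
 *	int kq = kqueue();
 *	EV_SET(&ev, sock_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *
 * For a listening socket the same registration is routed to
 * solisten_filtops and fires once so_qlen becomes non-zero.
 */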
static int
sodopoll(struct socket *so, int events)
{
int revents;
revents = 0;
if (events & (POLLIN | POLLRDNORM))
if (soreadable(so))
revents |= events & (POLLIN | POLLRDNORM);
if (events & (POLLOUT | POLLWRNORM))
if (sowritable(so))
revents |= events & (POLLOUT | POLLWRNORM);
if (events & (POLLPRI | POLLRDBAND))
if (so->so_state & SS_POLLRDBAND)
revents |= events & (POLLPRI | POLLRDBAND);
return revents;
}
int
sopoll(struct socket *so, int events)
{
int revents = 0;
#ifndef DIAGNOSTIC
/*
* Do a quick, unlocked check in expectation that the socket
* will be ready for I/O. Don't do this check if DIAGNOSTIC,
* as the solocked() assertions will fail.
*/
if ((revents = sodopoll(so, events)) != 0)
return revents;
#endif
solock(so);
if ((revents = sodopoll(so, events)) == 0) {
if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
selrecord(curlwp, &so->so_rcv.sb_sel);
so->so_rcv.sb_flags |= SB_NOTIFY;
}
if (events & (POLLOUT | POLLWRNORM)) {
selrecord(curlwp, &so->so_snd.sb_sel);
so->so_snd.sb_flags |= SB_NOTIFY;
}
}
sounlock(so);
return revents;
}
struct mbuf **
sbsavetimestamp(int opt, struct mbuf **mp)
{
struct timeval tv;
int error;
memset(&tv, 0, sizeof(tv));
microtime(&tv);
MODULE_HOOK_CALL(uipc_socket_50_sbts_hook, (opt, &mp), enosys(), error);
if (error == 0)
return mp;
if (opt & SO_TIMESTAMP) {
*mp = sbcreatecontrol(&tv, sizeof(tv),
SCM_TIMESTAMP, SOL_SOCKET);
if (*mp)
mp = &(*mp)->m_next;
}
return mp;
}
#include <sys/sysctl.h>
static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);
static int sysctl_kern_sbmax(SYSCTLFN_PROTO);
/*
* sysctl helper routine for kern.somaxkva. ensures that the given
* value is not too small.
* (XXX should we maybe make sure it's not too large as well?)
*/
static int
sysctl_kern_somaxkva(SYSCTLFN_ARGS)
{
int error, new_somaxkva;
struct sysctlnode node;
new_somaxkva = somaxkva;
node = *rnode;
node.sysctl_data = &new_somaxkva;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
return EINVAL;
mutex_enter(&so_pendfree_lock);
somaxkva = new_somaxkva;
cv_broadcast(&socurkva_cv);
mutex_exit(&so_pendfree_lock);
return error;
}
/*
* sysctl helper routine for kern.sbmax. Basically just ensures that
* any new value is not too small.
*/
static int
sysctl_kern_sbmax(SYSCTLFN_ARGS)
{
int error, new_sbmax;
struct sysctlnode node;
new_sbmax = sb_max;
node = *rnode;
node.sysctl_data = &new_sbmax;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
KERNEL_LOCK(1, NULL);
error = sb_max_set(new_sbmax);
KERNEL_UNLOCK_ONE(NULL);
return error;
}
/*
* sysctl helper routine for kern.sooptions. Ensures that only allowed
* options can be set.
*/
static int
sysctl_kern_sooptions(SYSCTLFN_ARGS)
{
int error, new_options;
struct sysctlnode node;
new_options = sooptions;
node = *rnode;
node.sysctl_data = &new_options;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (new_options & ~SO_DEFOPTS)
return EINVAL;
sooptions = new_options;
return 0;
}
static void
sysctl_kern_socket_setup(void)
{
KASSERT(socket_sysctllog == NULL);
sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "somaxkva",
SYSCTL_DESCR("Maximum amount of kernel memory to be "
"used for socket buffers"),
sysctl_kern_somaxkva, 0, NULL, 0,
CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_BOOL, "sofixedbuf",
SYSCTL_DESCR("Prevent scaling of fixed socket buffers"),
NULL, 0, &sofixedbuf, 0,
CTL_KERN, KERN_SOFIXEDBUF, CTL_EOL);
sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "sbmax",
SYSCTL_DESCR("Maximum socket buffer size"),
sysctl_kern_sbmax, 0, NULL, 0,
CTL_KERN, KERN_SBMAX, CTL_EOL);
sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "sooptions",
SYSCTL_DESCR("Default socket options"),
sysctl_kern_sooptions, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
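/*
 * Illustrative sketch (not part of the original source): the nodes
 * created above live under kern.* and can be inspected or tuned with
 * sysctl(8), e.g.
 *
 *	# sysctl kern.somaxkva
 *	# sysctl -w kern.sbmax=1048576
 *
 * Writes are validated by the helper routines above: somaxkva must be
 * at least 16 MB, and sooptions may only contain bits from SO_DEFOPTS.
 */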
/* $NetBSD: ufs_lookup.c,v 1.158 2023/08/10 20:49:20 mrg Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_lookup.c 8.9 (Berkeley) 8/11/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.158 2023/08/10 20:49:20 mrg Exp $");
#ifdef _KERNEL_OPT
#include "opt_ffs.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/buf.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/wapbl.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dirhash.h>
#endif
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <miscfs/genfs/genfs.h>
#ifdef DIAGNOSTIC
int dirchk = 1;
#else
int dirchk = 0;
#endif
#if BYTE_ORDER == LITTLE_ENDIAN
# define ENDIANSWAP(needswap) ((needswap) == 0)
#else
# define ENDIANSWAP(needswap) ((needswap) != 0)
#endif
#define NAMLEN(fsfmt, needswap, dp) \
((fsfmt) && ENDIANSWAP(needswap) ? (dp)->d_type : (dp)->d_namlen)
static void
ufs_dirswap(struct direct *dirp)
{
uint8_t tmp = dirp->d_namlen;
dirp->d_namlen = dirp->d_type;
dirp->d_type = tmp;
}
struct slotinfo {
enum {
NONE, /* need to search a slot for our new entry */
COMPACT, /* a compaction can make a slot in the current
DIRBLKSIZ block */
FOUND, /* found a slot (or no need to search) */
} status;
doff_t offset; /* offset of area with free space.
a special value -1 for invalid */
int size; /* size of area at slotoffset */
int freespace; /* accumulated amount of space free in
the current DIRBLKSIZ block */
int needed; /* size of the entry we're seeking */
};
static void
calc_count(struct ufs_lookup_results *results, int dirblksiz, doff_t prevoff)
{
if ((results->ulr_offset & (dirblksiz - 1)) == 0)
results->ulr_count = 0;
else
results->ulr_count = results->ulr_offset - prevoff;
}
static void
slot_init(struct slotinfo *slot)
{
slot->status = FOUND;
slot->offset = -1;
slot->freespace = slot->size = slot->needed = 0;
}
#ifdef UFS_DIRHASH
static doff_t
slot_findfree(struct slotinfo *slot, struct inode *dp)
{
if (slot->status == FOUND)
return dp->i_size;
slot->offset = ufsdirhash_findfree(dp, slot->needed, &slot->size);
if (slot->offset < 0)
return dp->i_size;
slot->status = COMPACT;
doff_t enduseful = ufsdirhash_enduseful(dp);
if (enduseful < 0)
return dp->i_size;
return enduseful;
}
#endif
static void
slot_white(struct slotinfo *slot, uint16_t reclen,
struct ufs_lookup_results *results)
{
slot->status = FOUND;
slot->offset = results->ulr_offset;
slot->size = reclen;
results->ulr_reclen = slot->size;
}
static void
slot_update(struct slotinfo *slot, int size, uint16_t reclen, doff_t offset)
{
if (size >= slot->needed) {
slot->status = FOUND;
slot->offset = offset;
slot->size = reclen;
} else if (slot->status == NONE) {
slot->freespace += size;
if (slot->offset == -1)
slot->offset = offset;
if (slot->freespace >= slot->needed) {
slot->status = COMPACT;
slot->size = offset + reclen - slot->offset;
}
}
}
/*
* Return an indication of where the new directory entry should be put.
* If we didn't find a slot, then set results->ulr_count to 0 indicating
* that the new slot belongs at the end of the directory. If we found a slot,
* then the new entry can be put in the range from results->ulr_offset to
* results->ulr_offset + results->ulr_count.
*/
static int
slot_estimate(const struct slotinfo *slot, int dirblksiz, int nameiop,
doff_t prevoff, doff_t enduseful, const struct inode *ip,
struct ufs_lookup_results *results)
{
if (slot->status == NONE) {
results->ulr_offset = roundup(ip->i_size, dirblksiz);
results->ulr_count = 0;
enduseful = results->ulr_offset;
} else if (nameiop == DELETE) {
results->ulr_offset = slot->offset;
calc_count(results, dirblksiz, prevoff);
} else {
results->ulr_offset = slot->offset;
results->ulr_count = slot->size;
if (enduseful < slot->offset + slot->size)
enduseful = slot->offset + slot->size;
}
results->ulr_endoff = roundup(enduseful, dirblksiz);
#if 0 /* commented out by dbj. none of the on disk fields changed */
ip->i_flag |= IN_CHANGE | IN_UPDATE;
#endif
return EJUSTRETURN;
}
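/*
 * Worked example (illustrative, not part of the original source):
 * suppose dirblksiz is 512 and the scan left slot.status == COMPACT
 * with slot.offset == 1024 and slot.size == 40.  For a CREATE,
 * slot_estimate() sets ulr_offset = 1024 and ulr_count = 40, telling
 * ufs_direnter() to compact the 40 bytes at offset 1024 and place the
 * new entry in the space that frees up; ulr_endoff is then rounded up
 * to the next 512-byte boundary past the last useful entry so the
 * directory can later be truncated to that point.
 */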
/*
* Check if we can delete inode tdp in directory vdp with inode ip and creds.
*/
static int
ufs_can_delete(struct vnode *tdp, struct vnode *vdp, struct inode *ip,
kauth_cred_t cred)
{
int error;
#ifdef UFS_ACL
/*
* NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt
*
* 3.16.2.1. ACE4_DELETE vs. ACE4_DELETE_CHILD
*/
/*
* XXX: Is this check required?
*/
error = VOP_ACCESS(vdp, VEXEC, cred);
if (error)
goto out;
#if 0
/* Moved to ufs_remove, ufs_rmdir because they hold the lock */
error = VOP_ACCESSX(tdp, VDELETE, cred);
if (error == 0)
return (0);
#endif
error = VOP_ACCESSX(vdp, VDELETE_CHILD, cred);
if (error == 0)
return (0);
error = VOP_ACCESSX(vdp, VEXPLICIT_DENY | VDELETE_CHILD, cred);
if (error)
goto out;
#endif /* !UFS_ACL */
/*
* Write access to directory required to delete files.
*/
error = VOP_ACCESS(vdp, VWRITE, cred);
if (error)
goto out;
if (!(ip->i_mode & ISVTX))
return 0;
/*
* If directory is "sticky", then user must own
* the directory, or the file in it, else she
* may not delete it (unless she's root). This
* implements append-only directories.
*/
error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, tdp, vdp,
genfs_can_sticky(vdp, cred, ip->i_uid, VTOI(tdp)->i_uid));
if (error) {
error = EPERM; // Why override?
goto out;
}
return 0;
out:
vrele(tdp);
return error;
}
static int
ufs_getino(struct vnode *vdp, struct inode *ip, ino_t foundino,
struct vnode **tdp, bool same)
{
if (ip->i_number == foundino) {
if (same)
return EISDIR;
vref(vdp);
*tdp = vdp;
return 0;
}
return vcache_get(vdp->v_mount, &foundino, sizeof(foundino), tdp);
}
/*
* Convert a component of a pathname into a pointer to a locked inode.
* This is a very central and rather complicated routine.
* If the file system is not maintained in a strict tree hierarchy,
* this can result in a deadlock situation (see comments in code below).
*
* The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending
* on whether the name is to be looked up, created, renamed, or deleted.
* When CREATE, RENAME, or DELETE is specified, information usable in
* creating, renaming, or deleting a directory entry may be calculated.
* If flag has LOCKPARENT or'ed into it and the target of the pathname
* exists, lookup returns both the target and its parent directory locked.
* When creating or renaming and LOCKPARENT is specified, the target may
* not be ".". When deleting and LOCKPARENT is specified, the target may
* be "."., but the caller must check to ensure it does an vrele and vput
* instead of two vputs.
*
* Overall outline of ufs_lookup:
*
* check accessibility of directory
* look for name in cache, if found, then if at end of path
* and deleting or creating, drop it, else return name
* search for name in directory, to found or notfound
* notfound:
* if creating, return locked directory, leaving info on available slots
* else return error
* found:
* if at end of path and deleting, return information to allow delete
* if at end of path and rewriting (RENAME and LOCKPARENT), lock target
* inode and return info to allow rewrite
* if not at end, add name to cache; if at end and neither creating
* nor deleting, add name to cache
*/
int
ufs_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap = v;
struct vnode *vdp = ap->a_dvp; /* vnode for directory being searched */
struct inode *dp = VTOI(vdp); /* inode for directory being searched */
struct buf *bp; /* a buffer of directory entries */
struct direct *ep; /* the current directory entry */
int entryoffsetinblock; /* offset of ep in bp's buffer */
struct slotinfo slot;
int numdirpasses; /* strategy for directory search */
doff_t endsearch; /* offset to end directory search */
doff_t prevoff; /* previous value of ulr_offset */
struct vnode *tdp; /* returned by vcache_get */
doff_t enduseful; /* pointer past last used dir slot.
used for directory truncation. */
u_long bmask; /* block offset mask */
int error;
struct vnode **vpp = ap->a_vpp;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
int flags;
int nameiop = cnp->cn_nameiop;
struct ufsmount *ump = dp->i_ump;
const int needswap = UFS_MPNEEDSWAP(ump);
int dirblksiz = ump->um_dirblksiz;
ino_t foundino;
struct ufs_lookup_results *results;
int iswhiteout; /* temp result from cache_lookup() */
const int fsfmt = FSFMT(vdp);
uint16_t reclen;
flags = cnp->cn_flags;
bp = NULL;
*vpp = NULL;
endsearch = 0; /* silence compiler warning */
/*
* Check accessibility of directory.
*/
if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0)
return (error);
if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) &&
(nameiop == DELETE || nameiop == RENAME))
return (EROFS);
/*
* We now have a segment name to search for, and a directory to search.
*
* Before tediously performing a linear scan of the directory,
* check the name cache to see if the directory/name pair
* we are looking for is known already.
*/
if (cache_lookup(vdp, cnp->cn_nameptr, cnp->cn_namelen,
cnp->cn_nameiop, cnp->cn_flags, &iswhiteout, vpp)) {
if (iswhiteout) {
cnp->cn_flags |= ISWHITEOUT;
}
return *vpp == NULLVP ? ENOENT : 0;
}
/* May need to restart the lookup with an exclusive lock. */
if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) {
return ENOLCK;
}
/*
* Produce the auxiliary lookup results into i_crap. Increment
* its serial number so elsewhere we can tell if we're using
* stale results. This should not be done this way. XXX.
*/
results = &dp->i_crap;
dp->i_crapcounter++;
if (iswhiteout) {
/*
* The namecache set iswhiteout without finding a
* cache entry. As of this writing (20121014), this
* can happen if there was a whiteout entry that has
* been invalidated by the lookup. It is not clear if
* it is correct to set ISWHITEOUT in this case or
* not; however, doing so retains the prior behavior,
* so we'll go with that until some clearer answer
* appears. XXX
*/
cnp->cn_flags |= ISWHITEOUT;
}
/*
* Suppress search for slots unless creating
* file and at end of pathname, in which case
* we watch for a place to put the new file in
* case it doesn't already exist.
*/
slot_init(&slot);
if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) {
slot.status = NONE;
slot.needed = UFS_DIRECTSIZ(cnp->cn_namelen);
}
/*
* If there is cached information on a previous search of
* this directory, pick up where we last left off.
* We cache only lookups as these are the most common
* and have the greatest payoff. Caching CREATE has little
* benefit as it usually must search the entire directory
* to determine that the entry does not exist. Caching the
* location of the last DELETE or RENAME has not reduced
* profiling time and hence has been removed in the interest
* of simplicity.
*/
bmask = vdp->v_mount->mnt_stat.f_iosize - 1;
#ifdef UFS_DIRHASH
/*
* Use dirhash for fast operations on large directories. The logic
* to determine whether to hash the directory is contained within
* ufsdirhash_build(); a zero return means that it decided to hash
* this directory and it successfully built up the hash table.
*/
if (ufsdirhash_build(dp) == 0) {
/* Look for a free slot if needed. */
enduseful = slot_findfree(&slot, dp);
/* Look up the component. */
numdirpasses = 1;
entryoffsetinblock = 0; /* silence compiler warning */
switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen,
&results->ulr_offset, &bp,
nameiop == DELETE ? &prevoff : NULL)) {
case 0:
ep = (void *)((char *)bp->b_data +
(results->ulr_offset & bmask));
reclen = ufs_rw16(ep->d_reclen, needswap);
goto foundentry;
case ENOENT:
results->ulr_offset = roundup(dp->i_size, dirblksiz);
goto notfound;
default:
/* Something failed; just do a linear search. */
break;
}
}
#endif /* UFS_DIRHASH */
if (nameiop != LOOKUP || results->ulr_diroff == 0 ||
results->ulr_diroff >= dp->i_size) {
entryoffsetinblock = 0;
results->ulr_offset = 0;
numdirpasses = 1;
} else {
results->ulr_offset = results->ulr_diroff;
entryoffsetinblock = results->ulr_offset & bmask;
if (entryoffsetinblock != 0 &&
(error = ufs_blkatoff(vdp, (off_t)results->ulr_offset,
NULL, &bp, false)))
goto out;
numdirpasses = 2;
namecache_count_2passes();
}
prevoff = results->ulr_offset;
endsearch = roundup(dp->i_size, dirblksiz);
enduseful = 0;
searchloop:
while (results->ulr_offset < endsearch) {
preempt_point();
/*
* If necessary, get the next directory block.
*/
if ((results->ulr_offset & bmask) == 0) {
if (bp != NULL)
brelse(bp, 0);
error = ufs_blkatoff(vdp, (off_t)results->ulr_offset,
NULL, &bp, false);
if (error)
goto out;
entryoffsetinblock = 0;
}
/*
* If still looking for a slot, and at a DIRBLKSIZ
* boundary, have to start looking for free space again.
*/
if (slot.status == NONE &&
(entryoffsetinblock & (dirblksiz - 1)) == 0) {
slot.offset = -1;
slot.freespace = 0;
}
/*
* Get pointer to next entry.
* Full validation checks are slow, so we only check
* enough to ensure forward progress through the
* directory. Complete checks can be run by patching
* "dirchk" to be true.
*/
KASSERT(bp != NULL);
ep = (void *)((char *)bp->b_data + entryoffsetinblock);
const char *msg;
reclen = ufs_rw16(ep->d_reclen, needswap);
if ((reclen == 0 && (msg = "null entry")) || (dirchk &&
(msg = ufs_dirbadentry(vdp, ep, entryoffsetinblock)))) {
ufs_dirbad(dp, results->ulr_offset, msg);
reclen = dirblksiz -
(entryoffsetinblock & (dirblksiz - 1));
goto next;
}
/*
* If an appropriate sized slot has not yet been found,
* check to see if one is available. Also accumulate space
* in the current block so that we can determine if
* compaction is viable.
*/
if (slot.status != FOUND) {
int size = reclen;
if (ep->d_ino != 0)
size -= UFS_DIRSIZ(fsfmt, ep, needswap);
if (size > 0)
slot_update(&slot, size, reclen,
results->ulr_offset);
}
if (ep->d_ino == 0)
goto next;
/*
* Check for a name match.
*/
const uint16_t namlen = NAMLEN(fsfmt, needswap, ep);
if (namlen != cnp->cn_namelen ||
memcmp(cnp->cn_nameptr, ep->d_name, (size_t)namlen))
goto next;
#ifdef UFS_DIRHASH
foundentry:
#endif
/*
* Save directory entry's inode number and
* reclen, and release directory buffer.
*/
if (!fsfmt && ep->d_type == DT_WHT) {
slot_white(&slot, reclen, results);
/*
* This is used to set results->ulr_endoff, which may
* be used by ufs_direnter() as a length to truncate
* the directory to. Therefore, it must point past the
* end of the last non-empty directory entry. We don't
* know where that is in this case, so we effectively
* disable shrinking by using the existing size of the
* directory.
*
* Note that we wouldn't expect to shrink the
* directory while rewriting an existing entry anyway.
*/
enduseful = endsearch;
cnp->cn_flags |= ISWHITEOUT;
numdirpasses--;
goto notfound;
}
foundino = ufs_rw32(ep->d_ino, needswap);
results->ulr_reclen = reclen;
goto found;
next:
prevoff = results->ulr_offset;
results->ulr_offset += reclen;
entryoffsetinblock += reclen;
if (ep->d_ino)
enduseful = results->ulr_offset;
}
notfound:
/*
* If we started in the middle of the directory and failed
* to find our target, we must check the beginning as well.
*/
if (numdirpasses == 2) {
numdirpasses--;
results->ulr_offset = 0;
endsearch = results->ulr_diroff;
goto searchloop;
}
if (bp != NULL)
brelse(bp, 0);
/*
* If creating, and at end of pathname and current
* directory has not been removed, then can consider
* allowing file to be created.
*/
if ((nameiop == CREATE || nameiop == RENAME ||
(nameiop == DELETE && (cnp->cn_flags & DOWHITEOUT) &&
(cnp->cn_flags & ISWHITEOUT))) &&
(flags & ISLASTCN) && dp->i_nlink != 0) {
/*
* Access for write is interpreted as allowing
* creation of files in the directory.
*/
if (flags & WILLBEDIR)
error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred);
else
error = VOP_ACCESS(vdp, VWRITE, cred);
if (error)
goto out;
error = slot_estimate(&slot, dirblksiz, nameiop,
prevoff, enduseful, dp, results);
/*
* We return with the directory locked, so that
* the parameters we set up above will still be
* valid if we actually decide to do a direnter().
* We return ni_vp == NULL to indicate that the entry
* does not currently exist; we leave a pointer to
* the (locked) directory inode in ndp->ni_dvp.
*
* NB - if the directory is unlocked, then this
* information cannot be used.
*/
goto out;
}
/*
* Insert name into cache (as non-existent) if appropriate.
*/
if (nameiop != CREATE) {
cache_enter(vdp, *vpp, cnp->cn_nameptr, cnp->cn_namelen,
cnp->cn_flags);
}
error = ENOENT;
goto out;
found:
if (numdirpasses == 2)
namecache_count_pass2();
/*
* Check that directory length properly reflects presence
* of this entry.
*/
const uint64_t newisize =
results->ulr_offset + UFS_DIRSIZ(fsfmt, ep, needswap);
if (newisize > dp->i_size) {
ufs_dirbad(dp, results->ulr_offset, "i_size too small");
dp->i_size = newisize;
DIP_ASSIGN(dp, size, dp->i_size);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
}
brelse(bp, 0);
/*
* Found component in pathname.
* If the final component of path name, save information
* in the cache as to where the entry was found.
*/
if ((flags & ISLASTCN) && nameiop == LOOKUP)
results->ulr_diroff = results->ulr_offset & ~(dirblksiz - 1);
/*
* If deleting, and at end of pathname, return
* parameters which can be used to remove file.
* Lock the inode, being careful with ".".
*/
if (nameiop == DELETE && (flags & ISLASTCN)) {
/*
* Return pointer to current entry in results->ulr_offset,
* and distance past previous entry (if there
* is a previous entry in this block) in results->ulr_count.
* Save directory inode pointer in ndp->ni_dvp for dirremove().
*/
calc_count(results, dirblksiz, prevoff);
if ((error = ufs_getino(vdp, dp, foundino, &tdp, false)) != 0)
goto out;
if ((error = ufs_can_delete(tdp, vdp, dp, cred)) != 0)
goto out;
*vpp = tdp;
goto out;
}
/*
* If rewriting (RENAME), return the inode and the
* information required to rewrite the present directory
* Must get inode of directory entry to verify it's a
* regular file, or empty directory.
*/
if (nameiop == RENAME && (flags & ISLASTCN)) {
if (flags & WILLBEDIR)
error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred);
else
error = VOP_ACCESS(vdp, VWRITE, cred);
if (error)
goto out;
/*
* Careful about locking second inode.
* This can only occur if the target is ".".
*/
if ((error = ufs_getino(vdp, dp, foundino, &tdp, true)) != 0)
goto out;
*vpp = tdp;
goto out;
}
if ((error = ufs_getino(vdp, dp, foundino, &tdp, false)) != 0)
goto out;
*vpp = tdp;
/*
* Insert name into cache if appropriate.
*/
cache_enter(vdp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags);
error = 0;
out:
return error;
}
void
ufs_dirbad(struct inode *ip, doff_t offset, const char *how)
{
struct mount *mp = ITOV(ip)->v_mount;
void (*p)(const char *, ...) __printflike(1, 2) =
(mp->mnt_flag & MNT_RDONLY) == 0 ? panic : printf;
(*p)("%s: bad dir ino %ju at offset %d: %s\n",
mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number,
offset, how);
}
/*
* Do consistency checking on a directory entry:
* record length must be multiple of 4
* entry must fit in rest of its DIRBLKSIZ block
* record must be large enough to contain entry
* name is not longer than FFS_MAXNAMLEN
* name must be as long as advertised, and null terminated
*/
const char *
ufs_dirbadentry(const struct vnode *dp, const struct direct *ep,
int entryoffsetinblock)
{
const struct ufsmount *ump = VFSTOUFS(dp->v_mount);
const int needswap = UFS_MPNEEDSWAP(ump);
const int dirblksiz = ump->um_dirblksiz;
const int maxsize = dirblksiz - (entryoffsetinblock & (dirblksiz - 1));
const int fsfmt = FSFMT(dp);
const uint8_t namlen = NAMLEN(fsfmt, needswap, ep);
const uint16_t reclen = ufs_rw16(ep->d_reclen, needswap);
const int dirsiz = (int)UFS_DIRSIZ(fsfmt, ep, needswap);
const char *name = ep->d_name;
const char *str;
#ifdef DIAGNOSTIC
static char buf[512];
#endif
if ((reclen & 0x3) != 0)
str = "not rounded";
else if (reclen > maxsize)
str = "too big";
else if (reclen < dirsiz)
str = "too small";
#if FFS_MAXNAMLEN < 255
else if (namlen > FFS_MAXNAMLEN)
str = "long name";
#endif
else
str = NULL;
if (str) {
#ifdef DIAGNOSTIC
snprintf(buf, sizeof(buf), "Bad dir (%s), reclen=%#x, "
"namlen=%d, dirsiz=%d <= reclen=%d <= maxsize=%d, "
"flags=%#x, entryoffsetinblock=%d, dirblksiz=%d",
str, reclen, namlen, dirsiz, reclen, maxsize,
dp->v_mount->mnt_flag, entryoffsetinblock, dirblksiz);
str = buf;
#endif
return str;
}
if (ep->d_ino == 0)
return NULL;
for (uint8_t i = 0; i < namlen; i++)
if (name[i] == '\0') {
str = "NUL in name";
#ifdef DIAGNOSTIC
snprintf(buf, sizeof(buf), "%s [%s] i=%d, namlen=%d",
str, name, i, namlen);
str = buf;
#endif
return str;
}
if (name[namlen]) {
str = "missing NUL in name";
#ifdef DIAGNOSTIC
snprintf(buf, sizeof(buf), "%s [%*.*s] namlen=%d", str,
namlen, namlen, name, namlen);
str = buf;
#endif
return str;
}
return NULL;
}
/*
* Construct a new directory entry after a call to namei, using the
* name in the componentname argument cnp. The argument ip is the
* inode to which the new directory entry will refer.
*/
void
ufs_makedirentry(struct inode *ip, struct componentname *cnp,
struct direct *newdirp)
{
size_t namelen = cnp->cn_namelen;
newdirp->d_ino = ip->i_number;
newdirp->d_namlen = namelen;
memcpy(newdirp->d_name, cnp->cn_nameptr, namelen);
/* NUL terminate and zero out padding */
memset(&newdirp->d_name[namelen], 0, UFS_NAMEPAD(namelen));
if (FSFMT(ITOV(ip)))
newdirp->d_type = 0;
else
newdirp->d_type = IFTODT(ip->i_mode);
}
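/*
 * Illustrative sketch (not part of the original source): a typical
 * caller on the create path fills in a struct direct with
 * ufs_makedirentry() and hands it to ufs_direnter() together with the
 * lookup results saved by ufs_lookup(), roughly:
 *
 *	struct direct newdir;
 *	ufs_makedirentry(ip, cnp, &newdir);
 *	error = ufs_direnter(dvp, ulr, tvp, &newdir, cnp, NULL);
 *
 * where ip is the inode being linked, dvp the directory vnode and ulr
 * the ufs_lookup_results from the preceding lookup.
 */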
static int
ufs_dirgrow(struct vnode *dvp, const struct ufs_lookup_results *ulr,
struct vnode *tvp, struct direct *dirp,
struct componentname *cnp, struct buf *newdirbp)
{
const kauth_cred_t cr = cnp->cn_cred;
const struct ufsmount *ump = VFSTOUFS(dvp->v_mount);
const int needswap = UFS_MPNEEDSWAP(ump);
const int dirblksiz = ump->um_dirblksiz;
const int fsfmt = FSFMT(dvp);
const u_int newentrysize = UFS_DIRSIZ(0, dirp, 0);
struct inode *dp = VTOI(dvp);
int error, ret, blkoff;
struct timespec ts;
struct buf *bp;
/*
* If ulr_count is 0, then namei could find no
* space in the directory. Here, ulr_offset will
* be on a directory block boundary and we will write the
* new entry into a fresh block.
*/
if (ulr->ulr_offset & (dirblksiz - 1))
panic("%s: newblk", __func__); if ((error = UFS_BALLOC(dvp, (off_t)ulr->ulr_offset, dirblksiz,
cr, B_CLRBUF | B_SYNC, &bp)) != 0) {
return error;
}
dp->i_size = ulr->ulr_offset + dirblksiz;
DIP_ASSIGN(dp, size, dp->i_size);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
uvm_vnp_setsize(dvp, dp->i_size);
dirp->d_reclen = ufs_rw16(dirblksiz, needswap);
dirp->d_ino = ufs_rw32(dirp->d_ino, needswap);
if (fsfmt && ENDIANSWAP(needswap))
ufs_dirswap(dirp);
blkoff = ulr->ulr_offset & (ump->um_mountp->mnt_stat.f_iosize - 1);
memcpy((char *)bp->b_data + blkoff, dirp, newentrysize);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL) {
ufsdirhash_newblk(dp, ulr->ulr_offset);
ufsdirhash_add(dp, dirp, ulr->ulr_offset);
ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff,
ulr->ulr_offset);
}
#endif
error = VOP_BWRITE(bp->b_vp, bp);
vfs_timestamp(&ts);
ret = UFS_UPDATE(dvp, &ts, &ts, UPDATE_DIROP);
if (error == 0)
return ret;
return error;
}
static int
#if __GNUC_PREREQ__(5, 3)
/* This gets miscompiled by gcc 5.3 PR/51094 */
__attribute__((__optimize__("no-tree-vrp")))
#endif
ufs_dircompact(struct vnode *dvp, const struct ufs_lookup_results *ulr,
struct vnode *tvp, struct direct *dirp,
struct componentname *cnp, struct buf *newdirbp)
{
const struct ufsmount *ump = VFSTOUFS(dvp->v_mount);
const int needswap = UFS_MPNEEDSWAP(ump);
const int fsfmt = FSFMT(dvp);
const u_int newentrysize = UFS_DIRSIZ(0, dirp, 0);
struct inode *dp = VTOI(dvp);
struct buf *bp;
u_int dsize;
struct direct *ep, *nep;
int error, loc, spacefree;
char *dirbuf;
uint16_t reclen;
UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount);
/*
* If ulr_count is non-zero, then namei found space for the new
* entry in the range ulr_offset to ulr_offset + ulr_count
* in the directory. To use this space, we may have to compact
* the entries located there, by copying them together towards the
* beginning of the block, leaving the free space in one usable
* chunk at the end.
*/
/*
* Increase size of directory if entry eats into new space.
* This should never push the size past a new multiple of
* DIRBLKSIZ.
*
* N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
*/
if (ulr->ulr_offset + ulr->ulr_count > dp->i_size) {
#ifdef DIAGNOSTIC
printf("%s: reached 4.2-only block, not supposed to happen\n",
__func__);
#endif
dp->i_size = ulr->ulr_offset + ulr->ulr_count;
DIP_ASSIGN(dp, size, dp->i_size);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
}
/*
* Get the block containing the space for the new directory entry.
*/
error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp, true);
if (error)
return error;
/*
* Find space for the new entry. In the simple case, the entry at
* offset base will have the space. If it does not, then namei
* arranged that compacting the region ulr_offset to
* ulr_offset + ulr_count would yield the space.
*/
ep = (void *)dirbuf;
dsize = (ep->d_ino != 0) ? UFS_DIRSIZ(fsfmt, ep, needswap) : 0;
reclen = ufs_rw16(ep->d_reclen, needswap);
spacefree = reclen - dsize;
for (loc = reclen; loc < ulr->ulr_count; ) {
nep = (void *)(dirbuf + loc);
/* Trim the existing slot (NB: dsize may be zero). */
ep->d_reclen = ufs_rw16(dsize, needswap);
ep = (void *)((char *)ep + dsize);
reclen = ufs_rw16(nep->d_reclen, needswap);
loc += reclen;
if (nep->d_ino == 0) {
/*
* A mid-block unused entry. Such entries are
* never created by the kernel, but fsck_ffs
* can create them (and it doesn't fix them).
*
* Add up the free space, and initialise the
* relocated entry since we don't memcpy it.
*/
spacefree += reclen;
ep->d_ino = 0;
dsize = 0;
continue;
}
dsize = UFS_DIRSIZ(fsfmt, nep, needswap);
spacefree += reclen - dsize;
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_move(dp, nep,
ulr->ulr_offset + ((char *)nep - dirbuf),
ulr->ulr_offset + ((char *)ep - dirbuf));
#endif
memcpy(ep, nep, dsize);
}
/*
* Here, `ep' points to a directory entry containing `dsize' in-use
* bytes followed by `spacefree' unused bytes. If ep->d_ino == 0,
* then the entry is completely unused (dsize == 0). The value
* of ep->d_reclen is always indeterminate.
*
* Update the pointer fields in the previous entry (if any),
* copy in the new entry, and write out the block.
*/
if (ep->d_ino == 0 || (ufs_rw32(ep->d_ino, needswap) == UFS_WINO &&
memcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) {
if (spacefree + dsize < newentrysize)
panic("%s: too big", __func__); dirp->d_reclen = spacefree + dsize;
} else {
if (spacefree < newentrysize)
panic("%s: nospace", __func__);
dirp->d_reclen = spacefree;
ep->d_reclen = ufs_rw16(dsize, needswap);
ep = (void *)((char *)ep + dsize);
}
dirp->d_reclen = ufs_rw16(dirp->d_reclen, needswap);
dirp->d_ino = ufs_rw32(dirp->d_ino, needswap);
if (fsfmt && ENDIANSWAP(needswap))
ufs_dirswap(dirp);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL && (ep->d_ino == 0 ||
dirp->d_reclen == spacefree))
ufsdirhash_add(dp, dirp, ulr->ulr_offset + ((char *)ep - dirbuf));
#endif
memcpy(ep, dirp, newentrysize);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL) {
const int dirblkmsk = ump->um_dirblksiz - 1;
ufsdirhash_checkblock(dp, dirbuf -
(ulr->ulr_offset & dirblkmsk),
ulr->ulr_offset & ~dirblkmsk);
}
#endif
error = VOP_BWRITE(bp->b_vp, bp);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* If all went well, and the directory can be shortened, proceed
* with the truncation. Note that we have to unlock the inode for
* the entry that we just entered, as the truncation may need to
* lock other inodes which can lead to deadlock if we also hold a
* lock on the newly entered node.
*/
if (error == 0 && ulr->ulr_endoff && ulr->ulr_endoff < dp->i_size) {
const kauth_cred_t cr = cnp->cn_cred;
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_dirtrunc(dp, ulr->ulr_endoff);
#endif
(void) UFS_TRUNCATE(dvp, (off_t)ulr->ulr_endoff, IO_SYNC, cr);
}
UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
return error;
}
/*
* Write a directory entry after a call to namei, using the parameters
* that ufs_lookup left in nameidata and in the ufs_lookup_results.
*
* DVP is the directory to be updated. It must be locked.
* ULR is the ufs_lookup_results structure from the final lookup step.
* TVP is not used. (XXX: why is it here? remove it)
* DIRP is the new directory entry contents.
* CNP is the componentname from the final lookup step.
* NEWDIRBP is not used and (XXX) should be removed. The previous
* comment here said it was used by the now-removed softupdates code.
*
* The link count of the target inode is *not* incremented; the
* caller does that.
*
* If ulr->ulr_count is 0, ufs_lookup did not find space to insert the
* directory entry. ulr_offset, which is the place to put the entry,
* should be on a block boundary (and should be at the end of the
* directory AFAIK) and a fresh block is allocated to put the new
* directory entry in.
*
* If ulr->ulr_count is not zero, ufs_lookup found a slot to insert
* the entry into. This slot ranges from ulr_offset to ulr_offset +
* ulr_count. However, this slot may already be partially populated
* requiring compaction. See notes below.
*
* Furthermore, if ulr_count is not zero and ulr_endoff is not the
* same as i_size, the directory is truncated to size ulr_endoff.
*/
int
ufs_direnter(struct vnode *dvp, const struct ufs_lookup_results *ulr,
struct vnode *tvp, struct direct *dirp,
struct componentname *cnp, struct buf *newdirbp)
{
if (ulr->ulr_count == 0)
return ufs_dirgrow(dvp, ulr, tvp, dirp, cnp, newdirbp);
else
return ufs_dircompact(dvp, ulr, tvp, dirp, cnp, newdirbp);
}
/*
* Remove a directory entry after a call to namei, using the
* parameters that ufs_lookup left in nameidata and in the
* ufs_lookup_results.
*
* DVP is the directory to be updated. It must be locked.
* ULR is the ufs_lookup_results structure from the final lookup step.
* IP, if not null, is the inode being unlinked.
* FLAGS may contain DOWHITEOUT.
* ISRMDIR is not used and (XXX) should be removed.
*
* If FLAGS contains DOWHITEOUT the entry is replaced with a whiteout
* instead of being cleared.
*
* ulr->ulr_offset contains the position of the directory entry
* to be removed.
*
* ulr->ulr_reclen contains the size of the directory entry to be
* removed.
*
* ulr->ulr_count contains the size of the *previous* directory
* entry. This allows finding it, for free space management. If
* ulr_count is 0, the target entry is at the beginning of the
* directory. (Does this ever happen? The first entry should be ".",
* which should only be removed at rmdir time. Does rmdir come here
* to clear out the "." and ".." entries? Perhaps, but I doubt it.)
*
* The space is marked free by adding it to the record length (not
* name length) of the preceding entry. If the first entry becomes
* free, it is marked free by setting the inode number to 0.
*
* The link count of IP is decremented. Note that this is not the
* inverse behavior of ufs_direnter, which does not adjust link
* counts. Sigh.
*/
int
ufs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr,
struct inode *ip, int flags, int isrmdir)
{
struct inode *dp = VTOI(dvp);
struct direct *ep;
struct buf *bp;
int error;
const int needswap = UFS_MPNEEDSWAP(dp->i_ump);
uint16_t reclen;
UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount);
if (flags & DOWHITEOUT) {
/*
* Whiteout entry: set d_ino to UFS_WINO.
*/
error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &ep,
&bp, true);
if (error)
return (error);
ep->d_ino = ufs_rw32(UFS_WINO, needswap);
ep->d_type = DT_WHT;
goto out;
}
if ((error = ufs_blkatoff(dvp,
(off_t)(ulr->ulr_offset - ulr->ulr_count), &ep, &bp, true)) != 0)
return (error);
reclen = ufs_rw16(ep->d_reclen, needswap);
#ifdef UFS_DIRHASH
/*
* Remove the dirhash entry. This is complicated by the fact
* that `ep' is the previous entry when ulr_count != 0.
*/
if (dp->i_dirhash != NULL)
ufsdirhash_remove(dp, (ulr->ulr_count == 0) ? ep :
(void *)((char *)ep + reclen), ulr->ulr_offset);
#endif
if (ulr->ulr_count == 0) {
/*
* First entry in block: set d_ino to zero.
*/
ep->d_ino = 0;
} else {
/*
* Collapse new free space into previous entry.
*/
ep->d_reclen = ufs_rw16(reclen + ulr->ulr_reclen, needswap);
}
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL) {
int dirblksiz = ip->i_ump->um_dirblksiz;
ufsdirhash_checkblock(dp, (char *)ep -
((ulr->ulr_offset - ulr->ulr_count) & (dirblksiz - 1)),
ulr->ulr_offset & ~(dirblksiz - 1));
}
#endif
out:
if (ip) {
ip->i_nlink--;
DIP_ASSIGN(ip, nlink, ip->i_nlink);
ip->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(ITOV(ip), NULL, NULL, 0);
}
/*
* XXX did it ever occur to anyone that it might be a good
* idea to restore ip->i_nlink if this fails? Or something?
* Currently on error return from this function the state of
* ip->i_nlink depends on what happened, and callers
* definitely do not take this into account.
*/
error = VOP_BWRITE(bp->b_vp, bp);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* If the last named reference to a snapshot goes away,
* drop its snapshot reference so that it will be reclaimed
* when last open reference goes away.
*/
if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 &&
ip->i_nlink == 0)
UFS_SNAPGONE(ITOV(ip));
UFS_WAPBL_UPDATE(dvp, NULL, NULL, 0);
return (error);
}
/*
* Rewrite an existing directory entry to point at the inode supplied.
*
* DP is the directory to update.
* OFFSET is the position of the entry in question. It may come
* from ulr_offset of a ufs_lookup_results.
* OIP is the old inode the directory previously pointed to.
* NEWINUM is the number of the new inode.
* NEWTYPE is the new value for the type field of the directory entry.
* (This is ignored if the fs doesn't support that.)
* ISRMDIR is not used and (XXX) should be removed.
* IFLAGS are added to DP's inode flags.
*
* The link count of OIP is decremented. Note that the link count of
* the new inode is *not* incremented. Yay for symmetry.
*/
int
ufs_dirrewrite(struct inode *dp, off_t offset,
struct inode *oip, ino_t newinum, int newtype,
int isrmdir, int iflags)
{
struct buf *bp;
struct direct *ep;
struct vnode *vdp = ITOV(dp);
int error;
error = ufs_blkatoff(vdp, offset, &ep, &bp, true);
if (error)
return (error);
ep->d_ino = ufs_rw32(newinum, UFS_MPNEEDSWAP(dp->i_ump));
if (!FSFMT(vdp))
ep->d_type = newtype;
oip->i_nlink--;
DIP_ASSIGN(oip, nlink, oip->i_nlink);
oip->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(ITOV(oip), NULL, NULL, UPDATE_DIROP);
error = VOP_BWRITE(bp->b_vp, bp);
dp->i_flag |= iflags;
/*
* If the last named reference to a snapshot goes away,
* drop its snapshot reference so that it will be reclaimed
* when last open reference goes away.
*/
if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_nlink == 0)
UFS_SNAPGONE(ITOV(oip));
UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
return (error);
}
/*
* Check if a directory is empty or not.
* Inode supplied must be locked.
*
* Using a struct dirtemplate here is not precisely
* what we want, but better than using a struct direct.
*
* NB: does not handle corrupted directories.
*/
int
ufs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred)
{
doff_t off;
struct direct dbuf;
struct direct *dp = &dbuf;
int error;
size_t count;
const int needswap = UFS_IPNEEDSWAP(ip);
const int fsfmt = FSFMT(ITOV(ip));
#define MINDIRSIZ (sizeof (struct dirtemplate) / 2)
for (off = 0; off < ip->i_size; off += ufs_rw16(dp->d_reclen, needswap)) {
error = ufs_bufio(UIO_READ, ITOV(ip), dp, MINDIRSIZ,
off, IO_NODELOCKED, cred, &count, NULL);
/*
* Since we read MINDIRSIZ, residual must
* be 0 unless we're at end of file.
*/
if (error || count != 0)
return (0);
/* avoid infinite loops */
if (dp->d_reclen == 0)
return (0);
/* skip empty entries */
ino_t ino = ufs_rw32(dp->d_ino, needswap);
if (ino == 0 || ino == UFS_WINO)
continue;
/* accept only "." and ".." */
const uint8_t namlen = NAMLEN(fsfmt, needswap, dp);
if (namlen > 2)
return (0);
if (dp->d_name[0] != '.')
return (0);
/*
* At this point namlen must be 1 or 2.
* 1 implies ".", 2 implies ".." if second
* char is also "."
*/
if (namlen == 1 && ino == ip->i_number)
continue;
if (dp->d_name[1] == '.' && ino == parentino)
continue;
return (0);
}
return (1);
}
#define UFS_DIRRABLKS 0
int ufs_dirrablks = UFS_DIRRABLKS;
/*
* ufs_blkatoff: Return buffer with the contents of block "offset" from
* the beginning of directory "vp". If "res" is non-NULL, fill it in with
* a pointer to the remaining space in the directory. If the caller intends
* to modify the buffer returned, "modify" must be true.
*/
int
ufs_blkatoff(struct vnode *vp, off_t offset, void *v, struct buf **bpp,
bool modify)
{
char **res = v;
struct inode *ip __diagused;
struct buf *bp;
daddr_t lbn;
const int dirrablks = ufs_dirrablks;
daddr_t *blks;
int *blksizes;
int run, error;
struct mount *mp = vp->v_mount;
const int bshift = mp->mnt_fs_bshift;
const int bsize = 1 << bshift;
off_t eof;
blks = kmem_alloc((1 + dirrablks) * sizeof(daddr_t), KM_SLEEP);
blksizes = kmem_alloc((1 + dirrablks) * sizeof(int), KM_SLEEP);
ip = VTOI(vp);
KASSERT(vp->v_size == ip->i_size);
GOP_SIZE(vp, vp->v_size, &eof, 0);
lbn = offset >> bshift;
for (run = 0; run <= dirrablks;) {
const off_t curoff = lbn << bshift;
const int size = MIN(eof - curoff, bsize);
if (size == 0) {
break;
}
KASSERT(curoff < eof);
blks[run] = lbn;
blksizes[run] = size;
lbn++;
run++;
if (size != bsize) {
break;
}
}
KASSERT(run >= 1);
error = breadn(vp, blks[0], blksizes[0], &blks[1], &blksizes[1],
run - 1, (modify ? B_MODIFY : 0), &bp);
if (error != 0) {
*bpp = NULL;
goto out;
}
if (res) {
*res = (char *)bp->b_data + (offset & (bsize - 1));
}
*bpp = bp;
out:
kmem_free(blks, (1 + dirrablks) * sizeof(daddr_t));
kmem_free(blksizes, (1 + dirrablks) * sizeof(int));
return error;
}
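/*
 * Illustrative sketch (not part of the original source): callers in
 * this file use ufs_blkatoff() roughly as
 *
 *	struct direct *ep;
 *	struct buf *bp;
 *	error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &ep, &bp, true);
 *	if (error)
 *		return error;
 *	... modify *ep ...
 *	error = VOP_BWRITE(bp->b_vp, bp);
 *
 * i.e. the returned buffer is either written back with VOP_BWRITE()
 * after modification, or released with brelse() when only read.
 */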
/* $NetBSD: exec_elf64.c,v 1.8 2019/11/20 19:37:53 pgoyette Exp $ */
/*
* Copyright (c) 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou
* for the NetBSD Project.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_elf64.c,v 1.8 2019/11/20 19:37:53 pgoyette Exp $");
#define ELFSIZE 64
#include "exec_elf.c"
#include <sys/module.h>
#define ELF64_AUXSIZE (ELF_AUX_ENTRIES * sizeof(Aux64Info) \
+ MAXPATHLEN + ALIGN(1))
MODULE(MODULE_CLASS_EXEC, exec_elf64, NULL);
static struct execsw exec_elf64_execsw[] = {
/* Native Elf64 */
{
.es_hdrsz = sizeof (Elf64_Ehdr),
.es_makecmds = exec_elf64_makecmds,
.u = {
.elf_probe_func = netbsd_elf64_probe,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_FIRST,
.es_arglen = ELF64_AUXSIZE,
.es_copyargs = elf64_copyargs,
.es_setregs = NULL,
.es_coredump = coredump_elf64,
.es_setup_stack = exec_setup_stack,
},
#if EXEC_ELF_NOTELESS
/* Generic Elf64 -- run at NetBSD Elf64 */
{
.es_hdrsz = sizeof (Elf64_Ehdr),
.es_makecmds = exec_elf64_makecmds,
.u = {
.elf_probe_func = NULL,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_ANY,
.es_arglen = ELF64_AUXSIZE,
.es_copyargs = elf64_copyargs,
.es_setregs = NULL,
.es_coredump = coredump_elf64,
.es_setup_stack = exec_setup_stack,
},
#endif
};
static int
exec_elf64_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return exec_add(exec_elf64_execsw,
__arraycount(exec_elf64_execsw));
case MODULE_CMD_FINI:
return exec_remove(exec_elf64_execsw,
__arraycount(exec_elf64_execsw));
default:
return ENOTTY;
}
}
/* $NetBSD: syscallvar.h,v 1.12 2018/04/19 21:19:07 christos Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_SYSCALLVAR_H_
#define _SYS_SYSCALLVAR_H_
#ifndef _KERNEL
#error nothing of interest to userspace here
#endif
#if defined(_KERNEL) && defined(_KERNEL_OPT)
#include "opt_dtrace.h"
#endif
#include <sys/systm.h>
#include <sys/proc.h>
extern struct emul emul_netbsd;
struct syscall_package {
u_short sp_code;
u_short sp_flags;
sy_call_t *sp_call;
};
void syscall_init(void);
int syscall_establish(const struct emul *, const struct syscall_package *);
int syscall_disestablish(const struct emul *, const struct syscall_package *);
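/*
 * Illustrative sketch (not part of the original header): how a kernel
 * module might describe and register a system call through this
 * interface.  The syscall number, handler and array name below are
 * hypothetical; real handlers are generated from syscalls.master.
 */
#if 0
static int
example_sys_call(struct lwp *l, const void *uap, register_t *retval)
{
	*retval = 0;
	return 0;
}

static const struct syscall_package example_syscalls[] = {
	{ 500 /* hypothetical syscall number */, 0,
	  (sy_call_t *)example_sys_call },
	{ 0, 0, NULL },		/* terminator */
};

/* In the module's MODULE_CMD_INIT path: */
error = syscall_establish(&emul_netbsd, example_syscalls);
/* ...and on MODULE_CMD_FINI: */
error = syscall_disestablish(&emul_netbsd, example_syscalls);
#endif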
static __inline int
sy_call(const struct sysent *sy, struct lwp *l, const void *uap,
register_t *rval)
{
int error;
l->l_sysent = sy;
error = (*sy->sy_call)(l, uap, rval);
l->l_sysent = NULL;
return error;
}
static __inline int
sy_invoke(const struct sysent *sy, struct lwp *l, const void *uap,
register_t *rval, int code)
{
const bool do_trace = l->l_proc->p_trace_enabled &&
(sy->sy_flags & SYCALL_INDIRECT) == 0;
int error;
#ifdef KDTRACE_HOOKS
#define KDTRACE_ENTRY(a) (a)
#else
#define KDTRACE_ENTRY(a) (0)
#endif
if (__predict_true(!(do_trace || KDTRACE_ENTRY(sy->sy_entry))) ||
(error = trace_enter(code, sy, uap)) == 0) {
rval[0] = 0;
#if !defined(__mips__) && !defined(__m68k__)
/*
* Due to the mips userland code for SYS_break needing v1 to be
* preserved, we can't clear this on mips.
*/
rval[1] = 0;
#endif
error = sy_call(sy, l, uap, rval);
}
if (__predict_false(do_trace || KDTRACE_ENTRY(sy->sy_return))) {
trace_exit(code, sy, uap, rval, error);
}
return error;
}
/* inclusion in the kernel currently depends on SYSCALL_DEBUG */
extern const char * const syscallnames[];
extern const char * const altsyscallnames[];
#endif /* _SYS_SYSCALLVAR_H_ */
/* $NetBSD: kern_uipc_socket_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $ */
/*
* Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2004 The FreeBSD Foundation
* Copyright (c) 2004 Robert Watson
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95
*/
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vn.c 1.13 94/04/02$
*
* @(#)vn.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_uipc_socket_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/compat_stub.h>
#include <sys/socketvar.h>
#include <compat/sys/time.h>
#include <compat/sys/socket.h>
#include <compat/common/compat_mod.h>
static int
uipc_socket_50_getopt1(int opt, struct socket *so, struct sockopt *sopt)
{
int optval, error;
struct timeval50 otv;
switch (opt) {
case SO_OSNDTIMEO:
case SO_ORCVTIMEO:
optval = (opt == SO_OSNDTIMEO ?
so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
otv.tv_sec = optval / hz;
otv.tv_usec = (optval % hz) * tick;
error = sockopt_set(sopt, &otv, sizeof(otv));
break;
case SO_OTIMESTAMP:
error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0);
break;
default:
error = EPASSTHROUGH;
}
return error;
}
static int
uipc_socket_50_setopt1(int opt, struct socket *so, const struct sockopt *sopt)
{
int optval, error;
struct timeval50 otv;
struct timeval tv;
switch (opt) {
case SO_OSNDTIMEO:
case SO_ORCVTIMEO:
solock(so);
error = sockopt_get(sopt, &otv, sizeof(otv));
if (error)
break;
timeval50_to_timeval(&otv, &tv);
/* Code duplicated from sys/kern/uipc_socket.c */
if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
error = EDOM;
break;
}
if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) {
error = EDOM;
break;
}
optval = tv.tv_sec * hz + tv.tv_usec / tick;
if (optval == 0 && tv.tv_usec != 0)
optval = 1;
switch (opt) {
case SO_OSNDTIMEO:
so->so_snd.sb_timeo = optval;
break;
case SO_ORCVTIMEO:
so->so_rcv.sb_timeo = optval;
break;
}
break;
case SO_OTIMESTAMP:
error = sockopt_getint(sopt, &optval);
solock(so);
if (error)
break;
if (optval)
so->so_options |= opt;
else
so->so_options &= ~opt;
break;
default:
error = EPASSTHROUGH;
}
return error;
}
static int
uipc_socket_50_sbts(int opt, struct mbuf ***mp)
{
struct timeval50 tv50;
struct timeval tv;
microtime(&tv);
if (opt & SO_OTIMESTAMP) {
timeval_to_timeval50(&tv, &tv50);
**mp = sbcreatecontrol(&tv50, sizeof(tv50), SCM_OTIMESTAMP,
SOL_SOCKET);
if (**mp)
*mp = &(**mp)->m_next;
return 0;
} else
return EPASSTHROUGH;
}
void
kern_uipc_socket_50_init(void)
{
MODULE_HOOK_SET(uipc_socket_50_setopt1_hook, uipc_socket_50_setopt1);
MODULE_HOOK_SET(uipc_socket_50_getopt1_hook, uipc_socket_50_getopt1);
MODULE_HOOK_SET(uipc_socket_50_sbts_hook, uipc_socket_50_sbts);
}
void
kern_uipc_socket_50_fini(void)
{
MODULE_HOOK_UNSET(uipc_socket_50_setopt1_hook);
MODULE_HOOK_UNSET(uipc_socket_50_getopt1_hook);
MODULE_HOOK_UNSET(uipc_socket_50_sbts_hook);
}
/* $NetBSD: vfs_init.c,v 1.64 2023/09/23 18:21:11 ad Exp $ */
/*-
* Copyright (c) 1998, 2000, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed
* to Berkeley by John Heidemann of the UCLA Ficus project.
*
* Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_init.c 8.5 (Berkeley) 5/11/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_init.c,v 1.64 2023/09/23 18:21:11 ad Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/buf.h>
#include <sys/dirhash.h>
#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/ucred.h>
#include <sys/vnode.h>
#include <sys/vnode_impl.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/fifofs/fifo.h>
#include <miscfs/specfs/specdev.h>
/*
* Sigh, such primitive tools are these...
*/
#if 0
#define DODEBUG(A) A
#else
#define DODEBUG(A)
#endif
SDT_PROVIDER_DEFINE(vfs);
/*
* These vnodeopv_descs are listed here because they are not
* associated with any particular file system, and thus cannot
* be initialized by vfs_attach().
*/
const struct vnodeopv_desc * const vfs_special_vnodeopv_descs[] = {
&dead_vnodeop_opv_desc,
&fifo_vnodeop_opv_desc,
&spec_vnodeop_opv_desc,
NULL,
};
struct vfs_list_head vfs_list = /* vfs list */
LIST_HEAD_INITIALIZER(vfs_list);
static kauth_listener_t mount_listener;
/*
* This code doesn't work if the defn is **vnodeop_defns with cc.
* The problem is because of the compiler sometimes putting in an
* extra level of indirection for arrays. It's an interesting
* "feature" of C.
*/
typedef int (*PFI)(void *);
/*
* A miscellaneous routine.
* A generic "default" routine that just returns an error.
*/
/*ARGSUSED*/
int
vn_default_error(void *v)
{
return (EOPNOTSUPP);
}
static struct sysctllog *vfs_sysctllog;
/*
* Top level filesystem related information gathering.
*/
static void
sysctl_vfs_setup(void)
{
sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "generic",
SYSCTL_DESCR("Non-specific vfs related information"),
NULL, 0, NULL, 0,
CTL_VFS, VFS_GENERIC, CTL_EOL);
sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "fstypes",
SYSCTL_DESCR("List of file systems present"),
sysctl_vfs_generic_fstypes, 0, NULL, 0,
CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "magiclinks",
SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
NULL, 0, &vfs_magiclinks, 0,
CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "timestamp_precision",
SYSCTL_DESCR("File timestamp precision"),
NULL, 0, &vfs_timestamp_precision, 0,
CTL_VFS, VFS_GENERIC, VFS_TIMESTAMP_PRECISION,
CTL_EOL);
}
/*
* vfs_init.c
*
* Allocate and fill in operations vectors.
*
* An undocumented feature of this approach to defining operations is that
* there can be multiple entries in vfs_opv_descs for the same operations
* vector. This allows third parties to extend the set of operations
* supported by another layer in a binary compatible way. For example,
* assume that NFS needed to be modified to support Ficus. NFS has an entry
* (probably nfs_vnodeop_decls) declaring all the operations NFS supports by
* default. Ficus could add another entry (ficus_nfs_vnodeop_decl_extensions)
* listing those new operations Ficus adds to NFS, all without modifying the
* NFS code. (Of course, the OTW NFS protocol still needs to be munged, but
* that is a(whole)nother story.) This is a feature.
*/
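/*
 * Illustrative sketch (not from this file) of the structures this
 * machinery consumes, modeled on how existing file systems declare
 * their vnode operation tables; the "examplefs" names are hypothetical.
 */
#if 0
int (**examplefs_vnodeop_p)(void *);
int examplefs_lookup(void *);			/* hypothetical op */

const struct vnodeopv_entry_desc examplefs_vnodeop_entries[] = {
	{ &vop_default_desc, vn_default_error },	/* required default */
	{ &vop_lookup_desc, examplefs_lookup },
	{ NULL, NULL }
};
const struct vnodeopv_desc examplefs_vnodeop_opv_desc =
	{ &examplefs_vnodeop_p, examplefs_vnodeop_entries };

/*
 * The desc would be listed in the file system's vfs_opv_descs[] array,
 * which vfs_attach() hands to vfs_opv_init() below.
 */
#endif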
/*
* Init the vector, if it needs it.
* Also handle backwards compatibility.
*/
static void
vfs_opv_init_explicit(const struct vnodeopv_desc *vfs_opv_desc)
{
int (**opv_desc_vector)(void *);
const struct vnodeopv_entry_desc *opve_descp;
opv_desc_vector = *(vfs_opv_desc->opv_desc_vector_p);
for (opve_descp = vfs_opv_desc->opv_desc_ops;
opve_descp->opve_op;
opve_descp++) {
/*
* Sanity check: is this operation listed
* in the list of operations? We check this
* by seeing if its offset is zero. Since
* the default routine should always be listed
* first, it should be the only one with a zero
* offset. Any other operation with a zero
* offset is probably not listed in
* vfs_op_descs, and so is probably an error.
*
* A panic here means the layer programmer
* has committed the all-too common bug
* of adding a new operation to the layer's
* list of vnode operations but
* not adding the operation to the system-wide
* list of supported operations.
*/
if (opve_descp->opve_op->vdesc_offset == 0 &&
opve_descp->opve_op->vdesc_offset != VOFFSET(vop_default)) {
printf("operation %s not listed in %s.\n",
opve_descp->opve_op->vdesc_name, "vfs_op_descs");
panic ("vfs_opv_init: bad operation");
}
/*
* Fill in this entry.
*/
opv_desc_vector[opve_descp->opve_op->vdesc_offset] =
opve_descp->opve_impl;
}
}
static void
vfs_opv_init_default(const struct vnodeopv_desc *vfs_opv_desc)
{
int j;
int (**opv_desc_vector)(void *);
opv_desc_vector = *(vfs_opv_desc->opv_desc_vector_p);
/*
* Force every operations vector to have a default routine.
*/
if (opv_desc_vector[VOFFSET(vop_default)] == NULL)
panic("vfs_opv_init: operation vector without default routine.");
for (j = 0; j < VNODE_OPS_COUNT; j++)
if (opv_desc_vector[j] == NULL)
opv_desc_vector[j] =
opv_desc_vector[VOFFSET(vop_default)];
}
void
vfs_opv_init(const struct vnodeopv_desc * const *vopvdpp)
{
int (**opv_desc_vector)(void *);
int i;
/*
* Allocate the vectors.
*/
for (i = 0; vopvdpp[i] != NULL; i++) {
opv_desc_vector =
kmem_alloc(VNODE_OPS_COUNT * sizeof(PFI), KM_SLEEP);
memset(opv_desc_vector, 0, VNODE_OPS_COUNT * sizeof(PFI));
*(vopvdpp[i]->opv_desc_vector_p) = opv_desc_vector;
DODEBUG(printf("vector at %p allocated\n",
opv_desc_vector));
}
/*
* ...and fill them in.
*/
for (i = 0; vopvdpp[i] != NULL; i++)
vfs_opv_init_explicit(vopvdpp[i]);
/*
* Finally, go back and replace unfilled routines
* with their default.
*/
for (i = 0; vopvdpp[i] != NULL; i++)
vfs_opv_init_default(vopvdpp[i]);
}
void
vfs_opv_free(const struct vnodeopv_desc * const *vopvdpp)
{
int i;
/*
* Free the vectors allocated in vfs_opv_init().
*/
for (i = 0; vopvdpp[i] != NULL; i++) {
kmem_free(*(vopvdpp[i]->opv_desc_vector_p),
VNODE_OPS_COUNT * sizeof(PFI));
*(vopvdpp[i]->opv_desc_vector_p) = NULL;
}
}
#ifdef DEBUG
static void
vfs_op_check(void)
{
int i;
DODEBUG(printf("Vnode_interface_init.\n"));
/*
* Check offset of each op.
*/
for (i = 0; vfs_op_descs[i]; i++) {
if (vfs_op_descs[i]->vdesc_offset != i)
panic("vfs_op_check: vfs_op_desc[] offset mismatch");
}
if (i != VNODE_OPS_COUNT) {
panic("vfs_op_check: vnode ops count mismatch (%d != %d)",
i, VNODE_OPS_COUNT);
}
DODEBUG(printf ("vfs_opv_numops=%d\n", VNODE_OPS_COUNT));
}
#endif /* DEBUG */
/*
* Common routine to check if an unprivileged mount is allowed.
*
* We export just this part (i.e., without the access control) so that if a
* secmodel wants to implement finer grained user mounts it can do so without
* copying too much code. More elaborate policies (i.e., specific users allowed
* to also create devices and/or introduce set-id binaries, or export
* file-systems) will require a different implementation.
*
* This routine is intended to be called from listener context, and as such
* does not take credentials as an argument.
*/
int
usermount_common_policy(struct mount *mp, u_long flags)
{
/* No exporting if unprivileged. */
if (flags & MNT_EXPORTED)
return EPERM;
/* Must have 'nosuid' and 'nodev'. */
if ((flags & MNT_NODEV) == 0 || (flags & MNT_NOSUID) == 0)
return EPERM;
/* Retain 'noexec'. */
if ((mp->mnt_flag & MNT_NOEXEC) && (flags & MNT_NOEXEC) == 0)
return EPERM;
return 0;
}
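/*
 * Illustrative sketch (not part of this file): a secmodel's
 * KAUTH_SCOPE_SYSTEM listener could defer to the helper above when
 * handling KAUTH_SYSTEM_MOUNT.  The argument layout shown here is an
 * assumption for illustration only.
 */
#if 0
/* inside a system-scope listener, action KAUTH_SYSTEM_MOUNT: */
if (req == KAUTH_REQ_SYSTEM_MOUNT_NEW) {
	struct mount *xmp = arg1;		/* assumed argument layout */
	u_long mntflags = (u_long)arg2;		/* assumed argument layout */

	if (usermount_common_policy(xmp, mntflags) == 0)
		result = KAUTH_RESULT_ALLOW;
}
#endif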
static int
mount_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
enum kauth_system_req req;
result = KAUTH_RESULT_DEFER;
req = (enum kauth_system_req)(uintptr_t)arg0;
if (action != KAUTH_SYSTEM_MOUNT)
return result;
if (req == KAUTH_REQ_SYSTEM_MOUNT_GET)
result = KAUTH_RESULT_ALLOW;
else if (req == KAUTH_REQ_SYSTEM_MOUNT_DEVICE) {
vnode_t *devvp = arg2;
accmode_t accmode = (accmode_t)(unsigned long)arg3;
int error;
error = VOP_ACCESS(devvp, accmode, cred);
if (!error)
result = KAUTH_RESULT_ALLOW;
}
return result;
}
/*
* Initialize the vnode structures and initialize each file system type.
*/
void
vfsinit(void)
{
/*
* Attach sysctl nodes
*/
sysctl_vfs_setup();
/*
* Initialize the vnode table
*/
vntblinit();
/*
* Initialize the vnode name cache
*/
nchinit();
#ifdef DEBUG
/*
* Check the list of vnode operations.
*/
vfs_op_check();
#endif
/*
* Initialize the special vnode operations.
*/
vfs_opv_init(vfs_special_vnodeopv_descs);
/*
* Initialise generic dirhash.
*/
dirhash_init();
/*
* Initialise VFS hooks.
*/
vfs_hooks_init();
mount_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
mount_listener_cb, NULL);
/*
* Establish each file system which was statically
* included in the kernel.
*/
module_init_class(MODULE_CLASS_VFS);
/*
* Initialize EVFILT_FS for kqueue.
*/
vfs_evfilt_fs_init();
}
/*
* Drop a reference to a file system type.
*/
void
vfs_delref(struct vfsops *vfs)
{
mutex_enter(&vfs_list_lock);
vfs->vfs_refcount--;
mutex_exit(&vfs_list_lock);
}
/*
* Establish a file system and initialize it.
*/
int
vfs_attach(struct vfsops *vfs)
{
struct vfsops *v;
int error = 0;
mutex_enter(&vfs_list_lock);
/*
* Make sure this file system doesn't already exist.
*/
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (strcmp(vfs->vfs_name, v->vfs_name) == 0) {
error = EEXIST;
goto out;
}
}
/*
* Initialize the vnode operations for this file system.
*/
vfs_opv_init(vfs->vfs_opv_descs);
/*
* Now initialize the file system itself.
*/
(*vfs->vfs_init)();
/*
* ...and link it into the kernel's list.
*/
LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list);
/*
* Sanity: make sure the reference count is 0.
*/
vfs->vfs_refcount = 0;
out:
mutex_exit(&vfs_list_lock);
return (error);
}
/*
* Remove a file system from the kernel.
*/
int
vfs_detach(struct vfsops *vfs)
{
struct vfsops *v;
int error = 0;
mutex_enter(&vfs_list_lock);
/*
* Make sure no one is using the filesystem.
*/
if (vfs->vfs_refcount != 0) {
error = EBUSY;
goto out;
}
/*
* ...and remove it from the kernel's list.
*/
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (v == vfs) {
LIST_REMOVE(v, vfs_list);
break;
}
}
if (v == NULL) {
error = ESRCH;
goto out;
}
/*
* Now run the file system-specific cleanups.
*/
(*vfs->vfs_done)();
/*
* Free the vnode operations vector.
*/
vfs_opv_free(vfs->vfs_opv_descs);
out:
mutex_exit(&vfs_list_lock);
return (error);
}
void
vfs_reinit(void)
{
struct vfsops *vfs;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(vfs, &vfs_list, vfs_list) {
if (vfs->vfs_reinit) {
vfs->vfs_refcount++;
mutex_exit(&vfs_list_lock);
(*vfs->vfs_reinit)();
mutex_enter(&vfs_list_lock);
vfs->vfs_refcount--;
}
}
mutex_exit(&vfs_list_lock);
}
/* $NetBSD: sys_pipe.c,v 1.167 2024/02/10 09:21:54 andvar Exp $ */
/*-
* Copyright (c) 2003, 2007, 2008, 2009, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Paul Kranenburg, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1996 John S. Dyson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice immediately at the beginning of the file, without modification,
* this list of conditions, and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
* 4. Modifications may be freely made to this file if the above conditions
* are met.
*/
/*
* This file contains a high-performance replacement for the socket-based
* pipes scheme originally used. It does not support all features of
* sockets, but does do everything that pipes normally do.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.167 2024/02/10 09:21:54 andvar Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/select.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/pipe.h>
static int pipe_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int pipe_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int pipe_close(file_t *);
static int pipe_poll(file_t *, int);
static int pipe_kqfilter(file_t *, struct knote *);
static int pipe_stat(file_t *, struct stat *);
static int pipe_ioctl(file_t *, u_long, void *);
static void pipe_restart(file_t *);
static int pipe_fpathconf(file_t *, int, register_t *);
static int pipe_posix_fadvise(file_t *, off_t, off_t, int);
static const struct fileops pipeops = {
.fo_name = "pipe",
.fo_read = pipe_read,
.fo_write = pipe_write,
.fo_ioctl = pipe_ioctl,
.fo_fcntl = fnullop_fcntl,
.fo_poll = pipe_poll,
.fo_stat = pipe_stat,
.fo_close = pipe_close,
.fo_kqfilter = pipe_kqfilter,
.fo_restart = pipe_restart,
.fo_fpathconf = pipe_fpathconf,
.fo_posix_fadvise = pipe_posix_fadvise,
};
/*
* Default pipe buffer size(s), this can be kind-of large now because pipe
* space is pageable. The pipe code will try to maintain locality of
* reference for performance reasons, so small amounts of outstanding I/O
* will not wipe the cache.
*/
#define MINPIPESIZE (PIPE_SIZE / 3)
#define MAXPIPESIZE (2 * PIPE_SIZE / 3)
/*
* Limit the number of "big" pipes
*/
#define LIMITBIGPIPES 32
static u_int maxbigpipes __read_mostly = LIMITBIGPIPES;
static u_int nbigpipe = 0;
/*
* Amount of KVA consumed by pipe buffers.
*/
static u_int amountpipekva = 0;
static void pipeclose(struct pipe *);
static void pipe_free_kmem(struct pipe *);
static int pipe_create(struct pipe **, pool_cache_t, struct timespec *);
static int pipelock(struct pipe *, bool);
static inline void pipeunlock(struct pipe *);
static void pipeselwakeup(struct pipe *, struct pipe *, int);
static int pipespace(struct pipe *, int);
static int pipe_ctor(void *, void *, int);
static void pipe_dtor(void *, void *);
static pool_cache_t pipe_wr_cache;
static pool_cache_t pipe_rd_cache;
void
pipe_init(void)
{
/* Writer side is not automatically allocated KVA. */
pipe_wr_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "pipewr",
NULL, IPL_NONE, pipe_ctor, pipe_dtor, NULL);
KASSERT(pipe_wr_cache != NULL);
/* Reader side gets preallocated KVA. */
pipe_rd_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "piperd",
NULL, IPL_NONE, pipe_ctor, pipe_dtor, (void *)1);
KASSERT(pipe_rd_cache != NULL);
}
static int
pipe_ctor(void *arg, void *obj, int flags)
{
struct pipe *pipe;
vaddr_t va;
pipe = obj;
memset(pipe, 0, sizeof(struct pipe));
if (arg != NULL) {
/* Preallocate space. */
va = uvm_km_alloc(kernel_map, PIPE_SIZE, 0,
UVM_KMF_PAGEABLE | UVM_KMF_WAITVA);
KASSERT(va != 0);
pipe->pipe_kmem = va;
atomic_add_int(&amountpipekva, PIPE_SIZE);
}
cv_init(&pipe->pipe_rcv, "pipe_rd");
cv_init(&pipe->pipe_wcv, "pipe_wr");
cv_init(&pipe->pipe_draincv, "pipe_drn");
cv_init(&pipe->pipe_lkcv, "pipe_lk");
selinit(&pipe->pipe_sel);
pipe->pipe_state = PIPE_SIGNALR;
return 0;
}
static void
pipe_dtor(void *arg, void *obj)
{
struct pipe *pipe;
pipe = obj;
cv_destroy(&pipe->pipe_rcv);
cv_destroy(&pipe->pipe_wcv);
cv_destroy(&pipe->pipe_draincv);
cv_destroy(&pipe->pipe_lkcv);
seldestroy(&pipe->pipe_sel);
if (pipe->pipe_kmem != 0) {
uvm_km_free(kernel_map, pipe->pipe_kmem, PIPE_SIZE,
UVM_KMF_PAGEABLE);
atomic_add_int(&amountpipekva, -PIPE_SIZE);
}
}
/*
* The pipe system call for the DTYPE_PIPE type of pipes
*/
int
pipe1(struct lwp *l, int *fildes, int flags)
{
struct pipe *rpipe, *wpipe;
struct timespec nt;
file_t *rf, *wf;
int fd, error;
proc_t *p;
if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE))
return EINVAL;
p = curproc;
rpipe = wpipe = NULL;
getnanotime(&nt);
if ((error = pipe_create(&rpipe, pipe_rd_cache, &nt)) ||
(error = pipe_create(&wpipe, pipe_wr_cache, &nt))) {
goto free2;
}
rpipe->pipe_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
wpipe->pipe_lock = rpipe->pipe_lock;
mutex_obj_hold(wpipe->pipe_lock);
error = fd_allocfile(&rf, &fd);
if (error)
goto free2;
fildes[0] = fd;
error = fd_allocfile(&wf, &fd);
if (error)
goto free3;
fildes[1] = fd;
rf->f_flag = FREAD | flags;
rf->f_type = DTYPE_PIPE;
rf->f_pipe = rpipe;
rf->f_ops = &pipeops;
fd_set_exclose(l, fildes[0], (flags & O_CLOEXEC) != 0);
wf->f_flag = FWRITE | flags;
wf->f_type = DTYPE_PIPE;
wf->f_pipe = wpipe;
wf->f_ops = &pipeops;
fd_set_exclose(l, fildes[1], (flags & O_CLOEXEC) != 0);
rpipe->pipe_peer = wpipe;
wpipe->pipe_peer = rpipe;
fd_affix(p, rf, fildes[0]);
fd_affix(p, wf, fildes[1]);
return (0);
free3:
fd_abort(p, rf, fildes[0]);
free2:
pipeclose(wpipe);
pipeclose(rpipe);
return (error);
}
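/*
 * Illustrative userland counterpart (not part of the kernel source):
 * pipe2(2) is the interface whose flag bits end up in pipe1() above.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int fds[2];

/* O_CLOEXEC, O_NONBLOCK and O_NOSIGPIPE are the flags pipe1() accepts. */
if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
	err(1, "pipe2");
/* fds[0] is the read end, fds[1] the write end. */
#endif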
/*
* Allocate kva for the pipe circular buffer; the space is pageable.
* This routine will 'realloc' the size of a pipe safely: if it fails,
* it retains the old buffer and returns ENOMEM.
*/
static int
pipespace(struct pipe *pipe, int size)
{
void *buffer;
/*
* Allocate pageable virtual address space. Physical memory is
* allocated on demand.
*/
if (size == PIPE_SIZE && pipe->pipe_kmem != 0) {
buffer = (void *)pipe->pipe_kmem;
} else {
buffer = (void *)uvm_km_alloc(kernel_map, round_page(size),
0, UVM_KMF_PAGEABLE);
if (buffer == NULL)
return (ENOMEM);
atomic_add_int(&amountpipekva, size);
}
/* free old resources if we're resizing */
pipe_free_kmem(pipe);
pipe->pipe_buffer.buffer = buffer;
pipe->pipe_buffer.size = size;
pipe->pipe_buffer.in = 0;
pipe->pipe_buffer.out = 0;
pipe->pipe_buffer.cnt = 0;
return (0);
}
/*
* Initialize and allocate VM and memory for pipe.
*/
static int
pipe_create(struct pipe **pipep, pool_cache_t cache, struct timespec *nt)
{
struct pipe *pipe;
int error;
pipe = pool_cache_get(cache, PR_WAITOK);
KASSERT(pipe != NULL);
*pipep = pipe;
error = 0;
pipe->pipe_atime = pipe->pipe_mtime = pipe->pipe_btime = *nt;
pipe->pipe_lock = NULL;
if (cache == pipe_rd_cache) {
error = pipespace(pipe, PIPE_SIZE);
} else {
pipe->pipe_buffer.buffer = NULL;
pipe->pipe_buffer.size = 0;
pipe->pipe_buffer.in = 0;
pipe->pipe_buffer.out = 0;
pipe->pipe_buffer.cnt = 0;
}
return error;
}
/*
* Lock a pipe for I/O, blocking other access.
* Called with the pipe lock held.
*/
static int
pipelock(struct pipe *pipe, bool catch_p)
{
int error;
KASSERT(mutex_owned(pipe->pipe_lock));
while (pipe->pipe_state & PIPE_LOCKFL) {
if (catch_p) {
error = cv_wait_sig(&pipe->pipe_lkcv, pipe->pipe_lock);
if (error != 0) {
return error;
}
} else
cv_wait(&pipe->pipe_lkcv, pipe->pipe_lock);
}
pipe->pipe_state |= PIPE_LOCKFL;
return 0;
}
/*
* unlock a pipe I/O lock
*/
static inline void
pipeunlock(struct pipe *pipe)
{
KASSERT(pipe->pipe_state & PIPE_LOCKFL);
pipe->pipe_state &= ~PIPE_LOCKFL;
cv_signal(&pipe->pipe_lkcv);
}
/*
* Select/poll wakeup. This also sends SIGIO to peer connected to
* 'sigpipe' side of pipe.
*/
static void
pipeselwakeup(struct pipe *selp, struct pipe *sigp, int code)
{
int band;
switch (code) {
case POLL_IN:
band = POLLIN|POLLRDNORM;
break;
case POLL_OUT:
band = POLLOUT|POLLWRNORM;
break;
case POLL_HUP:
band = POLLHUP;
break;
case POLL_ERR:
band = POLLERR;
break;
default:
band = 0;
#ifdef DIAGNOSTIC
printf("bad siginfo code %d in pipe notification.\n", code);
#endif
break;
}
selnotify(&selp->pipe_sel, band, NOTE_SUBMIT);
if (sigp == NULL || (sigp->pipe_state & PIPE_ASYNC) == 0)
return;
fownsignal(sigp->pipe_pgid, SIGIO, code, band, selp);
}
static int
pipe_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct pipe *rpipe = fp->f_pipe;
struct pipebuf *bp = &rpipe->pipe_buffer;
kmutex_t *lock = rpipe->pipe_lock;
int error;
size_t nread = 0;
size_t size;
size_t ocnt;
unsigned int wakeup_state = 0;
/*
* Try to avoid locking the pipe if we have nothing to do.
*
* There are programs which share one pipe amongst multiple processes
* and perform non-blocking reads in parallel, even if the pipe is
* empty. This in particular is the case with BSD make, which when
* spawned with a high -j number can find itself with over half of the
* calls failing to find anything.
*/
if ((fp->f_flag & FNONBLOCK) != 0) {
if (__predict_false(uio->uio_resid == 0))
return (0);
if (atomic_load_relaxed(&bp->cnt) == 0 &&
(atomic_load_relaxed(&rpipe->pipe_state) & PIPE_EOF) == 0)
return (EAGAIN);
}
mutex_enter(lock);
++rpipe->pipe_busy;
ocnt = bp->cnt;
again:
error = pipelock(rpipe, true);
if (error)
goto unlocked_error;
while (uio->uio_resid) {
/*
* Normal pipe buffer receive.
*/
if (bp->cnt > 0) {
size = bp->size - bp->out;
if (size > bp->cnt)
size = bp->cnt;
if (size > uio->uio_resid)
size = uio->uio_resid;
mutex_exit(lock);
error = uiomove((char *)bp->buffer + bp->out, size, uio);
mutex_enter(lock);
if (error)
break;
bp->out += size;
if (bp->out >= bp->size)
bp->out = 0;
bp->cnt -= size;
/*
* If there is no more to read in the pipe, reset
* its pointers to the beginning. This improves
* cache hit stats.
*/
if (bp->cnt == 0) {
bp->in = 0;
bp->out = 0;
}
nread += size;
continue;
}
/*
* Break if some data was read.
*/
if (nread > 0)
break;
/*
* Detect EOF condition.
* Read returns 0 on EOF, no need to set error.
*/
if (rpipe->pipe_state & PIPE_EOF)
break;
/*
* Don't block on non-blocking I/O.
*/
if (fp->f_flag & FNONBLOCK) {
error = EAGAIN;
break;
}
/*
* Unlock the pipe buffer for our remaining processing.
* We will either break out with an error or we will
* sleep and relock to loop.
*/
pipeunlock(rpipe);
#if 1 /* XXX (dsl) I'm sure these aren't needed here ... */
/*
* We want to read more, wake up select/poll.
*/
pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT);
/*
* If the "write-side" is blocked, wake it up now.
*/
cv_broadcast(&rpipe->pipe_wcv);
#endif
if (wakeup_state & PIPE_RESTART) {
error = ERESTART;
goto unlocked_error;
}
/* Now wait until the pipe is filled */
error = cv_wait_sig(&rpipe->pipe_rcv, lock);
if (error != 0)
goto unlocked_error;
wakeup_state = rpipe->pipe_state;
goto again;
}
if (error == 0)
getnanotime(&rpipe->pipe_atime);
pipeunlock(rpipe);
unlocked_error:
--rpipe->pipe_busy;
if (rpipe->pipe_busy == 0) {
rpipe->pipe_state &= ~PIPE_RESTART;
cv_broadcast(&rpipe->pipe_draincv);
}
if (bp->cnt < MINPIPESIZE) {
cv_broadcast(&rpipe->pipe_wcv);
}
/*
* If anything was read off the buffer, signal to the writer it's
* possible to write more data. Also send signal if we are here for the
* first time after last write.
*/
if ((bp->size - bp->cnt) >= PIPE_BUF
&& (ocnt != bp->cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT);
rpipe->pipe_state &= ~PIPE_SIGNALR;
}
mutex_exit(lock);
return (error);
}
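/*
 * Illustrative userland sketch (not from this source): a non-blocking
 * read on an empty pipe takes the EAGAIN path above.  "fds" is the
 * hypothetical descriptor pair from the pipe2() sketch earlier.
 */
#if 0
#include <unistd.h>
#include <errno.h>

char buf[512];
ssize_t n = read(fds[0], buf, sizeof(buf));
if (n == -1 && errno == EAGAIN) {
	/* empty pipe on an O_NONBLOCK descriptor; retry later */
}
#endif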
static int
pipe_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct pipe *wpipe, *rpipe;
struct pipebuf *bp;
kmutex_t *lock;
int error;
unsigned int wakeup_state = 0;
/* We want to write to our peer */
rpipe = fp->f_pipe;
lock = rpipe->pipe_lock;
error = 0;
mutex_enter(lock);
wpipe = rpipe->pipe_peer;
/*
* Detect loss of pipe read side, issue SIGPIPE if lost.
*/
if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) != 0) {
mutex_exit(lock);
return EPIPE;
}
++wpipe->pipe_busy;
/* Acquire the long-term pipe lock */
if ((error = pipelock(wpipe, true)) != 0) {
--wpipe->pipe_busy;
if (wpipe->pipe_busy == 0) {
wpipe->pipe_state &= ~PIPE_RESTART;
cv_broadcast(&wpipe->pipe_draincv);
}
mutex_exit(lock);
return (error);
}
bp = &wpipe->pipe_buffer;
/*
* If it is advantageous to resize the pipe buffer, do so.
*/
if ((uio->uio_resid > PIPE_SIZE) && (nbigpipe < maxbigpipes) &&
(bp->size <= PIPE_SIZE) && (bp->cnt == 0)) {
if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
atomic_inc_uint(&nbigpipe);
}
while (uio->uio_resid) {
size_t space;
space = bp->size - bp->cnt;
/* Writes of size <= PIPE_BUF must be atomic. */
if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF))
space = 0;
if (space > 0) {
int size; /* Transfer size */
int segsize; /* first segment to transfer */
/*
* Transfer size is minimum of uio transfer
* and free space in pipe buffer.
*/
if (space > uio->uio_resid)
size = uio->uio_resid;
else
size = space;
/*
* First segment to transfer is minimum of
* transfer size and contiguous space in
* pipe buffer. If first segment to transfer
* is less than the transfer size, we've got
* a wraparound in the buffer.
*/
segsize = bp->size - bp->in;
if (segsize > size)
segsize = size;
/* Transfer first segment */
mutex_exit(lock);
error = uiomove((char *)bp->buffer + bp->in, segsize,
uio);
if (error == 0 && segsize < size) {
/*
* Transfer remaining part now, to
* support atomic writes. Wraparound
* happened.
*/
KASSERT(bp->in + segsize == bp->size);
error = uiomove(bp->buffer,
size - segsize, uio);
}
mutex_enter(lock);
if (error)
break;
bp->in += size;
if (bp->in >= bp->size) {
KASSERT(bp->in == size - segsize + bp->size);
bp->in = size - segsize;
}
bp->cnt += size;
KASSERT(bp->cnt <= bp->size);
wakeup_state = 0;
} else {
/*
* If the "read-side" has been blocked, wake it up now.
*/
cv_broadcast(&wpipe->pipe_rcv);
/*
* Don't block on non-blocking I/O.
*/
if (fp->f_flag & FNONBLOCK) {
error = EAGAIN;
break;
}
/*
* We have no more space and have something to offer,
* wake up select/poll.
*/
if (bp->cnt)
pipeselwakeup(wpipe, wpipe, POLL_IN);
if (wakeup_state & PIPE_RESTART) {
error = ERESTART;
break;
}
/*
* If read side wants to go away, we just issue a signal
* to ourselves.
*/
if (wpipe->pipe_state & PIPE_EOF) {
error = EPIPE;
break;
}
pipeunlock(wpipe);
error = cv_wait_sig(&wpipe->pipe_wcv, lock);
(void)pipelock(wpipe, false);
if (error != 0)
break;
wakeup_state = wpipe->pipe_state;
}
}
--wpipe->pipe_busy;
if (wpipe->pipe_busy == 0) {
wpipe->pipe_state &= ~PIPE_RESTART;
cv_broadcast(&wpipe->pipe_draincv);
}
if (bp->cnt > 0) {
cv_broadcast(&wpipe->pipe_rcv);
}
/*
* Don't return EPIPE if I/O was successful
*/
if (error == EPIPE && bp->cnt == 0 && uio->uio_resid == 0)
error = 0;
if (error == 0)
getnanotime(&wpipe->pipe_mtime);
/*
* We have something to offer, wake up select/poll.
*/
if (bp->cnt)
pipeselwakeup(wpipe, wpipe, POLL_IN);
/*
* Arrange for next read(2) to do a signal.
*/
wpipe->pipe_state |= PIPE_SIGNALR;
pipeunlock(wpipe);
mutex_exit(lock);
return (error);
}
/*
* We implement a very minimal set of ioctls for compatibility with sockets.
*/
int
pipe_ioctl(file_t *fp, u_long cmd, void *data)
{
struct pipe *pipe = fp->f_pipe;
kmutex_t *lock = pipe->pipe_lock;
switch (cmd) {
case FIONBIO:
return (0);
case FIOASYNC:
mutex_enter(lock);
if (*(int *)data) {
pipe->pipe_state |= PIPE_ASYNC;
} else {
pipe->pipe_state &= ~PIPE_ASYNC;
}
mutex_exit(lock);
return (0);
case FIONREAD:
mutex_enter(lock);
*(int *)data = pipe->pipe_buffer.cnt;
mutex_exit(lock);
return (0);
case FIONWRITE:
/* Look at other side */
mutex_enter(lock);
pipe = pipe->pipe_peer;
if (pipe == NULL)
*(int *)data = 0;
else
*(int *)data = pipe->pipe_buffer.cnt;
mutex_exit(lock);
return (0);
case FIONSPACE:
/* Look at other side */
mutex_enter(lock);
pipe = pipe->pipe_peer;
if (pipe == NULL)
*(int *)data = 0;
else
*(int *)data = pipe->pipe_buffer.size -
pipe->pipe_buffer.cnt;
mutex_exit(lock);
return (0);
case TIOCSPGRP:
case FIOSETOWN:
return fsetown(&pipe->pipe_pgid, cmd, data);
case TIOCGPGRP:
case FIOGETOWN:
return fgetown(pipe->pipe_pgid, cmd, data);
}
return (EPASSTHROUGH);
}
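/*
 * Illustrative userland use of one of these ioctls (sketch, not from
 * the source): query how many bytes are buffered on the read side.
 * "fds" is the hypothetical descriptor pair from the earlier sketch.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <stdio.h>

int nbytes;
if (ioctl(fds[0], FIONREAD, &nbytes) == 0)
	printf("%d bytes ready to read\n", nbytes);
#endif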
int
pipe_poll(file_t *fp, int events)
{
struct pipe *rpipe = fp->f_pipe;
struct pipe *wpipe;
int eof = 0;
int revents = 0;
mutex_enter(rpipe->pipe_lock);
wpipe = rpipe->pipe_peer;
if (events & (POLLIN | POLLRDNORM))
if ((rpipe->pipe_buffer.cnt > 0) ||
(rpipe->pipe_state & PIPE_EOF))
revents |= events & (POLLIN | POLLRDNORM);
eof |= (rpipe->pipe_state & PIPE_EOF);
if (wpipe == NULL)
revents |= events & (POLLOUT | POLLWRNORM);
else {
if (events & (POLLOUT | POLLWRNORM))
if ((wpipe->pipe_state & PIPE_EOF) || (
(wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
revents |= events & (POLLOUT | POLLWRNORM);
eof |= (wpipe->pipe_state & PIPE_EOF);
}
if (wpipe == NULL || eof)
revents |= POLLHUP;
if (revents == 0) {
if (events & (POLLIN | POLLRDNORM))
selrecord(curlwp, &rpipe->pipe_sel);
if (events & (POLLOUT | POLLWRNORM))
selrecord(curlwp, &wpipe->pipe_sel);
}
mutex_exit(rpipe->pipe_lock);
return (revents);
}
static int
pipe_stat(file_t *fp, struct stat *ub)
{
struct pipe *pipe = fp->f_pipe;
mutex_enter(pipe->pipe_lock);
memset(ub, 0, sizeof(*ub));
ub->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
ub->st_blksize = pipe->pipe_buffer.size;
if (ub->st_blksize == 0 && pipe->pipe_peer)
ub->st_blksize = pipe->pipe_peer->pipe_buffer.size;
ub->st_size = pipe->pipe_buffer.cnt;
ub->st_blocks = (ub->st_size) ? 1 : 0;
ub->st_atimespec = pipe->pipe_atime;
ub->st_mtimespec = pipe->pipe_mtime;
ub->st_ctimespec = ub->st_birthtimespec = pipe->pipe_btime;
ub->st_uid = kauth_cred_geteuid(fp->f_cred);
ub->st_gid = kauth_cred_getegid(fp->f_cred);
/*
* Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
* XXX (st_dev, st_ino) should be unique.
*/
mutex_exit(pipe->pipe_lock);
return 0;
}
static int
pipe_close(file_t *fp)
{
struct pipe *pipe = fp->f_pipe;
fp->f_pipe = NULL;
pipeclose(pipe);
return (0);
}
static void
pipe_restart(file_t *fp)
{
struct pipe *pipe = fp->f_pipe;
/*
* Unblock blocked reads/writes in order to allow close() to complete.
* System calls return ERESTART so that the fd is revalidated.
* (Partial writes return the transfer length.)
*/
mutex_enter(pipe->pipe_lock);
pipe->pipe_state |= PIPE_RESTART;
/* Wakeup both cvs, maybe we only need one, but maybe there are some
* other paths where wakeup is needed, and it saves deciding which! */
cv_broadcast(&pipe->pipe_rcv);
cv_broadcast(&pipe->pipe_wcv);
mutex_exit(pipe->pipe_lock);
}
static int
pipe_fpathconf(struct file *fp, int name, register_t *retval)
{
switch (name) {
case _PC_PIPE_BUF:
*retval = PIPE_BUF;
return 0;
default:
return EINVAL;
}
}
static int
pipe_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice)
{
return ESPIPE;
}
static void
pipe_free_kmem(struct pipe *pipe)
{
if (pipe->pipe_buffer.buffer != NULL) {
if (pipe->pipe_buffer.size > PIPE_SIZE) {
atomic_dec_uint(&nbigpipe);
}
if (pipe->pipe_buffer.buffer != (void *)pipe->pipe_kmem) {
uvm_km_free(kernel_map,
(vaddr_t)pipe->pipe_buffer.buffer,
pipe->pipe_buffer.size, UVM_KMF_PAGEABLE);
atomic_add_int(&amountpipekva,
-pipe->pipe_buffer.size);
}
pipe->pipe_buffer.buffer = NULL;
}
}
/*
* Shutdown the pipe.
*/
static void
pipeclose(struct pipe *pipe)
{
kmutex_t *lock;
struct pipe *ppipe;
if (pipe == NULL)
return;
KASSERT(cv_is_valid(&pipe->pipe_rcv));
KASSERT(cv_is_valid(&pipe->pipe_wcv));
KASSERT(cv_is_valid(&pipe->pipe_draincv));
KASSERT(cv_is_valid(&pipe->pipe_lkcv));
lock = pipe->pipe_lock;
if (lock == NULL)
/* Must have failed during create */
goto free_resources;
mutex_enter(lock);
pipeselwakeup(pipe, pipe, POLL_HUP);
/*
* If the other side is blocked, wake it up saying that
* we want to close it down.
*/
pipe->pipe_state |= PIPE_EOF;
if (pipe->pipe_busy) {
while (pipe->pipe_busy) {
cv_broadcast(&pipe->pipe_wcv);
cv_wait_sig(&pipe->pipe_draincv, lock);
}
}
/*
* Disconnect from peer.
*/
if ((ppipe = pipe->pipe_peer) != NULL) {
pipeselwakeup(ppipe, ppipe, POLL_HUP);
ppipe->pipe_state |= PIPE_EOF;
cv_broadcast(&ppipe->pipe_rcv);
ppipe->pipe_peer = NULL;
}
/*
* Any knote objects still left in the list are
* the ones attached by the peer. Since no one will
* traverse this list, we just clear it.
*
* XXX Exposes select/kqueue internals.
*/
SLIST_INIT(&pipe->pipe_sel.sel_klist);
KASSERT((pipe->pipe_state & PIPE_LOCKFL) == 0);
mutex_exit(lock);
mutex_obj_free(lock);
/*
* Free resources.
*/
free_resources:
pipe->pipe_pgid = 0;
pipe->pipe_state = PIPE_SIGNALR;
pipe->pipe_peer = NULL;
pipe->pipe_lock = NULL;
pipe_free_kmem(pipe);
if (pipe->pipe_kmem != 0) {
pool_cache_put(pipe_rd_cache, pipe);
} else {
pool_cache_put(pipe_wr_cache, pipe);
}
}
static void
filt_pipedetach(struct knote *kn)
{
struct pipe *pipe;
kmutex_t *lock;
pipe = ((file_t *)kn->kn_obj)->f_pipe;
lock = pipe->pipe_lock;
mutex_enter(lock);
switch(kn->kn_filter) {
case EVFILT_WRITE:
/* Need the peer structure, not our own. */
pipe = pipe->pipe_peer;
/* If reader end already closed, just return. */
if (pipe == NULL) {
mutex_exit(lock);
return;
}
break;
default:
/* Nothing to do. */
break;
}
KASSERT(kn->kn_hook == pipe);
selremove_knote(&pipe->pipe_sel, kn);
mutex_exit(lock);
}
static int
filt_piperead(struct knote *kn, long hint)
{
struct pipe *rpipe = ((file_t *)kn->kn_obj)->f_pipe;
struct pipe *wpipe;
int rv;
if ((hint & NOTE_SUBMIT) == 0) {
mutex_enter(rpipe->pipe_lock);
}
wpipe = rpipe->pipe_peer;
kn->kn_data = rpipe->pipe_buffer.cnt;
if ((rpipe->pipe_state & PIPE_EOF) ||
(wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
knote_set_eof(kn, 0);
rv = 1;
} else {
rv = kn->kn_data > 0;
}
if ((hint & NOTE_SUBMIT) == 0) {
mutex_exit(rpipe->pipe_lock);
}
return rv;
}
static int
filt_pipewrite(struct knote *kn, long hint)
{
struct pipe *rpipe = ((file_t *)kn->kn_obj)->f_pipe;
struct pipe *wpipe;
int rv;
if ((hint & NOTE_SUBMIT) == 0) {
mutex_enter(rpipe->pipe_lock);
}
wpipe = rpipe->pipe_peer;
if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
kn->kn_data = 0;
knote_set_eof(kn, 0);
rv = 1;
} else {
kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
rv = kn->kn_data >= PIPE_BUF;
}
if ((hint & NOTE_SUBMIT) == 0) {
mutex_exit(rpipe->pipe_lock);
}
return rv;
}
static const struct filterops pipe_rfiltops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_pipedetach,
.f_event = filt_piperead,
};
static const struct filterops pipe_wfiltops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_pipedetach,
.f_event = filt_pipewrite,
};
static int
pipe_kqfilter(file_t *fp, struct knote *kn)
{
struct pipe *pipe;
kmutex_t *lock;
pipe = ((file_t *)kn->kn_obj)->f_pipe;
lock = pipe->pipe_lock;
mutex_enter(lock);
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &pipe_rfiltops;
break;
case EVFILT_WRITE:
kn->kn_fop = &pipe_wfiltops;
pipe = pipe->pipe_peer;
if (pipe == NULL) {
/* Other end of pipe has been closed. */
mutex_exit(lock);
return (EBADF);
}
break;
default:
mutex_exit(lock);
return (EINVAL);
}
kn->kn_hook = pipe;
selrecord_knote(&pipe->pipe_sel, kn);
mutex_exit(lock);
return (0);
}
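/*
 * Illustrative userland sketch (not from this source): these filters
 * are reached through kqueue(2).  "rdfd" is a hypothetical read end of
 * a pipe.
 */
#if 0
#include <sys/event.h>
#include <err.h>

struct kevent kev, res;
int kq = kqueue();

if (kq == -1)
	err(1, "kqueue");
/* Arms filt_piperead on the pipe's read end. */
EV_SET(&kev, rdfd, EVFILT_READ, EV_ADD, 0, 0, 0);
if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
	err(1, "kevent");
/* Blocks until data arrives; res.data holds the byte count the filter set. */
(void)kevent(kq, NULL, 0, &res, 1, NULL);
#endif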
/*
* Handle pipe sysctls.
*/
SYSCTL_SETUP(sysctl_kern_pipe_setup, "sysctl kern.pipe subtree setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "pipe",
SYSCTL_DESCR("Pipe settings"),
NULL, 0, NULL, 0,
CTL_KERN, KERN_PIPE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxbigpipes",
SYSCTL_DESCR("Maximum number of \"big\" pipes"),
NULL, 0, &maxbigpipes, 0,
CTL_KERN, KERN_PIPE, KERN_PIPE_MAXBIGPIPES, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "nbigpipes",
SYSCTL_DESCR("Number of \"big\" pipes"),
NULL, 0, &nbigpipe, 0,
CTL_KERN, KERN_PIPE, KERN_PIPE_NBIGPIPES, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "kvasize",
SYSCTL_DESCR("Amount of kernel memory consumed by pipe "
"buffers"),
NULL, 0, &amountpipekva, 0,
CTL_KERN, KERN_PIPE, KERN_PIPE_KVASIZE, CTL_EOL);
}
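/*
 * Illustrative userland sketch (ordinary sysctl(3) usage, not part of
 * this file): reading one of the nodes created above.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>

int maxbig;
size_t len = sizeof(maxbig);

if (sysctlbyname("kern.pipe.maxbigpipes", &maxbig, &len, NULL, 0) == 0)
	printf("kern.pipe.maxbigpipes = %d\n", maxbig);
#endif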
/* $NetBSD: procfs_vfsops.c,v 1.114 2024/01/17 10:21:01 hannken Exp $ */
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_vfsops.c 8.7 (Berkeley) 5/10/95
*/
/*
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_vfsops.c 8.7 (Berkeley) 5/10/95
*/
/*
* procfs VFS interface
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: procfs_vfsops.c,v 1.114 2024/01/17 10:21:01 hannken Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/procfs/procfs.h>
#include <uvm/uvm_extern.h> /* for PAGE_SIZE */
MODULE(MODULE_CLASS_VFS, procfs, "ptrace_common");
VFS_PROTOS(procfs);
#define PROCFS_HASHSIZE 256
#define PROCFS_EXEC_HOOK ((void *)1)
#define PROCFS_EXIT_HOOK ((void *)2)
static kauth_listener_t procfs_listener;
static void *procfs_exechook;
static void *procfs_exithook;
LIST_HEAD(hashhead, pfsnode);
static u_long procfs_hashmask;
static struct hashhead *procfs_hashtab;
static kmutex_t procfs_hashlock;
static struct hashhead *
procfs_hashhead(pid_t pid)
{
return &procfs_hashtab[pid & procfs_hashmask];
}
void
procfs_hashrem(struct pfsnode *pfs)
{
mutex_enter(&procfs_hashlock);
LIST_REMOVE(pfs, pfs_hash);
mutex_exit(&procfs_hashlock);
}
/*
* VFS Operations.
*
* mount system call
*/
/* ARGSUSED */
int
procfs_mount(
struct mount *mp,
const char *path,
void *data,
size_t *data_len)
{
struct lwp *l = curlwp;
struct procfsmount *pmnt;
struct procfs_args *args = data;
int error;
if (args == NULL)
return EINVAL;
if (UIO_MX & (UIO_MX-1)) {
log(LOG_ERR, "procfs: invalid directory entry size");
return (EINVAL);
}
if (mp->mnt_flag & MNT_GETARGS) {
if (*data_len < sizeof *args)
return EINVAL;
pmnt = VFSTOPROC(mp);
if (pmnt == NULL)
return EIO;
args->version = PROCFS_ARGSVERSION;
args->flags = pmnt->pmnt_flags;
*data_len = sizeof *args;
return 0;
}
if (mp->mnt_flag & MNT_UPDATE)
return (EOPNOTSUPP);
if (*data_len >= sizeof *args && args->version != PROCFS_ARGSVERSION)
return EINVAL;
pmnt = kmem_zalloc(sizeof(struct procfsmount), KM_SLEEP);
mp->mnt_stat.f_namemax = PROCFS_MAXNAMLEN;
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_data = pmnt;
vfs_getnewfsid(mp);
error = set_statvfs_info(path, UIO_USERSPACE, "procfs", UIO_SYSSPACE,
mp->mnt_op->vfs_name, mp, l);
if (*data_len >= sizeof *args)
pmnt->pmnt_flags = args->flags;
else
pmnt->pmnt_flags = 0;
mp->mnt_iflag |= IMNT_MPSAFE | IMNT_SHRLOOKUP;
return error;
}
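/*
 * Illustrative userland sketch (normally done by mount_procfs(8), not
 * part of this file) of the arguments procfs_mount() expects; treat the
 * headers and flag values shown as assumptions.
 */
#if 0
#include <sys/mount.h>
#include <miscfs/procfs/procfs.h>
#include <err.h>

struct procfs_args args = {
	.version = PROCFS_ARGSVERSION,
	.flags = 0,		/* e.g. a Linux-compat flag, if desired */
};

if (mount(MOUNT_PROCFS, "/proc", 0, &args, sizeof(args)) == -1)
	err(1, "mount procfs");
#endif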
/*
* unmount system call
*/
int
procfs_unmount(struct mount *mp, int mntflags)
{
int error;
int flags = 0;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if ((error = vflush(mp, 0, flags)) != 0)
return (error);
kmem_free(mp->mnt_data, sizeof(struct procfsmount));
mp->mnt_data = NULL;
return 0;
}
int
procfs_root(struct mount *mp, int lktype, struct vnode **vpp)
{
int error;
error = procfs_allocvp(mp, vpp, 0, PFSroot, -1);
if (error == 0) {
error = vn_lock(*vpp, lktype);
if (error != 0) {
vrele(*vpp);
*vpp = NULL;
}
}
return error;
}
/* ARGSUSED */
int
procfs_start(struct mount *mp, int flags)
{
return (0);
}
/*
* Get file system statistics.
*/
int
procfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
genfs_statvfs(mp, sbp);
sbp->f_bsize = PAGE_SIZE;
sbp->f_frsize = PAGE_SIZE;
sbp->f_iosize = PAGE_SIZE;
sbp->f_blocks = 1;
sbp->f_files = maxproc; /* approx */
sbp->f_ffree = maxproc - atomic_load_relaxed(&nprocs); /* approx */
sbp->f_favail = maxproc - atomic_load_relaxed(&nprocs); /* approx */
return (0);
}
/*ARGSUSED*/
int
procfs_sync(
struct mount *mp,
int waitfor,
kauth_cred_t uc)
{
return (0);
}
/*ARGSUSED*/
int
procfs_vget(struct mount *mp, ino_t ino, int lktype,
struct vnode **vpp)
{
return (EOPNOTSUPP);
}
int
procfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
int error;
struct pfskey pfskey;
struct pfsnode *pfs;
KASSERT(key_len == sizeof(pfskey));
memcpy(&pfskey, key, key_len);
pfs = kmem_alloc(sizeof(*pfs), KM_SLEEP);
pfs->pfs_pid = pfskey.pk_pid;
pfs->pfs_type = pfskey.pk_type;
pfs->pfs_fd = pfskey.pk_fd;
pfs->pfs_vnode = vp;
pfs->pfs_mount = mp;
pfs->pfs_flags = 0;
pfs->pfs_fileno =
PROCFS_FILENO(pfs->pfs_pid, pfs->pfs_type, pfs->pfs_fd);
vp->v_tag = VT_PROCFS;
vp->v_op = procfs_vnodeop_p;
vp->v_data = pfs;
switch (pfs->pfs_type) {
case PFSroot: /* /proc = dr-xr-xr-x */
vp->v_vflag |= VV_ROOT;
/*FALLTHROUGH*/
case PFSproc: /* /proc/N = dr-xr-xr-x */
pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
vp->v_type = VDIR;
break;
case PFStask: /* /proc/N/task = dr-xr-xr-x */
if (pfs->pfs_fd == -1) {
pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|
S_IROTH|S_IXOTH;
vp->v_type = VDIR;
break;
}
/*FALLTHROUGH*/
case PFScurproc: /* /proc/curproc = lr-xr-xr-x */
case PFSself: /* /proc/self = lr-xr-xr-x */
case PFScwd: /* /proc/N/cwd = lr-xr-xr-x */
case PFSchroot: /* /proc/N/chroot = lr-xr-xr-x */
case PFSexe: /* /proc/N/exe = lr-xr-xr-x */
pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
vp->v_type = VLNK;
break;
case PFSfd:
if (pfs->pfs_fd == -1) { /* /proc/N/fd = dr-x------ */
pfs->pfs_mode = S_IRUSR|S_IXUSR;
vp->v_type = VDIR;
} else { /* /proc/N/fd/M = [ps-]rw------- */
file_t *fp;
vnode_t *vxp;
struct proc *p;
mutex_enter(&proc_lock);
p = procfs_proc_find(mp, pfs->pfs_pid);
mutex_exit(&proc_lock);
if (p == NULL) {
error = ENOENT;
goto bad;
}
KASSERT(rw_read_held(&p->p_reflock));
if ((fp = fd_getfile2(p, pfs->pfs_fd)) == NULL) {
error = EBADF;
goto bad;
}
pfs->pfs_mode = S_IRUSR|S_IWUSR;
switch (fp->f_type) {
case DTYPE_VNODE:
vxp = fp->f_vnode;
/*
* We make symlinks for directories
* to avoid cycles.
*/
if (vxp->v_type == VDIR ||
procfs_proc_is_linux_compat())
goto symlink;
vp->v_type = vxp->v_type;
break;
case DTYPE_PIPE:
vp->v_type = VFIFO;
break;
case DTYPE_SOCKET:
vp->v_type = VSOCK;
break;
case DTYPE_KQUEUE:
case DTYPE_MISC:
case DTYPE_SEM:
symlink:
pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|
S_IXGRP|S_IROTH|S_IXOTH;
vp->v_type = VLNK;
break;
default:
error = EOPNOTSUPP;
closef(fp);
goto bad;
}
closef(fp);
}
break;
case PFSfile: /* /proc/N/file = -rw------- */
case PFSmem: /* /proc/N/mem = -rw------- */
case PFSregs: /* /proc/N/regs = -rw------- */
case PFSfpregs: /* /proc/N/fpregs = -rw------- */
pfs->pfs_mode = S_IRUSR|S_IWUSR;
vp->v_type = VREG;
break;
case PFSnote: /* /proc/N/note = --w------ */
case PFSnotepg: /* /proc/N/notepg = --w------ */
pfs->pfs_mode = S_IWUSR;
vp->v_type = VREG;
break;
case PFSmap: /* /proc/N/map = -r-------- */
case PFSmaps: /* /proc/N/maps = -r-------- */
case PFSauxv: /* /proc/N/auxv = -r-------- */
case PFSenviron: /* /proc/N/environ = -r-------- */
pfs->pfs_mode = S_IRUSR;
vp->v_type = VREG;
break;
case PFSstatus: /* /proc/N/status = -r--r--r-- */
case PFSstat: /* /proc/N/stat = -r--r--r-- */
case PFScmdline: /* /proc/N/cmdline = -r--r--r-- */
case PFSemul: /* /proc/N/emul = -r--r--r-- */
case PFSmeminfo: /* /proc/meminfo = -r--r--r-- */
case PFScpustat: /* /proc/stat = -r--r--r-- */
case PFSdevices: /* /proc/devices = -r--r--r-- */
case PFScpuinfo: /* /proc/cpuinfo = -r--r--r-- */
case PFSuptime: /* /proc/uptime = -r--r--r-- */
case PFSmounts: /* /proc/mounts = -r--r--r-- */
case PFSloadavg: /* /proc/loadavg = -r--r--r-- */
case PFSstatm: /* /proc/N/statm = -r--r--r-- */
case PFSversion: /* /proc/version = -r--r--r-- */
case PFSlimit: /* /proc/limit = -r--r--r-- */
pfs->pfs_mode = S_IRUSR|S_IRGRP|S_IROTH;
vp->v_type = VREG;
break;
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODETYPE_CASES
procfs_machdep_allocvp(vp);
break;
#endif
default:
panic("procfs_allocvp");
}
mutex_enter(&procfs_hashlock);
LIST_INSERT_HEAD(procfs_hashhead(pfs->pfs_pid), pfs, pfs_hash);
mutex_exit(&procfs_hashlock);
uvm_vnp_setsize(vp, 0);
*new_key = &pfs->pfs_key;
return 0;
bad:
vp->v_tag = VT_NON;
vp->v_type = VNON;
vp->v_op = NULL;
vp->v_data = NULL;
kmem_free(pfs, sizeof(*pfs));
return error;
}
void
procfs_init(void)
{
}
void
procfs_reinit(void)
{
}
void
procfs_done(void)
{
}
extern const struct vnodeopv_desc procfs_vnodeop_opv_desc;
const struct vnodeopv_desc * const procfs_vnodeopv_descs[] = {
&procfs_vnodeop_opv_desc,
NULL,
};
struct vfsops procfs_vfsops = {
.vfs_name = MOUNT_PROCFS,
.vfs_min_mount_data = sizeof (struct procfs_args),
.vfs_mount = procfs_mount,
.vfs_start = procfs_start,
.vfs_unmount = procfs_unmount,
.vfs_root = procfs_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = procfs_statvfs,
.vfs_sync = procfs_sync,
.vfs_vget = procfs_vget,
.vfs_loadvnode = procfs_loadvnode,
.vfs_fhtovp = (void *)eopnotsupp,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = procfs_init,
.vfs_reinit = procfs_reinit,
.vfs_done = procfs_done,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = procfs_vnodeopv_descs
};
static void
procfs_exechook_cb(struct proc *p, void *arg)
{
struct hashhead *head;
struct pfsnode *pfs;
struct mount *mp;
struct pfskey key;
struct vnode *vp;
int error;
if (arg == PROCFS_EXEC_HOOK && !(p->p_flag & PK_SUGID))
return;
head = procfs_hashhead(p->p_pid);
again:
mutex_enter(&procfs_hashlock);
LIST_FOREACH(pfs, head, pfs_hash) {
if (pfs->pfs_pid != p->p_pid)
continue;
mp = pfs->pfs_mount;
key = pfs->pfs_key;
vfs_ref(mp);
mutex_exit(&procfs_hashlock);
error = vcache_get(mp, &key, sizeof(key), &vp);
vfs_rele(mp);
if (error != 0)
goto again;
if (vrecycle(vp))
goto again;
do {
error = vfs_suspend(mp, 0);
} while (error == EINTR || error == ERESTART);
vgone(vp);
if (error == 0)
vfs_resume(mp);
goto again;
}
mutex_exit(&procfs_hashlock);
}
static int
procfs_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
struct pfsnode *pfs;
int result;
result = KAUTH_RESULT_DEFER;
p = arg0;
pfs = arg1;
if (action != KAUTH_PROCESS_PROCFS)
return result;
switch (pfs->pfs_type) {
case PFSregs:
case PFSfpregs:
case PFSmem:
if (kauth_cred_getuid(cred) != kauth_cred_getuid(p->p_cred) ||
ISSET(p->p_flag, PK_SUGID))
break;
/*FALLTHROUGH*/
default:
result = KAUTH_RESULT_ALLOW;
break;
}
return result;
}
SYSCTL_SETUP(procfs_sysctl_setup, "procfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "procfs",
SYSCTL_DESCR("Process file system"),
NULL, 0, NULL, 0,
CTL_VFS, 12, CTL_EOL);
/*
* XXX the "12" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "12" is the order as taken from sys/mount.h
*/
}
static int
procfs_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&procfs_vfsops);
if (error != 0)
break;
procfs_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
procfs_listener_cb, NULL);
procfs_exechook = exechook_establish(procfs_exechook_cb,
PROCFS_EXEC_HOOK);
procfs_exithook = exithook_establish(procfs_exechook_cb,
PROCFS_EXIT_HOOK);
mutex_init(&procfs_hashlock, MUTEX_DEFAULT, IPL_NONE);
procfs_hashtab = hashinit(PROCFS_HASHSIZE, HASH_LIST, true,
&procfs_hashmask);
break;
case MODULE_CMD_FINI:
error = vfs_detach(&procfs_vfsops);
if (error != 0)
break;
kauth_unlisten_scope(procfs_listener);
exechook_disestablish(procfs_exechook);
exithook_disestablish(procfs_exithook);
mutex_destroy(&procfs_hashlock);
hashdone(procfs_hashtab, HASH_LIST, procfs_hashmask);
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/* $NetBSD: kern_uidinfo.c,v 1.13 2021/12/28 13:28:24 riastradh Exp $ */
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_uidinfo.c,v 1.13 2021/12/28 13:28:24 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/proc.h>
#include <sys/atomic.h>
#include <sys/uidinfo.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/cpu.h>
static SLIST_HEAD(uihashhead, uidinfo) *uihashtbl;
static u_long uihash;
#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
static int
sysctl_kern_uidinfo_cnt(SYSCTLFN_ARGS)
{
static const struct {
const char *name;
u_int value;
} nv[] = {
#define _MEM(n) { # n, offsetof(struct uidinfo, ui_ ## n) }
_MEM(proccnt),
_MEM(lwpcnt),
_MEM(lockcnt),
_MEM(semcnt),
_MEM(sbsize),
#undef _MEM
};
for (size_t i = 0; i < __arraycount(nv); i++)
if (strcmp(nv[i].name, rnode->sysctl_name) == 0) {
uint64_t cnt;
struct sysctlnode node = *rnode;
struct uidinfo *uip;
node.sysctl_data = &cnt;
uip = uid_find(kauth_cred_geteuid(l->l_cred));
*(uint64_t *)node.sysctl_data =
*(u_long *)((char *)uip + nv[i].value);
return sysctl_lookup(SYSCTLFN_CALL(&node));
}
return EINVAL;
}
static struct sysctllog *kern_uidinfo_sysctllog;
static void
sysctl_kern_uidinfo_setup(void)
{
const struct sysctlnode *rnode, *cnode;
sysctl_createv(&kern_uidinfo_sysctllog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "uidinfo",
SYSCTL_DESCR("Resource usage per uid"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "proccnt",
SYSCTL_DESCR("Number of processes for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "lwpcnt",
SYSCTL_DESCR("Number of lwps for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "lockcnt",
SYSCTL_DESCR("Number of locks for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "semcnt",
SYSCTL_DESCR("Number of semaphores used for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "sbsize",
SYSCTL_DESCR("Socket buffers used for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
}
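/*
 * Illustrative usage sketch (an assumption, not taken from this file):
 * the nodes created above are read with sysctl(8) and report the
 * invoking user's own usage, e.g.:
 *
 *	$ sysctl kern.uidinfo.proccnt
 *	kern.uidinfo.proccnt = 42
 *
 * The value shown is made up; the node names follow from the
 * sysctl_createv() calls above.
 */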
static int
uid_stats(struct hashstat_sysctl *hs, bool fill)
{
struct uidinfo *uip;
uint64_t chain;
strlcpy(hs->hash_name, "uihash", sizeof(hs->hash_name));
strlcpy(hs->hash_desc, "user info (uid->used proc) hash",
sizeof(hs->hash_desc));
if (!fill)
return 0;
hs->hash_size = uihash + 1;
for (size_t i = 0; i < hs->hash_size; i++) {
chain = 0;
SLIST_FOREACH(uip, &uihashtbl[i], ui_hash) {
membar_datadep_consumer();
chain++;
}
if (chain > 0) {
hs->hash_used++;
hs->hash_items += chain;
if (chain > hs->hash_maxchain)
hs->hash_maxchain = chain;
}
}
return 0;
}
void
uid_init(void)
{
/*
* On an MP system, SLIST_FOREACH would force a cache line
* write-back for every modified 'uidinfo', so we try to keep the
* lists short.
*/
const u_int uihash_sz = (maxcpus > 1 ? 1024 : 64);
uihashtbl = hashinit(uihash_sz, HASH_SLIST, true, &uihash);
/*
* Ensure that uid 0 is always in the user hash table, as
* sbreserve() expects it available from interrupt context.
*/
(void)uid_find(0);
sysctl_kern_uidinfo_setup();
hashstat_register("uihash", uid_stats);
}
struct uidinfo *
uid_find(uid_t uid)
{
struct uidinfo *uip, *uip_first, *newuip;
struct uihashhead *uipp;
uipp = UIHASH(uid);
newuip = NULL;
/*
* To make the insertion atomic, the SLIST abstraction is violated here.
*/
uip_first = uipp->slh_first;
again:
SLIST_FOREACH(uip, uipp, ui_hash) {
membar_datadep_consumer();
if (uip->ui_uid != uid)
continue;
if (newuip != NULL)
kmem_free(newuip, sizeof(*newuip));
return uip;
}
if (newuip == NULL)
newuip = kmem_zalloc(sizeof(*newuip), KM_SLEEP);
newuip->ui_uid = uid;
/*
* If the atomic insert is unsuccessful, another thread might have
* allocated this 'uid', so a full re-check is needed.
*/
newuip->ui_hash.sle_next = uip_first;
membar_producer();
uip = atomic_cas_ptr(&uipp->slh_first, uip_first, newuip);
if (uip != uip_first) {
uip_first = uip;
goto again;
}
return newuip;
}
/*
* Change the count associated with number of processes
* a given user is using.
*/
int
chgproccnt(uid_t uid, int diff)
{
struct uidinfo *uip;
long proccnt;
uip = uid_find(uid);
proccnt = atomic_add_long_nv(&uip->ui_proccnt, diff);
KASSERTMSG(proccnt >= 0, "uid=%d diff=%d proccnt=%ld",
uid, diff, proccnt);
return proccnt;
}
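/*
 * Illustrative usage sketch (an assumption, not taken from this file):
 * fork-time accounting typically bumps the per-user process count and
 * backs the change out again if the caller's RLIMIT_NPROC limit would
 * be exceeded:
 *
 *	if (__predict_false(chgproccnt(uid, 1) > lim && uid != 0)) {
 *		(void)chgproccnt(uid, -1);
 *		return EAGAIN;
 *	}
 *
 * where 'lim' stands for the caller's RLIMIT_NPROC soft limit.
 */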
/*
* Change the count associated with number of lwps
* a given user is using.
*/
int
chglwpcnt(uid_t uid, int diff)
{
struct uidinfo *uip;
long lwpcnt;
uip = uid_find(uid);
lwpcnt = atomic_add_long_nv(&uip->ui_lwpcnt, diff);
KASSERTMSG(lwpcnt >= 0, "uid=%d diff=%d lwpcnt=%ld",
uid, diff, lwpcnt);
return lwpcnt;
}
/*
* Change the count associated with number of semaphores
* a given user is using.
*/
int
chgsemcnt(uid_t uid, int diff)
{
struct uidinfo *uip;
long semcnt;
uip = uid_find(uid);
semcnt = atomic_add_long_nv(&uip->ui_semcnt, diff);
KASSERTMSG(semcnt >= 0, "uid=%d diff=%d semcnt=%ld",
uid, diff, semcnt);
return semcnt;
}
int
chgsbsize(struct uidinfo *uip, u_long *hiwat, u_long to, rlim_t xmax)
{
rlim_t nsb;
const long diff = to - *hiwat;
nsb = (rlim_t)atomic_add_long_nv((long *)&uip->ui_sbsize, diff);
if (diff > 0 && nsb > xmax) {
atomic_add_long((long *)&uip->ui_sbsize, -diff);
return 0;
}
*hiwat = to;
return 1;
}
/* $NetBSD: userret.h,v 1.13 2018/07/26 09:29:08 maxv Exp $ */
/*
* XXXfvdl same as i386 counterpart, but should probably be independent.
*/
/*-
* Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/userret.h>
static __inline void userret(struct lwp *);
/*
* Define the code needed before returning to user mode, for
* trap and syscall.
*/
static __inline void
userret(struct lwp *l)
{
/* Invoke MI userret code */
mi_userret(l);
}
/* $NetBSD: tty_60.c,v 1.11 2021/07/21 06:35:44 skrll Exp $ */
/*-
* Copyright (c) 2012 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Alan Barrett
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tty_60.c,v 1.11 2021/07/21 06:35:44 skrll Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/compat_stub.h>
#include <sys/kmem.h>
#include <sys/tty.h>
#include <compat/common/compat_mod.h>
#include <compat/sys/ttycom.h>
/* convert struct ptmget to struct compat_60_ptmget */
static int
ptmget_to_ptmget60(struct ptmget *pg, struct compat_60_ptmget *pg60)
{
memset(pg60, 0, sizeof(*pg60));
pg60->cfd = pg->cfd;
pg60->sfd = pg->sfd;
strlcpy(pg60->cn, pg->cn, sizeof(pg60->cn));
strlcpy(pg60->sn, pg->sn, sizeof(pg60->sn));
if (strlen(pg->cn) >= sizeof(pg60->cn)
|| strlen(pg->sn) >= sizeof(pg60->sn))
return E2BIG;
return 0;
}
/* Helper for compat ioctls that use struct compat_60_ptmget. */
static int
compat_60_ptmget_ioctl(dev_t dev, u_long cmd, void *data, int flag,
struct lwp *l)
{
int ret;
u_long newcmd;
struct ptmget *pg;
const struct cdevsw *cd = cdevsw_lookup(dev);
if (cd == NULL || cd->d_ioctl == NULL)
return ENXIO;
switch (cmd) {
case COMPAT_60_TIOCPTMGET: newcmd = TIOCPTMGET; break;
case COMPAT_60_TIOCPTSNAME: newcmd = TIOCPTSNAME; break;
default: return ENOTTY;
}
pg = kmem_alloc(sizeof(*pg), KM_SLEEP);
ret = (cd->d_ioctl)(dev, newcmd, pg, flag, l);
if (ret != 0)
goto out;
ret = ptmget_to_ptmget60(pg, data);
out:
kmem_free(pg, sizeof(*pg));
return ret;
}
/*
* COMPAT_60 versions of ttioctl and ptmioctl.
*/
int
compat_60_ttioctl(struct tty *tp, u_long cmd, void *data, int flag,
struct lwp *l)
{
switch (cmd) {
case COMPAT_60_TIOCPTMGET:
case COMPAT_60_TIOCPTSNAME:
return compat_60_ptmget_ioctl(tp->t_dev, cmd, data, flag, l);
default:
return EPASSTHROUGH;
}
}
int
compat_60_ptmioctl(dev_t dev, u_long cmd, void *data, int flag,
struct lwp *l)
{
switch (cmd) {
case COMPAT_60_TIOCPTMGET:
return compat_60_ptmget_ioctl(dev, cmd, data, flag, l);
default:
return EPASSTHROUGH;
}
}
void
kern_tty_60_init(void)
{
MODULE_HOOK_SET(tty_ttioctl_60_hook, compat_60_ttioctl);
MODULE_HOOK_SET(tty_ptmioctl_60_hook, compat_60_ptmioctl);
}
void
kern_tty_60_fini(void)
{
MODULE_HOOK_UNSET(tty_ttioctl_60_hook);
MODULE_HOOK_UNSET(tty_ptmioctl_60_hook);
}
/* $NetBSD: lwp.h,v 1.231 2023/11/02 10:31:55 martin Exp $ */
/*
* Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Nathan J. Williams and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_LWP_H_
#define _SYS_LWP_H_
#if defined(_KERNEL) || defined(_KMEMUSER)
#include <sys/param.h>
#include <sys/callout.h>
#include <sys/condvar.h>
#include <sys/kcpuset.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/resource.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/specificdata.h>
#include <sys/time.h>
#include <sys/wchan.h>
#if defined(_KERNEL)
struct lwp;
/* forward declare this for <machine/cpu.h> so it can get l_cpu. */
static __inline struct cpu_info *lwp_getcpu(struct lwp *);
#include <machine/cpu.h> /* curcpu() and cpu_info */
#include <sys/atomic.h>
#ifdef _KERNEL_OPT
#include "opt_kcov.h"
#include "opt_kmsan.h"
#include "opt_maxlwp.h"
#endif
#endif
#include <machine/proc.h> /* Machine-dependent proc substruct. */
/*
* Lightweight process. Field markings and the corresponding locks:
*
* a: proc_lock
* c: condition variable interlock, passed to cv_wait()
* l: *l_mutex
* p: l_proc->p_lock
* s: spc_mutex, which may or may not be referenced by l_mutex
* S: l_selcluster->sc_lock
* (: unlocked, stable
* !: unlocked, may only be reliably accessed by the LWP itself
*
* Fields are clustered together by usage (to increase the likelihood
* of cache hits) and by size (to reduce dead space in the structure).
*/
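/*
 * Illustrative sketch (an assumption, not taken from this header): a
 * field marked 'l:' above may only be touched with the LWP locked,
 * e.g.:
 *
 *	lwp_lock(l);
 *	pri = l->l_priority;		(an 'l:' field, guarded by *l_mutex)
 *	lwp_unlock(l);
 */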
#include <sys/pcu.h>
struct lockdebug;
struct sysent;
struct lwp {
/* Must not be zeroed on free. */
struct cpu_info *volatile l_cpu;/* s: CPU we're on if LSONPROC */
kmutex_t * volatile l_mutex; /* l: ptr to mutex on sched state */
struct turnstile *l_ts; /* l: current turnstile */
int l_stat; /* l: overall LWP status */
int l__reserved; /* : padding - reuse as needed */
/* Scheduling and overall state. */
#define l_startzero l_runq
TAILQ_ENTRY(lwp) l_runq; /* s: run queue */
union {
void * info; /* s: scheduler-specific structure */
u_int timeslice; /* l: time-quantum for SCHED_M2 */
} l_sched;
void *l_addr; /* l: PCB address; use lwp_getpcb() */
struct mdlwp l_md; /* l: machine-dependent fields. */
struct bintime l_rtime; /* l: real time */
struct bintime l_stime; /* l: start time (while ONPROC) */
int l_flag; /* l: misc flag values */
u_int l_swtime; /* l: time swapped in or out */
u_int l_rticks; /* l: Saved start time of run */
u_int l_rticksum; /* l: Sum of ticks spent running */
u_int l_slpticks; /* l: Saved start time of sleep */
u_int l_slpticksum; /* l: Sum of ticks spent sleeping */
int l_class; /* l: scheduling class */
pri_t l_boostpri; /* l: boosted priority after blocking */
pri_t l_priority; /* l: scheduler priority */
pri_t l_inheritedprio;/* l: inherited priority */
pri_t l_protectprio; /* l: for PTHREAD_PRIO_PROTECT */
pri_t l_auxprio; /* l: max(inherit,protect) priority */
int l_protectdepth; /* l: for PTHREAD_PRIO_PROTECT */
u_int l_cpticks; /* (: Ticks of CPU time */
psetid_t l_psid; /* l: assigned processor-set ID */
fixpt_t l_pctcpu; /* p: %cpu during l_swtime */
fixpt_t l_estcpu; /* l: cpu time for SCHED_4BSD */
SLIST_HEAD(, turnstile) l_pi_lenders; /* l: ts lending us priority */
struct cpu_info *l_target_cpu; /* l: target CPU to migrate */
struct lwpctl *l_lwpctl; /* p: lwpctl block kernel address */
struct lcpage *l_lcpage; /* p: lwpctl containing page */
kcpuset_t *l_affinity; /* l: CPU set for affinity */
/* Synchronisation. */
const struct syncobj *l_syncobj;/* l: sync object operations set */
LIST_ENTRY(lwp) l_sleepchain; /* l: sleep queue */
wchan_t l_wchan; /* l: sleep address */
const char *l_wmesg; /* l: reason for sleep */
struct sleepq *l_sleepq; /* l: current sleep queue */
callout_t l_timeout_ch; /* !: callout for tsleep */
kcondvar_t l_waitcv; /* a: vfork() wait */
u_int l_slptime; /* l: time since last blocked */
bool l_vforkwaiting; /* a: vfork() waiting */
/* User-space synchronization. */
uintptr_t l_robust_head; /* !: list of robust futexes */
uint32_t l___rsvd1; /* reserved for future use */
#if PCU_UNIT_COUNT > 0
struct cpu_info * volatile l_pcu_cpu[PCU_UNIT_COUNT];
uint32_t l_pcu_valid;
#endif
/* Process level and global state, misc. */
lwpid_t l_lid; /* (: LWP identifier; local to proc */
LIST_ENTRY(lwp) l_list; /* a: entry on list of all LWPs */
void *l_ctxlink; /* p: uc_link {get,set}context */
struct proc *l_proc; /* p: parent process */
LIST_ENTRY(lwp) l_sibling; /* p: entry on proc's list of LWPs */
char *l_name; /* (: name, optional */
lwpid_t l_waiter; /* p: first LWP waiting on us */
lwpid_t l_waitingfor; /* p: specific LWP we are waiting on */
int l_prflag; /* p: process level flags */
u_int l_refcnt; /* p: reference count on this LWP */
/* State of select() or poll(). */
int l_selflag; /* S: polling state flags */
int l_selret; /* S: return value of select/poll */
SLIST_HEAD(,selinfo) l_selwait; /* S: descriptors waited on */
uintptr_t l_selrec; /* !: argument for selrecord() */
struct selcluster *l_selcluster;/* !: associated cluster data */
void * l_selbits; /* (: select() bit-field */
size_t l_selni; /* (: size of a single bit-field */
/* Signals. */
int l_sigrestore; /* p: need to restore old sig mask */
sigset_t l_sigwaitset; /* p: signals being waited for */
kcondvar_t l_sigcv; /* p: for sigsuspend() */
struct ksiginfo *l_sigwaited; /* p: delivered signals from set */
sigpend_t *l_sigpendset; /* p: XXX issignal()/postsig() baton */
LIST_ENTRY(lwp) l_sigwaiter; /* p: chain on list of waiting LWPs */
stack_t l_sigstk; /* p: sp & on stack state variable */
sigset_t l_sigmask; /* p: signal mask */
sigpend_t l_sigpend; /* p: signals to this LWP */
sigset_t l_sigoldmask; /* p: mask for sigpause */
/* Private data. */
specificdata_reference
l_specdataref; /* !: subsystem lwp-specific data */
struct timespec l_ktrcsw; /* !: for ktrace CSW trace XXX */
void *l_private; /* !: svr4-style lwp-private data */
struct lwp *l_switchto; /* !: mi_switch: switch to this LWP */
struct kauth_cred *l_cred; /* !: cached credentials */
struct filedesc *l_fd; /* !: cached copy of proc::p_fd */
void *l_emuldata; /* !: kernel lwp-private data */
struct fstrans_lwp_info *l_fstrans; /* (: fstrans private data */
u_short l_shlocks; /* !: lockdebug: shared locks held */
u_short l_exlocks; /* !: lockdebug: excl. locks held */
u_short l_psrefs; /* !: count of psref held */
u_short l_blcnt; /* !: count of kernel_lock held */
volatile int l_nopreempt; /* !: don't preempt me! */
volatile u_int l_dopreempt; /* s: kernel preemption pending */
int l_pflag; /* !: LWP private flags */
int l_dupfd; /* !: side return from cloning devs XXX */
const struct sysent * volatile l_sysent;/* !: currently active syscall */
struct rusage l_ru; /* !: accounting information */
uint64_t l_pfailtime; /* !: for kernel preemption */
uintptr_t l_pfailaddr; /* !: for kernel preemption */
uintptr_t l_pfaillock; /* !: for kernel preemption */
_TAILQ_HEAD(,struct lockdebug,volatile) l_ld_locks;/* !: locks held by LWP */
volatile void *l_ld_wanted; /* !: lock currently wanted by LWP */
uintptr_t l_rwcallsite; /* !: rwlock actual callsite */
int l_tcgen; /* !: for timecounter removal */
/* These are only used by 'options SYSCALL_TIMES'. */
uint32_t l_syscall_time; /* !: time epoch for current syscall */
uint64_t *l_syscall_counter; /* !: counter for current process */
struct kdtrace_thread *l_dtrace; /* (: DTrace-specific data. */
#ifdef KMSAN
void *l_kmsan; /* !: KMSAN private data. */
#endif
#ifdef KCOV
void *l_kcov; /* !: KCOV private data. */
#endif
};
/*
* UAREA_PCB_OFFSET: the offset of the PCB structure in the uarea. MD code may
* define it in <machine/proc.h>, to indicate a different uarea layout.
*/
#ifndef UAREA_PCB_OFFSET
#define UAREA_PCB_OFFSET 0
#endif
LIST_HEAD(lwplist, lwp); /* A list of LWPs. */
#ifdef _KERNEL
extern struct lwplist alllwp; /* List of all LWPs. */
extern lwp_t lwp0; /* LWP for proc0. */
extern int maxlwp __read_mostly; /* max number of lwps */
#ifndef MAXLWP
#define MAXLWP 4096 /* default max */
#endif
#ifndef MAXMAXLWP
#define MAXMAXLWP 65535 /* absolute max */
#endif
#endif
#endif /* _KERNEL || _KMEMUSER */
/*
* These flags are kept in l_flag, and they are modified only with the LWP
* locked.
*/
#define LW_IDLE 0x00000001 /* Idle lwp. */
#define LW_LWPCTL 0x00000002 /* Adjust lwpctl in userret */
#define LW_STIMO 0x00000040 /* Sleep timed out */
#define LW_SINTR 0x00000080 /* Sleep is interruptible. */
#define LW_CATCHINTR 0x00000100 /* LW_SINTR intent; see sleepq_block(). */
#define LW_SYSTEM 0x00000200 /* Kernel thread */
#define LW_SYSTEM_FPU 0x00000400 /* Kernel thread with vector/FP enabled */
#define LW_DBGSUSPEND 0x00010000 /* Suspend by debugger */
#define LW_WSUSPEND 0x00020000 /* Suspend before return to user */
#define LW_BATCH 0x00040000 /* LWP tends to hog CPU */
#define LW_WCORE 0x00080000 /* Stop for core dump on return to user */
#define LW_WEXIT 0x00100000 /* Exit before return to user */
#define LW_PENDSIG 0x01000000 /* Pending signal for us */
#define LW_CANCELLED 0x02000000 /* tsleep should not sleep */
#define LW_CACHECRED 0x04000000 /* Cache new process credential */
#define LW_WREBOOT 0x08000000 /* System is rebooting, please suspend */
#define LW_UNPARKED 0x10000000 /* Unpark op pending */
#define LW_RUMP_CLEAR 0x40000000 /* Clear curlwp in RUMP scheduler */
#define LW_RUMP_QEXIT 0x80000000 /* LWP should exit ASAP */
/*
* The second set of flags is kept in l_pflag, and they are modified only by
* the LWP itself, or modified when it's known the LWP cannot be running.
* LP_RUNNING is typically updated with the LWP locked, but not always in
* the case of soft interrupt handlers.
*/
#define LP_KTRACTIVE 0x00000001 /* Executing ktrace operation */
#define LP_KTRCSW 0x00000002 /* ktrace context switch marker */
#define LP_KTRCSWUSER 0x00000004 /* ktrace context switch marker */
/* 0x00000008 was LP_PIDLID */
#define LP_OWEUPC 0x00000010 /* Owe user profiling tick */
#define LP_MPSAFE 0x00000020 /* Starts life without kernel_lock */
#define LP_INTR 0x00000040 /* Soft interrupt handler */
#define LP_SYSCTLWRITE 0x00000080 /* sysctl write lock held */
#define LP_MUSTJOIN 0x00000100 /* Must join kthread on exit */
#define LP_SINGLESTEP 0x00000400 /* Single step thread in ptrace(2) */
#define LP_TIMEINTR 0x00010000 /* Time this soft interrupt */
#define LP_PREEMPTING 0x00020000 /* mi_switch called involuntarily */
#define LP_RUNNING 0x20000000 /* Active on a CPU */
#define LP_TELEPORT 0x40000000 /* Teleport to new CPU on preempt() */
#define LP_BOUND 0x80000000 /* Bound to a CPU */
/*
* The third set of flags is kept in l_prflag and they are modified only
* with p_lock held.
*/
#define LPR_DETACHED 0x00800000 /* Won't be waited for. */
#define LPR_DRAINING 0x80000000 /* Draining references before exiting */
/*
* Mask indicating that there is "exceptional" work to be done on return to
* user.
*/
#define LW_USERRET (LW_WEXIT | LW_PENDSIG | LW_WREBOOT | LW_WSUSPEND \
| LW_WCORE | LW_LWPCTL | LW_CACHECRED)
/*
* Status values.
*
* A note about LSRUN and LSONPROC: LSRUN indicates that a process is
* runnable but *not* yet running, i.e. is on a run queue. LSONPROC
* indicates that the process is actually executing on a CPU, i.e.
* it is no longer on a run queue.
*
* These values are set in stone and must not be reused with future changes.
*/
#define LSIDL 1 /* Process being created by fork. */
#define LSRUN 2 /* Currently runnable. */
#define LSSLEEP 3 /* Sleeping on an address. */
#define LSSTOP 4 /* Process debugging or suspension. */
#define LSZOMB 5 /* Awaiting collection by parent. */
/* define LSDEAD 6 Process is almost a zombie. (removed in 5.0) */
#define LSONPROC 7 /* Process is currently on a CPU. */
#define LSSUSPENDED 8 /* Not running, not signalable. */
#if defined(_KERNEL) || defined(_KMEMUSER)
static __inline void *
lwp_getpcb(struct lwp *l)
{
return l->l_addr;
}
#endif /* _KERNEL || _KMEMUSER */
#ifdef _KERNEL
void lwpinit(void);
void lwp0_init(void);
void lwp_startup(lwp_t *, lwp_t *);
void startlwp(void *);
void lwp_lock(lwp_t *);
void lwp_unlock(lwp_t *);
pri_t lwp_eprio(lwp_t *);
int lwp_locked(lwp_t *, kmutex_t *);
kmutex_t *lwp_setlock(lwp_t *, kmutex_t *);
void lwp_unlock_to(lwp_t *, kmutex_t *);
int lwp_trylock(lwp_t *);
void lwp_changepri(lwp_t *, pri_t);
void lwp_lendpri(lwp_t *, pri_t);
void lwp_addref(lwp_t *);
void lwp_delref(lwp_t *);
void lwp_delref2(lwp_t *);
bool lwp_drainrefs(lwp_t *);
bool lwp_alive(lwp_t *);
lwp_t *lwp_find_first(proc_t *);
int lwp_wait(lwp_t *, lwpid_t, lwpid_t *, bool);
void lwp_continue(lwp_t *);
void lwp_unsleep(lwp_t *, bool);
void lwp_unstop(lwp_t *);
void lwp_exit(lwp_t *);
int lwp_suspend(lwp_t *, lwp_t *);
int lwp_create1(lwp_t *, const void *, size_t, u_long, lwpid_t *);
void lwp_start(lwp_t *, int);
void lwp_migrate(lwp_t *, struct cpu_info *);
lwp_t * lwp_find2(pid_t, lwpid_t);
lwp_t * lwp_find(proc_t *, int);
void lwp_userret(lwp_t *);
void lwp_need_userret(lwp_t *);
void lwp_free(lwp_t *, bool, bool);
long lwp_pctr(void);
int lwp_setprivate(lwp_t *, void *);
int do_lwp_create(lwp_t *, void *, u_long, lwp_t **, const sigset_t *,
const stack_t *);
void lwp_thread_cleanup(lwp_t *);
void lwpinit_specificdata(void);
int lwp_specific_key_create(specificdata_key_t *, specificdata_dtor_t);
void lwp_specific_key_delete(specificdata_key_t);
void lwp_initspecific(lwp_t *);
void lwp_finispecific(lwp_t *);
void *lwp_getspecific(specificdata_key_t);
#if defined(_LWP_API_PRIVATE)
void *_lwp_getspecific_by_lwp(lwp_t *, specificdata_key_t);
#endif
void lwp_setspecific(specificdata_key_t, void *);
void lwp_setspecific_by_lwp(lwp_t *, specificdata_key_t, void *);
/* Syscalls. */
int lwp_park(clockid_t, int, struct timespec *);
int lwp_unpark(const lwpid_t *, const u_int);
/* DDB. */
void lwp_whatis(uintptr_t, void (*)(const char *, ...) __printflike(1, 2));
int lwp_create(lwp_t *, struct proc *, vaddr_t, int, void *, size_t,
void (*)(void *), void *, lwp_t **, int, const sigset_t *, const stack_t *);
/*
* XXX _MODULE
* We should provide real stubs for the below that modules can use.
*/
static __inline void
spc_lock(struct cpu_info *ci)
{
mutex_spin_enter(ci->ci_schedstate.spc_mutex);
}
static __inline void
spc_unlock(struct cpu_info *ci)
{
mutex_spin_exit(ci->ci_schedstate.spc_mutex);
}
static __inline void
spc_dlock(struct cpu_info *ci1, struct cpu_info *ci2)
{
struct schedstate_percpu *spc1 = &ci1->ci_schedstate;
struct schedstate_percpu *spc2 = &ci2->ci_schedstate;
KASSERT(ci1 != ci2);
if (ci1 < ci2) {
mutex_spin_enter(spc1->spc_mutex);
mutex_spin_enter(spc2->spc_mutex);
} else {
mutex_spin_enter(spc2->spc_mutex);
mutex_spin_enter(spc1->spc_mutex);
}
}
/*
* Allow machine-dependent code to override curlwp in <machine/cpu.h> for
* its own convenience. Otherwise, we declare it as appropriate.
*/
#if !defined(curlwp)
#if defined(MULTIPROCESSOR)
#define curlwp curcpu()->ci_curlwp /* Current running LWP */
#else
extern struct lwp *curlwp; /* Current running LWP */
#endif /* MULTIPROCESSOR */
#endif /* ! curlwp */
#define curproc (curlwp->l_proc)
/*
* This provides a way for <machine/cpu.h> to get l_cpu for curlwp before
* struct lwp is defined.
*/
static __inline struct cpu_info *
lwp_getcpu(struct lwp *l)
{
return l->l_cpu;
}
static __inline bool
CURCPU_IDLE_P(void)
{
struct cpu_info *ci = curcpu();
return ci->ci_onproc == ci->ci_data.cpu_idlelwp;
}
/*
* Disable and re-enable preemption. Only for low-level kernel
* use. Device drivers and anything that could potentially be
* compiled as a module should use kpreempt_disable() and
* kpreempt_enable().
*/
static __inline void
KPREEMPT_DISABLE(lwp_t *l)
{
struct lwp *l1 __diagused;
KASSERTMSG(l == (l1 = curlwp), "l=%p curlwp=%p", l, l1);
l->l_nopreempt++;
__insn_barrier();
}
static __inline void
KPREEMPT_ENABLE(lwp_t *l)
{
struct lwp *l1 __diagused;
KASSERTMSG(l == (l1 = curlwp), "l=%p curlwp=%p", l, l1);
KASSERT(l->l_nopreempt > 0);
__insn_barrier();
l->l_nopreempt--;
__insn_barrier();
if (__predict_false(l->l_dopreempt))
kpreempt(0);
}
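/*
 * Illustrative usage sketch (an assumption, not taken from this header):
 * drivers and modules use the kpreempt_disable()/kpreempt_enable()
 * wrappers rather than the macros above, e.g. to keep curcpu() stable:
 *
 *	kpreempt_disable();
 *	ci = curcpu();
 *	... use ci without risk of migrating to another CPU ...
 *	kpreempt_enable();
 */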
/* For lwp::l_dopreempt */
#define DOPREEMPT_ACTIVE 0x01
#define DOPREEMPT_COUNTED 0x02
/*
* Prevent curlwp from migrating between CPUs between curlwp_bind and
* curlwp_bindx. One use case is psref(9) that has a contract that
* forbids migrations.
*/
static __inline int
curlwp_bind(void)
{
int bound;
bound = curlwp->l_pflag & LP_BOUND;
curlwp->l_pflag |= LP_BOUND;
__insn_barrier();
return bound;
}
static __inline void
curlwp_bindx(int bound)
{
KASSERT(curlwp->l_pflag & LP_BOUND);
__insn_barrier();
curlwp->l_pflag ^= bound ^ LP_BOUND;
}
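/*
 * Illustrative usage sketch (an assumption, not taken from this header):
 * a psref(9)-style critical section is bracketed with curlwp_bind() and
 * curlwp_bindx() so the LWP cannot migrate while the reference is held:
 *
 *	int bound = curlwp_bind();
 *	... acquire and use the passive reference ...
 *	curlwp_bindx(bound);
 */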
#endif /* _KERNEL */
/* Flags for _lwp_create(), as per Solaris. */
#define LWP_DETACHED 0x00000040
#define LWP_SUSPENDED 0x00000080
/* Kernel-internal flags for LWP creation. */
/* 0x40000000 was LWP_PIDLID */
#define LWP_VFORK 0x80000000
#endif /* !_SYS_LWP_H_ */
/* $NetBSD: bus_private.h,v 1.16 2022/01/22 15:10:32 skrll Exp $ */
/* NetBSD: bus.h,v 1.8 2005/03/09 19:04:46 matt Exp */
/*-
* Copyright (c) 1996, 1997, 1998, 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1996 Charles M. Hannum. All rights reserved.
* Copyright (c) 1996 Christopher G. Demetriou. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou
* for the NetBSD Project.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(_X86_BUS_PRIVATE_H_)
#define _X86_BUS_PRIVATE_H_
/*
* Cookie used for bounce buffers. A pointer to one of these is stashed in
* the DMA map.
*/
struct x86_bus_dma_cookie {
int id_flags; /* flags; see below */
/*
* Information about the original buffer used during
* DMA map syncs. Note that id_origbuflen is only used
* for X86_DMA_BUFTYPE_LINEAR.
*/
void *id_origbuf; /* pointer to orig buffer if
bouncing */
bus_size_t id_origbuflen; /* ...and size */
int id_buftype; /* type of buffer */
void *id_bouncebuf; /* pointer to the bounce buffer */
bus_size_t id_bouncebuflen; /* ...and size */
int id_nbouncesegs; /* number of valid bounce segs */
bus_dma_segment_t id_bouncesegs[0]; /* array of bounce buffer
physical memory segments */
};
/* id_flags */
#define X86_DMA_MIGHT_NEED_BOUNCE 0x01 /* may need bounce buffers */
#define X86_DMA_HAS_BOUNCE 0x02 /* has bounce buffers */
#define X86_DMA_IS_BOUNCING 0x04 /* is bouncing current xfer */
/* id_buftype */
#define X86_DMA_BUFTYPE_INVALID 0
#define X86_DMA_BUFTYPE_LINEAR 1
#define X86_DMA_BUFTYPE_MBUF 2
#define X86_DMA_BUFTYPE_UIO 3
#define X86_DMA_BUFTYPE_RAW 4
/*
* default address translation macros, which are appropriate where
* paddr_t == bus_addr_t.
*/
#if !defined(_BUS_PHYS_TO_BUS)
#define _BUS_PHYS_TO_BUS(pa) ((bus_addr_t)(pa))
#endif /* !defined(_BUS_PHYS_TO_BUS) */
#if !defined(_BUS_BUS_TO_PHYS)
#define _BUS_BUS_TO_PHYS(ba) ((paddr_t)(ba))
#endif /* !defined(_BUS_BUS_TO_PHYS) */
#if !defined(_BUS_VM_PAGE_TO_BUS)
#define _BUS_VM_PAGE_TO_BUS(pg) _BUS_PHYS_TO_BUS(VM_PAGE_TO_PHYS(pg))
#endif /* !defined(_BUS_VM_PAGE_TO_BUS) */
#if !defined(_BUS_BUS_TO_VM_PAGE)
#define _BUS_BUS_TO_VM_PAGE(ba) PHYS_TO_VM_PAGE(ba)
#endif /* !defined(_BUS_BUS_TO_VM_PAGE) */
#if !defined(_BUS_PMAP_ENTER)
#define _BUS_PMAP_ENTER(pmap, va, ba, prot, flags) \
pmap_enter(pmap, va, ba, prot, flags)
#endif /* _BUS_PMAP_ENTER */
#if !defined(_BUS_VIRT_TO_BUS)
#include <uvm/uvm_extern.h>
static __inline bus_addr_t _bus_virt_to_bus(struct pmap *, vaddr_t);
#define _BUS_VIRT_TO_BUS(pm, va) _bus_virt_to_bus((pm), (va))
static __inline bus_addr_t
_bus_virt_to_bus(struct pmap *pm, vaddr_t va)
{
paddr_t pa;
if (!pmap_extract(pm, va, &pa)) {
panic("_bus_virt_to_bus");
}
return _BUS_PHYS_TO_BUS(pa);
}
#endif /* !defined(_BUS_VIRT_TO_BUS) */
/*
* By default, the end address of RAM visible on the bus is the same as the
* largest physical address.
*/
#ifndef _BUS_AVAIL_END
#define _BUS_AVAIL_END (avail_end - 1)
#endif
struct x86_bus_dma_tag {
bus_dma_tag_t bdt_super;
/* bdt_present: bitmap indicating overrides present (1) in *this* tag,
* bdt_exists: bitmap indicating overrides present (1) in *this* tag
* or in an ancestor's tag (follow bdt_super to ancestors)
*/
uint64_t bdt_present;
uint64_t bdt_exists;
const struct bus_dma_overrides *bdt_ov;
void *bdt_ctx;
/*
* The `bounce threshold' is checked while we are loading
* the DMA map. If the physical address of the segment
* exceeds the threshold, an error will be returned. The
* caller can then take whatever action is necessary to
* bounce the transfer. If this value is 0, it will be
* ignored.
*/
int _tag_needs_free;
bus_addr_t _bounce_thresh;
bus_addr_t _bounce_alloc_lo;
bus_addr_t _bounce_alloc_hi;
int (*_may_bounce)(bus_dma_tag_t, bus_dmamap_t, int, int *);
};
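/*
 * Illustrative sketch (an assumption, not taken from this header): a tag
 * that must bounce transfers above a physical address limit would set
 * _bounce_thresh and a matching bounce allocation window, e.g. for an
 * ISA-style 16MB limit:
 *
 *	tag._bounce_thresh = 0x1000000;
 *	tag._bounce_alloc_lo = 0;
 *	tag._bounce_alloc_hi = 0x1000000;
 */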
#endif /* !defined(_X86_BUS_PRIVATE_H_) */
/* $NetBSD: subr_bufq.c,v 1.27 2019/02/17 23:17:41 bad Exp $ */
/* NetBSD: subr_disk.c,v 1.70 2005/08/20 12:00:01 yamt Exp $ */
/*-
* Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_bufq.c,v 1.27 2019/02/17 23:17:41 bad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/sysctl.h>
#include <sys/module.h>
#define STRAT_MATCH(id, bs) (strcmp((id), (bs)->bs_name) == 0)
static void sysctl_kern_bufq_strategies_setup(struct sysctllog **);
static SLIST_HEAD(, bufq_strat) bufq_strat_list =
SLIST_HEAD_INITIALIZER(bufq_strat_list);
static kmutex_t bufq_mutex;
static struct sysctllog *sysctllog;
void
bufq_init(void)
{
mutex_init(&bufq_mutex, MUTEX_DEFAULT, IPL_NONE);
sysctl_kern_bufq_strategies_setup(&sysctllog);
}
int
bufq_register(struct bufq_strat *bs)
{
mutex_enter(&bufq_mutex);
SLIST_INSERT_HEAD(&bufq_strat_list, bs, bs_next);
bs->bs_refcnt = 0;
mutex_exit(&bufq_mutex);
return 0;
}
int
bufq_unregister(struct bufq_strat *bs)
{
mutex_enter(&bufq_mutex);
if (bs->bs_refcnt != 0) {
mutex_exit(&bufq_mutex);
return EBUSY;
}
SLIST_REMOVE(&bufq_strat_list, bs, bufq_strat, bs_next);
mutex_exit(&bufq_mutex);
return 0;
}
/*
* Create a device buffer queue.
*/
int
bufq_alloc(struct bufq_state **bufqp, const char *strategy, int flags)
{
struct bufq_strat *bsp, *it;
struct bufq_state *bufq;
int error = 0;
u_int gen;
bool found_exact;
char strategy_module_name[MAXPATHLEN];
KASSERT((flags & BUFQ_EXACT) == 0 || strategy != BUFQ_STRAT_ANY);
switch (flags & BUFQ_SORT_MASK) {
case BUFQ_SORT_RAWBLOCK:
case BUFQ_SORT_CYLINDER:
break;
case 0:
/*
* for strategies that do not care about block numbers,
* e.g. fcfs.
*/
flags |= BUFQ_SORT_RAWBLOCK;
break;
default:
panic("bufq_alloc: sort out of range");
}
/*
* Select a strategy.
* If the strategy requested by the caller is found, use it;
* otherwise, select the one with the largest bs_prio.
*/
mutex_enter(&bufq_mutex);
do {
gen = module_gen;
bsp = NULL;
found_exact = false;
SLIST_FOREACH(it, &bufq_strat_list, bs_next) {
if (strategy != BUFQ_STRAT_ANY &&
STRAT_MATCH(strategy, (it))) {
bsp = it;
found_exact = true;
break;
}
if (bsp == NULL || (it)->bs_prio > bsp->bs_prio)
bsp = it;
}
if (strategy == BUFQ_STRAT_ANY || found_exact)
break;
/* Try to autoload the bufq strategy module */
strlcpy(strategy_module_name, "bufq_",
sizeof(strategy_module_name));
strlcat(strategy_module_name, strategy,
sizeof(strategy_module_name));
mutex_exit(&bufq_mutex);
(void) module_autoload(strategy_module_name, MODULE_CLASS_BUFQ);
mutex_enter(&bufq_mutex);
} while (gen != module_gen);
if (bsp == NULL) {
panic("bufq_alloc: no strategy");
}
if (strategy != BUFQ_STRAT_ANY && !found_exact) {
if ((flags & BUFQ_EXACT)) {
error = ENOENT;
mutex_exit(&bufq_mutex);
goto out;
}
#if defined(DEBUG)
printf("bufq_alloc: '%s' is not available. using '%s'.\n",
strategy, bsp->bs_name);
#endif
}
#if defined(BUFQ_DEBUG)
/* XXX aprint? */
printf("bufq_alloc: using '%s'\n", bsp->bs_name);
#endif
bsp->bs_refcnt++;
mutex_exit(&bufq_mutex);
*bufqp = bufq = kmem_zalloc(sizeof(*bufq), KM_SLEEP);
bufq->bq_flags = flags;
bufq->bq_strat = bsp;
(*bsp->bs_initfn)(bufq);
out:
return error;
}
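/*
 * Illustrative usage sketch (an assumption, not taken from this file): a
 * disk driver typically allocates a queue sorted by raw block number and
 * names a preferred strategy without BUFQ_EXACT, so the highest-priority
 * registered strategy is used as a fallback:
 *
 *	struct bufq_state *bq;
 *	int error;
 *
 *	error = bufq_alloc(&bq, "fcfs", BUFQ_SORT_RAWBLOCK);
 *	if (error == 0) {
 *		bufq_put(bq, bp);
 *		bp = bufq_get(bq);
 *	}
 */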
void
bufq_put(struct bufq_state *bufq, struct buf *bp)
{
(*bufq->bq_put)(bufq, bp);
}
struct buf *
bufq_get(struct bufq_state *bufq)
{
return (*bufq->bq_get)(bufq, 1);
}
struct buf *
bufq_peek(struct bufq_state *bufq)
{
return (*bufq->bq_get)(bufq, 0);
}
struct buf *
bufq_cancel(struct bufq_state *bufq, struct buf *bp)
{
return (*bufq->bq_cancel)(bufq, bp);
}
/*
* Drain a device buffer queue.
*/
void
bufq_drain(struct bufq_state *bufq)
{
struct buf *bp;
while ((bp = bufq_get(bufq)) != NULL) {
bp->b_error = EIO;
bp->b_resid = bp->b_bcount;
biodone(bp);
}
}
/*
* Destroy a device buffer queue.
*/
void
bufq_free(struct bufq_state *bufq)
{
KASSERT(bufq_peek(bufq) == NULL);
bufq->bq_fini(bufq);
mutex_enter(&bufq_mutex);
bufq->bq_strat->bs_refcnt--;
mutex_exit(&bufq_mutex);
kmem_free(bufq, sizeof(*bufq));
}
/*
* Get the strategy identifier of a buffer queue.
*/
const char *
bufq_getstrategyname(struct bufq_state *bufq)
{
return bufq->bq_strat->bs_name;
}
/*
* Move all requests from one buffer queue to another.
*/
void
bufq_move(struct bufq_state *dst, struct bufq_state *src)
{
struct buf *bp;
while ((bp = bufq_get(src)) != NULL) {
bufq_put(dst, bp);
}
}
static int
docopy(char *buf, size_t *bufoffp, size_t buflen,
const char *datap, size_t datalen)
{
int error = 0;
if (buf != NULL && datalen > 0) {
if (*bufoffp + datalen > buflen) {
goto out;
}
error = copyout(datap, buf + *bufoffp, datalen);
if (error) {
goto out;
}
}
out:
if (error == 0) {
*bufoffp += datalen;
}
return error;
}
static int
docopystr(char *buf, size_t *bufoffp, size_t buflen, const char *datap)
{
return docopy(buf, bufoffp, buflen, datap, strlen(datap));
}
static int
docopynul(char *buf, size_t *bufoffp, size_t buflen)
{
return docopy(buf, bufoffp, buflen, "", 1);
}
/*
* sysctl handler that reports the names of all bufq strategies
* currently available to the kernel.
*/
static int
sysctl_kern_bufq_strategies(SYSCTLFN_ARGS)
{
const struct bufq_strat *bq_strat;
const char *delim = "";
size_t off = 0;
size_t buflen = *oldlenp;
int error;
SLIST_FOREACH(bq_strat, &bufq_strat_list, bs_next) {
error = docopystr(oldp, &off, buflen, delim);
if (error) {
goto out;
}
error = docopystr(oldp, &off, buflen, (bq_strat)->bs_name);
if (error) {
goto out;
}
delim = " ";
}
/* In case there are no registered strategies ... */
if (off == 0) {
error = docopystr(oldp, &off, buflen, "NULL");
if (error) {
goto out;
}
}
/* NUL terminate */
error = docopynul(oldp, &off, buflen);
out:
*oldlenp = off;
return error;
}
static void
sysctl_kern_bufq_strategies_setup(struct sysctllog **clog)
{
const struct sysctlnode *node;
node = NULL;
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "bufq",
SYSCTL_DESCR("buffer queue subtree"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
if (node != NULL) {
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "strategies",
SYSCTL_DESCR("List of bufq strategies present"),
sysctl_kern_bufq_strategies, 0, NULL, 0,
CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
}
}
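/*
 * Illustrative userland sketch (not part of this file): reading the
 * kern.bufq.strategies node created above with sysctlbyname(3).
 *
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	char buf[128];
 *	size_t len = sizeof(buf);
 *
 *	if (sysctlbyname("kern.bufq.strategies", buf, &len, NULL, 0) == 0)
 *		printf("available strategies: %s\n", buf);
 */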
/* $NetBSD: kern_time.c,v 1.221 2023/02/23 02:57:17 riastradh Exp $ */
/*-
* Copyright (c) 2000, 2004, 2005, 2007, 2008, 2009, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christopher G. Demetriou, by Andrew Doran, and by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_time.c 8.4 (Berkeley) 5/26/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_time.c,v 1.221 2023/02/23 02:57:17 riastradh Exp $");
#include <sys/param.h>
#include <sys/resourcevar.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/signalvar.h>
#include <sys/syslog.h>
#include <sys/timetc.h>
#include <sys/timevar.h>
#include <sys/timex.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
kmutex_t itimer_mutex __cacheline_aligned; /* XXX static */
static struct itlist itimer_realtime_changed_notify;
static void itimer_callout(void *);
static void ptimer_intr(void *);
static void *ptimer_sih __read_mostly;
static TAILQ_HEAD(, ptimer) ptimer_queue;
#define CLOCK_VIRTUAL_P(clockid) \
((clockid) == CLOCK_VIRTUAL || (clockid) == CLOCK_PROF)
CTASSERT(ITIMER_REAL == CLOCK_REALTIME);
CTASSERT(ITIMER_VIRTUAL == CLOCK_VIRTUAL);
CTASSERT(ITIMER_PROF == CLOCK_PROF);
CTASSERT(ITIMER_MONOTONIC == CLOCK_MONOTONIC);
#define DELAYTIMER_MAX 32
/*
* Initialize timekeeping.
*/
void
time_init(void)
{
mutex_init(&itimer_mutex, MUTEX_DEFAULT, IPL_SCHED);
LIST_INIT(&itimer_realtime_changed_notify);
TAILQ_INIT(&ptimer_queue);
ptimer_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE,
ptimer_intr, NULL);
}
/*
* Check if the time will wrap if set to ts.
*
* ts - timespec describing the new time
* delta - the delta between the current time and ts
*/
bool
time_wraps(struct timespec *ts, struct timespec *delta)
{
/*
* Don't allow the time to be set forward so far it
* will wrap and become negative, thus allowing an
* attacker to bypass the next check below. The
* cutoff is 1 year before rollover occurs, so even
* if the attacker uses adjtime(2) to move the time
* past the cutoff, it will take a very long time
* to get to the wrap point.
*/
if ((ts->tv_sec > LLONG_MAX - 365*24*60*60) ||
(delta->tv_sec < 0 || delta->tv_nsec < 0))
return true;
return false;
}
/*
* itimer_lock:
*
* Acquire the interval timer data lock.
*/
void
itimer_lock(void)
{
mutex_spin_enter(&itimer_mutex);
}
/*
* itimer_unlock:
*
* Release the interval timer data lock.
*/
void
itimer_unlock(void)
{
mutex_spin_exit(&itimer_mutex);
}
/*
* itimer_lock_held:
*
* Check that the interval timer lock is held for diagnostic
* assertions.
*/
inline bool __diagused
itimer_lock_held(void)
{
return mutex_owned(&itimer_mutex);
}
/*
* Time of day and interval timer support.
*
* These routines provide the kernel entry points to get and set
* the time-of-day and per-process interval timers. Subroutines
* here provide support for adding and subtracting timeval structures
* and decrementing interval timers, optionally reloading the interval
* timers when they expire.
*/
/* This function is used by clock_settime and settimeofday */
static int
settime1(struct proc *p, const struct timespec *ts, bool check_kauth)
{
struct timespec delta, now;
/*
* The time being set to an unreasonable value will cause
* unreasonable system behaviour.
*/
if (ts->tv_sec < 0 || ts->tv_sec > (1LL << 36))
return EINVAL;
nanotime(&now);
timespecsub(ts, &now, &delta);
if (check_kauth && kauth_authorize_system(kauth_cred_get(),
KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_SYSTEM, __UNCONST(ts),
&delta, KAUTH_ARG(check_kauth ? false : true)) != 0) {
return EPERM;
}
#ifdef notyet
if ((delta.tv_sec < 86400) && securelevel > 0) { /* XXX elad - notyet */
return EPERM;
}
#endif
tc_setclock(ts);
resettodr();
/*
* Notify pending CLOCK_REALTIME timers about the real time change.
* There may be inactive timers on this list, but this happens
* comparatively less often than timers firing, and so it's better
* to put the extra checks here than to complicate the other code
* path.
*/
struct itimer *it;
itimer_lock();
LIST_FOREACH(it, &itimer_realtime_changed_notify, it_rtchgq) {
KASSERT(it->it_ops->ito_realtime_changed != NULL);
if (timespecisset(&it->it_time.it_value)) {
(*it->it_ops->ito_realtime_changed)(it);
}
}
itimer_unlock();
return 0;
}
int
settime(struct proc *p, struct timespec *ts)
{
return settime1(p, ts, true);
}
/* ARGSUSED */
int
sys___clock_gettime50(struct lwp *l,
const struct sys___clock_gettime50_args *uap, register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(struct timespec *) tp;
} */
int error;
struct timespec ats;
error = clock_gettime1(SCARG(uap, clock_id), &ats);
if (error != 0)
return error;
return copyout(&ats, SCARG(uap, tp), sizeof(ats));
}
/* ARGSUSED */
int
sys___clock_settime50(struct lwp *l,
const struct sys___clock_settime50_args *uap, register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(const struct timespec *) tp;
} */
int error;
struct timespec ats;
if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0)
return error;
return clock_settime1(l->l_proc, SCARG(uap, clock_id), &ats, true);
}
int
clock_settime1(struct proc *p, clockid_t clock_id, const struct timespec *tp,
bool check_kauth)
{
int error;
if (tp->tv_nsec < 0 || tp->tv_nsec >= 1000000000L)
return EINVAL;
switch (clock_id) {
case CLOCK_REALTIME:
if ((error = settime1(p, tp, check_kauth)) != 0)
return error;
break;
case CLOCK_MONOTONIC:
return EINVAL; /* read-only clock */
default:
return EINVAL;
}
return 0;
}
int
sys___clock_getres50(struct lwp *l, const struct sys___clock_getres50_args *uap,
register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(struct timespec *) tp;
} */
struct timespec ts;
int error;
if ((error = clock_getres1(SCARG(uap, clock_id), &ts)) != 0)
return error;
if (SCARG(uap, tp))
error = copyout(&ts, SCARG(uap, tp), sizeof(ts));
return error;
}
int
clock_getres1(clockid_t clock_id, struct timespec *ts)
{
switch (clock_id) {
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
ts->tv_sec = 0;
if (tc_getfrequency() > 1000000000)
ts->tv_nsec = 1;
else
ts->tv_nsec = 1000000000 / tc_getfrequency();
break;
default:
return EINVAL;
}
return 0;
}
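/*
 * Worked example (illustrative, not part of the original source): with a
 * hypothetical timecounter running at 100 MHz, tc_getfrequency() returns
 * 100000000, so the reported resolution is 1000000000 / 100000000 = 10 ns.
 * Frequencies above 1 GHz are clamped to a 1 ns resolution by the check
 * above.
 */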
/* ARGSUSED */
int
sys___nanosleep50(struct lwp *l, const struct sys___nanosleep50_args *uap,
register_t *retval)
{
/* {
syscallarg(struct timespec *) rqtp;
syscallarg(struct timespec *) rmtp;
} */
struct timespec rmt, rqt;
int error, error1;
error = copyin(SCARG(uap, rqtp), &rqt, sizeof(struct timespec));
if (error)
return error;
error = nanosleep1(l, CLOCK_MONOTONIC, 0, &rqt,
SCARG(uap, rmtp) ? &rmt : NULL);
if (SCARG(uap, rmtp) == NULL || (error != 0 && error != EINTR))
return error;
error1 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt));
return error1 ? error1 : error;
}
/* ARGSUSED */
int
sys_clock_nanosleep(struct lwp *l, const struct sys_clock_nanosleep_args *uap,
register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(int) flags;
syscallarg(struct timespec *) rqtp;
syscallarg(struct timespec *) rmtp;
} */
struct timespec rmt, rqt;
int error, error1;
error = copyin(SCARG(uap, rqtp), &rqt, sizeof(struct timespec));
if (error)
goto out;
error = nanosleep1(l, SCARG(uap, clock_id), SCARG(uap, flags), &rqt,
SCARG(uap, rmtp) ? &rmt : NULL);
if (SCARG(uap, rmtp) == NULL || (error != 0 && error != EINTR))
goto out;
if ((SCARG(uap, flags) & TIMER_ABSTIME) == 0 &&
(error1 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt))) != 0)
error = error1;
out:
*retval = error;
return 0;
}
int
nanosleep1(struct lwp *l, clockid_t clock_id, int flags, struct timespec *rqt,
struct timespec *rmt)
{
struct timespec rmtstart;
int error, timo;
if ((error = ts2timo(clock_id, flags, rqt, &timo, &rmtstart)) != 0) {
if (error == ETIMEDOUT) {
error = 0;
if (rmt != NULL)
rmt->tv_sec = rmt->tv_nsec = 0;
}
return error;
}
/*
* Avoid inadvertently sleeping forever
*/
if (timo == 0)
timo = 1;
again:
error = kpause("nanoslp", true, timo, NULL);
if (error == EWOULDBLOCK)
error = 0;
if (rmt != NULL || error == 0) {
struct timespec rmtend;
struct timespec t0;
struct timespec *t;
int err;
err = clock_gettime1(clock_id, &rmtend);
if (err != 0)
return err;
t = (rmt != NULL) ? rmt : &t0;
if (flags & TIMER_ABSTIME) {
timespecsub(rqt, &rmtend, t);
} else {
if (timespeccmp(&rmtend, &rmtstart, <))
timespecclear(t); /* clock wound back */
else
timespecsub(&rmtend, &rmtstart, t);
if (timespeccmp(rqt, t, <))
timespecclear(t);
else
timespecsub(rqt, t, t);
}
if (t->tv_sec < 0)
timespecclear(t);
if (error == 0) {
timo = tstohz(t);
if (timo > 0)
goto again;
}
}
if (error == ERESTART)
error = EINTR;
return error;
}
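/*
 * Illustrative userland sketch (not part of this file): sleeping until an
 * absolute CLOCK_MONOTONIC deadline, which exercises the TIMER_ABSTIME
 * branch of nanosleep1() above and avoids drift across repeated wakeups.
 *
 *	#include <time.h>
 *
 *	struct timespec deadline;
 *
 *	clock_gettime(CLOCK_MONOTONIC, &deadline);
 *	deadline.tv_sec += 1;			wake up 1 second from now
 *	while (clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
 *	    &deadline, NULL) == EINTR)
 *		continue;			retry if interrupted
 */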
int
sys_clock_getcpuclockid2(struct lwp *l,
const struct sys_clock_getcpuclockid2_args *uap,
register_t *retval)
{
/* {
syscallarg(idtype_t) idtype;
syscallarg(id_t) id;
syscallarg(clockid_t *) clock_id;
} */
pid_t pid;
lwpid_t lid;
clockid_t clock_id;
id_t id = SCARG(uap, id);
switch (SCARG(uap, idtype)) {
case P_PID:
pid = id == 0 ? l->l_proc->p_pid : id;
clock_id = CLOCK_PROCESS_CPUTIME_ID | pid;
break;
case P_LWPID:
lid = id == 0 ? l->l_lid : id;
clock_id = CLOCK_THREAD_CPUTIME_ID | lid;
break;
default:
return EINVAL;
}
return copyout(&clock_id, SCARG(uap, clock_id), sizeof(clock_id));
}
/* ARGSUSED */
int
sys___gettimeofday50(struct lwp *l, const struct sys___gettimeofday50_args *uap,
register_t *retval)
{
/* {
syscallarg(struct timeval *) tp;
syscallarg(void *) tzp; really "struct timezone *";
} */
struct timeval atv;
int error = 0;
struct timezone tzfake;
if (SCARG(uap, tp)) {
memset(&atv, 0, sizeof(atv));
microtime(&atv);
error = copyout(&atv, SCARG(uap, tp), sizeof(atv));
if (error)
return error;
}
if (SCARG(uap, tzp)) {
/*
* NetBSD has no kernel notion of time zone, so we just
* fake up a timezone struct and return it if demanded.
*/
tzfake.tz_minuteswest = 0;
tzfake.tz_dsttime = 0;
error = copyout(&tzfake, SCARG(uap, tzp), sizeof(tzfake));
}
return error;
}
/* ARGSUSED */
int
sys___settimeofday50(struct lwp *l, const struct sys___settimeofday50_args *uap,
register_t *retval)
{
/* {
syscallarg(const struct timeval *) tv;
syscallarg(const void *) tzp; really "const struct timezone *";
} */
return settimeofday1(SCARG(uap, tv), true, SCARG(uap, tzp), l, true);
}
int
settimeofday1(const struct timeval *utv, bool userspace,
const void *utzp, struct lwp *l, bool check_kauth)
{
struct timeval atv;
struct timespec ts;
int error;
/* Verify all parameters before changing time. */
/*
* NetBSD has no kernel notion of time zone, and only an
* obsolete program would try to set it, so we log a warning.
*/
if (utzp)
log(LOG_WARNING, "pid %d attempted to set the "
"(obsolete) kernel time zone\n", l->l_proc->p_pid); if (utv == NULL)
return 0;
if (userspace) {
if ((error = copyin(utv, &atv, sizeof(atv))) != 0)
return error;
utv = &atv;
}
if (utv->tv_usec < 0 || utv->tv_usec >= 1000000)
return EINVAL;
TIMEVAL_TO_TIMESPEC(utv, &ts);
return settime1(l->l_proc, &ts, check_kauth);
}
int time_adjusted; /* set if an adjustment is made */
/* ARGSUSED */
int
sys___adjtime50(struct lwp *l, const struct sys___adjtime50_args *uap,
register_t *retval)
{
/* {
syscallarg(const struct timeval *) delta;
syscallarg(struct timeval *) olddelta;
} */
int error;
struct timeval atv, oldatv;
if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_TIME,
KAUTH_REQ_SYSTEM_TIME_ADJTIME, NULL, NULL, NULL)) != 0)
return error;
if (SCARG(uap, delta)) {
error = copyin(SCARG(uap, delta), &atv,
sizeof(*SCARG(uap, delta)));
if (error)
return error;
}
adjtime1(SCARG(uap, delta) ? &atv : NULL,
SCARG(uap, olddelta) ? &oldatv : NULL, l->l_proc);
if (SCARG(uap, olddelta))
error = copyout(&oldatv, SCARG(uap, olddelta),
sizeof(*SCARG(uap, olddelta)));
return error;
}
void
adjtime1(const struct timeval *delta, struct timeval *olddelta, struct proc *p)
{
if (olddelta) {
memset(olddelta, 0, sizeof(*olddelta));
mutex_spin_enter(&timecounter_lock);
olddelta->tv_sec = time_adjtime / 1000000;
olddelta->tv_usec = time_adjtime % 1000000;
if (olddelta->tv_usec < 0) {
olddelta->tv_usec += 1000000;
olddelta->tv_sec--;
}
mutex_spin_exit(&timecounter_lock);
}
if (delta) {
mutex_spin_enter(&timecounter_lock);
/*
* XXX This should maybe just report failure to
* userland for nonsense deltas.
*/
if (delta->tv_sec > INT64_MAX/1000000 - 1) {
time_adjtime = INT64_MAX;
} else if (delta->tv_sec < INT64_MIN/1000000 + 1) {
time_adjtime = INT64_MIN;
} else {
time_adjtime = delta->tv_sec * 1000000
+ MAX(-999999, MIN(999999, delta->tv_usec));
}
if (time_adjtime) {
/* We need to save the system time during shutdown */
time_adjusted |= 1;
}
mutex_spin_exit(&timecounter_lock);
}
}
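/*
 * Worked example (illustrative, not part of the original source): a delta
 * of { .tv_sec = 2, .tv_usec = 500000 } is stored as time_adjtime =
 * 2 * 1000000 + 500000 = 2500000 microseconds; reading it back above
 * yields olddelta = { 2, 500000 } again (2500000 / 1000000 = 2 seconds,
 * 2500000 % 1000000 = 500000 microseconds).
 */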
/*
* Interval timer support.
*
* The itimer_*() routines provide generic support for interval timers,
* both real (CLOCK_REALTIME, CLOCK_MONOTONIC), and virtual (CLOCK_VIRTUAL,
* CLOCK_PROF).
*
* Real timers keep their deadline as an absolute time, and are fired
* by a callout. Virtual timers are kept as a linked-list of deltas,
* and are processed by hardclock().
*
* Because the real time timer callout may be delayed in real time due
* to interrupt processing on the system, it is possible for the real
* time timeout routine (itimer_callout()) to run well past its deadline.
* It does not suffice, therefore, to reload the real timer .it_value
* from the timer's .it_interval. Rather, we compute the next deadline
* in absolute time based on the current time and the .it_interval value,
* and report any overruns.
*
* Note that while the virtual timers are supported in a generic fashion
* here, they only (currently) make sense as per-process timers, and thus
* only really work for that case.
*/
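/*
 * Worked example of the virtual timer delta list (illustrative, not part
 * of the original source): three virtual timers due in 3, 5 and 9 ticks
 * are kept on the list as the deltas 3, 2 and 4.  Each hardclock() tick
 * decrements only the head entry; when it reaches zero that timer fires
 * and the next entry's delta becomes the new head count.
 */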
/*
* itimer_init:
*
* Initialize the common data for an interval timer.
*/
void
itimer_init(struct itimer * const it, const struct itimer_ops * const ops,
clockid_t const id, struct itlist * const itl)
{
KASSERT(itimer_lock_held());
KASSERT(ops != NULL);
timespecclear(&it->it_time.it_value);
it->it_ops = ops;
it->it_clockid = id;
it->it_overruns = 0;
it->it_dying = false;
if (!CLOCK_VIRTUAL_P(id)) {
KASSERT(itl == NULL);
callout_init(&it->it_ch, CALLOUT_MPSAFE);
callout_setfunc(&it->it_ch, itimer_callout, it);
if (id == CLOCK_REALTIME && ops->ito_realtime_changed != NULL) {
LIST_INSERT_HEAD(&itimer_realtime_changed_notify,
it, it_rtchgq);
}
} else {
KASSERT(itl != NULL);
it->it_vlist = itl;
it->it_active = false;
}
}
/*
* itimer_poison:
*
* Poison an interval timer, preventing it from being scheduled
* or processed, in preparation for freeing the timer.
*/
void
itimer_poison(struct itimer * const it)
{
KASSERT(itimer_lock_held());
it->it_dying = true;
/*
* For non-virtual timers, stop the callout, or wait for it to
* run if it has already fired. It cannot restart again after
* this point: the callout won't restart itself when dying, no
* other users holding the lock can restart it, and any other
* users waiting for callout_halt concurrently (itimer_settime)
* will restart from the top.
*/
if (!CLOCK_VIRTUAL_P(it->it_clockid)) {
callout_halt(&it->it_ch, &itimer_mutex);
if (it->it_clockid == CLOCK_REALTIME &&
it->it_ops->ito_realtime_changed != NULL) {
LIST_REMOVE(it, it_rtchgq);
}
}
}
/*
* itimer_fini:
*
* Release resources used by an interval timer.
*
* N.B. itimer_lock must be held on entry, and is released on exit.
*/
void
itimer_fini(struct itimer * const it)
{
KASSERT(itimer_lock_held());
/* All done with the global state. */
itimer_unlock();
/* Destroy the callout, if needed. */
if (!CLOCK_VIRTUAL_P(it->it_clockid))
callout_destroy(&it->it_ch);
}
/*
* itimer_decr:
*
* Decrement an interval timer by a specified number of nanoseconds,
* which must be less than a second, i.e. < 1000000000. If the timer
* expires, then reload it. In this case, carry over (nsec - old value)
* to reduce the value reloaded into the timer so that the timer does
* not drift. This routine assumes that it is called in a context where
* the timers on which it is operating cannot change in value.
*
* Returns true if the timer has expired.
*/
static bool
itimer_decr(struct itimer *it, int nsec)
{
struct itimerspec *itp;
int error __diagused;
KASSERT(itimer_lock_held());
KASSERT(CLOCK_VIRTUAL_P(it->it_clockid));
itp = &it->it_time;
if (itp->it_value.tv_nsec < nsec) {
if (itp->it_value.tv_sec == 0) {
/* expired, and already in next interval */
nsec -= itp->it_value.tv_nsec;
goto expire;
}
itp->it_value.tv_nsec += 1000000000;
itp->it_value.tv_sec--;
}
itp->it_value.tv_nsec -= nsec;
nsec = 0;
if (timespecisset(&itp->it_value))
return false;
/* expired, exactly at end of interval */
expire:
if (timespecisset(&itp->it_interval)) {
itp->it_value = itp->it_interval;
itp->it_value.tv_nsec -= nsec;
if (itp->it_value.tv_nsec < 0) {
itp->it_value.tv_nsec += 1000000000;
itp->it_value.tv_sec--;
}
error = itimer_settime(it);
KASSERT(error == 0); /* virtual, never fails */
} else
itp->it_value.tv_nsec = 0; /* sec is already 0 */
return true;
}
/*
* itimer_arm_real:
*
* Arm a non-virtual timer.
*/
static void
itimer_arm_real(struct itimer * const it)
{
KASSERT(!it->it_dying);
KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid));
KASSERT(!callout_pending(&it->it_ch));
/*
* No need to check the tshzto() return value here;
* callout_schedule() does it for us.
*/
callout_schedule(&it->it_ch,
(it->it_clockid == CLOCK_MONOTONIC
? tshztoup(&it->it_time.it_value)
: tshzto(&it->it_time.it_value)));
}
/*
* itimer_callout:
*
* Callout to expire a non-virtual timer. Queue it up for processing,
* and then reload, if it is configured to do so.
*
* N.B. A delay in processing this callout causes multiple
* SIGALRM calls to be compressed into one.
*/
static void
itimer_callout(void *arg)
{
uint64_t last_val, next_val, interval, now_ns;
struct timespec now, next;
struct itimer * const it = arg;
int backwards;
itimer_lock();
(*it->it_ops->ito_fire)(it);
if (!timespecisset(&it->it_time.it_interval)) {
timespecclear(&it->it_time.it_value);
itimer_unlock();
return;
}
if (it->it_clockid == CLOCK_MONOTONIC) {
getnanouptime(&now);
} else {
getnanotime(&now);
}
backwards = (timespeccmp(&it->it_time.it_value, &now, >));
/* Nonnegative interval guaranteed by itimerfix. */
KASSERT(it->it_time.it_interval.tv_sec >= 0);
KASSERT(it->it_time.it_interval.tv_nsec >= 0);
/* Handle the easy case of non-overflown timers first. */
if (!backwards &&
timespecaddok(&it->it_time.it_value, &it->it_time.it_interval)) {
timespecadd(&it->it_time.it_value, &it->it_time.it_interval,
&next);
it->it_time.it_value = next;
} else {
now_ns = timespec2ns(&now);
last_val = timespec2ns(&it->it_time.it_value);
interval = timespec2ns(&it->it_time.it_interval);
next_val = now_ns +
(now_ns - last_val + interval - 1) % interval;
if (backwards)
next_val += interval;
else
it->it_overruns += (now_ns - last_val) / interval;
it->it_time.it_value.tv_sec = next_val / 1000000000;
it->it_time.it_value.tv_nsec = next_val % 1000000000;
}
/*
* Reset the callout, if it's not going away.
*/
if (!it->it_dying)
itimer_arm_real(it);
itimer_unlock();
}
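/*
 * Numerical illustration of the recomputation branch above (not part of
 * the original source): if that branch runs with backwards == 0, a 100 ms
 * interval, it_value = 1.0 s and now = 1.25 s, then it_overruns is
 * increased by (1.25 - 1.0) / 0.1 = 2 and the new deadline becomes the
 * next interval boundary at or after "now", roughly t = 1.3 s.
 */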
/*
* itimer_settime:
*
* Set up the given interval timer. The value in it->it_time.it_value
* is taken to be an absolute time for CLOCK_REALTIME/CLOCK_MONOTONIC
* timers and a relative time for CLOCK_VIRTUAL/CLOCK_PROF timers.
*
* If the callout had already fired but not yet run, fails with
* ERESTART -- caller must restart from the top to look up a timer.
*/
int
itimer_settime(struct itimer *it)
{
struct itimer *itn, *pitn;
struct itlist *itl;
KASSERT(itimer_lock_held());
KASSERT(!it->it_dying);
if (!CLOCK_VIRTUAL_P(it->it_clockid)) {
/*
* Try to stop the callout. However, if it had already
* fired, we have to drop the lock to wait for it, so
* the world may have changed and the timer may not be there
* any more. In that case, tell the caller to start
* over from the top.
*/
if (callout_halt(&it->it_ch, &itimer_mutex))
return ERESTART;
KASSERT(!it->it_dying);
/* Now we can touch it and start it up again. */
if (timespecisset(&it->it_time.it_value))
itimer_arm_real(it);
} else {
if (it->it_active) {
itn = LIST_NEXT(it, it_list);
LIST_REMOVE(it, it_list);
for ( ; itn; itn = LIST_NEXT(itn, it_list))
timespecadd(&it->it_time.it_value,
&itn->it_time.it_value,
&itn->it_time.it_value);
}
if (timespecisset(&it->it_time.it_value)) {
itl = it->it_vlist;
for (itn = LIST_FIRST(itl), pitn = NULL;
itn && timespeccmp(&it->it_time.it_value,
&itn->it_time.it_value, >);
pitn = itn, itn = LIST_NEXT(itn, it_list))
timespecsub(&it->it_time.it_value,
&itn->it_time.it_value,
&it->it_time.it_value);
if (pitn)
LIST_INSERT_AFTER(pitn, it, it_list);
else
LIST_INSERT_HEAD(itl, it, it_list);
for ( ; itn ; itn = LIST_NEXT(itn, it_list))
timespecsub(&itn->it_time.it_value,
&it->it_time.it_value,
&itn->it_time.it_value);
it->it_active = true;
} else {
it->it_active = false;
}
}
/* Success! */
return 0;
}
/*
* itimer_gettime:
*
* Return the remaining time of an interval timer.
*/
void
itimer_gettime(const struct itimer *it, struct itimerspec *aits)
{
struct timespec now;
struct itimer *itn;
KASSERT(itimer_lock_held());
KASSERT(!it->it_dying);
*aits = it->it_time;
if (!CLOCK_VIRTUAL_P(it->it_clockid)) {
/*
* Convert the .it_value of a real time timer from absolute
* to relative time.  If the timer's expiry time has already
* passed, return 0; otherwise return the difference between
* the current time and the time the timer is set to go off.
*/
if (timespecisset(&aits->it_value)) {
if (it->it_clockid == CLOCK_REALTIME) {
getnanotime(&now);
} else { /* CLOCK_MONOTONIC */
getnanouptime(&now);
}
if (timespeccmp(&aits->it_value, &now, <))
timespecclear(&aits->it_value);
else
timespecsub(&aits->it_value, &now,
&aits->it_value);
}
} else if (it->it_active) {
for (itn = LIST_FIRST(it->it_vlist); itn && itn != it;
itn = LIST_NEXT(itn, it_list))
timespecadd(&aits->it_value,
&itn->it_time.it_value, &aits->it_value);
KASSERT(itn != NULL); /* it should be findable on the list */
} else
timespecclear(&aits->it_value);
}
/*
* Per-process timer support.
*
* Both the BSD getitimer() family and the POSIX timer_*() family of
* routines are supported.
*
* All timers are kept in an array pointed to by p_timers, which is
* allocated on demand - many processes don't use timers at all. The
* first four elements in this array are reserved for the BSD timers:
* element 0 is ITIMER_REAL, element 1 is ITIMER_VIRTUAL, element
* 2 is ITIMER_PROF, and element 3 is ITIMER_MONOTONIC. The rest may be
* allocated by the timer_create() syscall.
*
* These timers are a "sub-class" of interval timer.
*/
/*
* ptimer_free:
*
* Free the per-process timer at the specified index.
*/
static void
ptimer_free(struct ptimers *pts, int index)
{
struct itimer *it;
struct ptimer *pt;
KASSERT(itimer_lock_held());
it = pts->pts_timers[index];
pt = container_of(it, struct ptimer, pt_itimer);
pts->pts_timers[index] = NULL;
itimer_poison(it);
/*
* Remove it from the queue to be signalled. Must be done
* after itimer is poisoned, because we may have had to wait
* for the callout to complete.
*/
if (pt->pt_queued) {
TAILQ_REMOVE(&ptimer_queue, pt, pt_chain);
pt->pt_queued = false;
}
itimer_fini(it); /* releases itimer_lock */
kmem_free(pt, sizeof(*pt));
}
/*
* ptimers_alloc:
*
* Allocate a ptimers for the specified process.
*/
static struct ptimers *
ptimers_alloc(struct proc *p)
{
struct ptimers *pts;
int i;
pts = kmem_alloc(sizeof(*pts), KM_SLEEP);
LIST_INIT(&pts->pts_virtual);
LIST_INIT(&pts->pts_prof);
for (i = 0; i < TIMER_MAX; i++)
pts->pts_timers[i] = NULL;
itimer_lock();
if (p->p_timers == NULL) {
p->p_timers = pts;
itimer_unlock();
return pts;
}
itimer_unlock();
kmem_free(pts, sizeof(*pts));
return p->p_timers;
}
/*
* ptimers_free:
*
* Clean up the per-process timers. If "which" is set to TIMERS_ALL,
* then clean up all timers and free all the data structures. If
* "which" is set to TIMERS_POSIX, only clean up the timers allocated
* by timer_create(), not the BSD setitimer() timers, and only free the
* structure if none of those remain.
*
* This function is exported because it is needed in the exec and
* exit code paths.
*/
void
ptimers_free(struct proc *p, int which)
{
struct ptimers *pts;
struct itimer *itn;
struct timespec ts;
int i;
if (p->p_timers == NULL)
return;
pts = p->p_timers;
itimer_lock();
if (which == TIMERS_ALL) {
p->p_timers = NULL;
i = 0;
} else {
timespecclear(&ts);
for (itn = LIST_FIRST(&pts->pts_virtual);
itn && itn != pts->pts_timers[ITIMER_VIRTUAL];
itn = LIST_NEXT(itn, it_list)) {
KASSERT(itn->it_clockid == CLOCK_VIRTUAL);
timespecadd(&ts, &itn->it_time.it_value, &ts);
}
LIST_FIRST(&pts->pts_virtual) = NULL;
if (itn) {
KASSERT(itn->it_clockid == CLOCK_VIRTUAL);
timespecadd(&ts, &itn->it_time.it_value,
&itn->it_time.it_value);
LIST_INSERT_HEAD(&pts->pts_virtual, itn, it_list);
}
timespecclear(&ts);
for (itn = LIST_FIRST(&pts->pts_prof);
itn && itn != pts->pts_timers[ITIMER_PROF];
itn = LIST_NEXT(itn, it_list)) {
KASSERT(itn->it_clockid == CLOCK_PROF);
timespecadd(&ts, &itn->it_time.it_value, &ts);
}
LIST_FIRST(&pts->pts_prof) = NULL;
if (itn) {
KASSERT(itn->it_clockid == CLOCK_PROF);
timespecadd(&ts, &itn->it_time.it_value,
&itn->it_time.it_value);
LIST_INSERT_HEAD(&pts->pts_prof, itn, it_list);
}
i = TIMER_MIN;
}
for ( ; i < TIMER_MAX; i++) {
if (pts->pts_timers[i] != NULL) {
/* Free the timer and release the lock. */
ptimer_free(pts, i);
/* Reacquire the lock for the next one. */
itimer_lock();
}
}
if (pts->pts_timers[0] == NULL && pts->pts_timers[1] == NULL &&
pts->pts_timers[2] == NULL && pts->pts_timers[3] == NULL) {
p->p_timers = NULL;
itimer_unlock();
kmem_free(pts, sizeof(*pts));
} else
itimer_unlock();
}
/*
* ptimer_fire:
*
* Fire a per-process timer.
*/
static void
ptimer_fire(struct itimer *it)
{
struct ptimer *pt = container_of(it, struct ptimer, pt_itimer);
KASSERT(itimer_lock_held());
/*
* XXX Can overrun, but we don't do signal queueing yet, anyway.
* XXX Relying on the clock interrupt is stupid.
*/
if (pt->pt_ev.sigev_notify != SIGEV_SIGNAL) {
return;
}
if (!pt->pt_queued) {
TAILQ_INSERT_TAIL(&ptimer_queue, pt, pt_chain);
pt->pt_queued = true;
softint_schedule(ptimer_sih);
}
}
/*
* Operations vector for per-process timers (BSD and POSIX).
*/
static const struct itimer_ops ptimer_itimer_ops = {
.ito_fire = ptimer_fire,
};
/*
* sys_timer_create:
*
* System call to create a POSIX timer.
*/
int
sys_timer_create(struct lwp *l, const struct sys_timer_create_args *uap,
register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(struct sigevent *) evp;
syscallarg(timer_t *) timerid;
} */
return timer_create1(SCARG(uap, timerid), SCARG(uap, clock_id),
SCARG(uap, evp), copyin, l);
}
int
timer_create1(timer_t *tid, clockid_t id, struct sigevent *evp,
copyin_t fetch_event, struct lwp *l)
{
int error;
timer_t timerid;
struct itlist *itl;
struct ptimers *pts;
struct ptimer *pt;
struct proc *p;
p = l->l_proc;
if ((u_int)id > CLOCK_MONOTONIC)
return EINVAL;
if ((pts = p->p_timers) == NULL)
pts = ptimers_alloc(p);
pt = kmem_zalloc(sizeof(*pt), KM_SLEEP);
if (evp != NULL) {
if (((error =
(*fetch_event)(evp, &pt->pt_ev, sizeof(pt->pt_ev))) != 0) ||
((pt->pt_ev.sigev_notify < SIGEV_NONE) ||
(pt->pt_ev.sigev_notify > SIGEV_SA)) ||
(pt->pt_ev.sigev_notify == SIGEV_SIGNAL &&
(pt->pt_ev.sigev_signo <= 0 ||
pt->pt_ev.sigev_signo >= NSIG))) {
kmem_free(pt, sizeof(*pt));
return (error ? error : EINVAL);
}
}
/* Find a free timer slot, skipping those reserved for setitimer(). */
itimer_lock();
for (timerid = TIMER_MIN; timerid < TIMER_MAX; timerid++)
if (pts->pts_timers[timerid] == NULL)
break;
if (timerid == TIMER_MAX) {
itimer_unlock();
kmem_free(pt, sizeof(*pt));
return EAGAIN;
}
if (evp == NULL) {
pt->pt_ev.sigev_notify = SIGEV_SIGNAL;
switch (id) {
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
pt->pt_ev.sigev_signo = SIGALRM;
break;
case CLOCK_VIRTUAL:
pt->pt_ev.sigev_signo = SIGVTALRM;
break;
case CLOCK_PROF:
pt->pt_ev.sigev_signo = SIGPROF;
break;
}
pt->pt_ev.sigev_value.sival_int = timerid;
}
switch (id) {
case CLOCK_VIRTUAL:
itl = &pts->pts_virtual;
break;
case CLOCK_PROF:
itl = &pts->pts_prof;
break;
default:
itl = NULL;
}
itimer_init(&pt->pt_itimer, &ptimer_itimer_ops, id, itl);
pt->pt_proc = p;
pt->pt_poverruns = 0;
pt->pt_entry = timerid;
pt->pt_queued = false;
pts->pts_timers[timerid] = &pt->pt_itimer;
itimer_unlock();
return copyout(&timerid, tid, sizeof(timerid));
}
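/*
 * Illustrative userland sketch (not part of this file): creating and arming
 * a POSIX timer through the paths above so that SIGALRM is delivered every
 * 250 ms.  Error handling is omitted for brevity.
 *
 *	#include <signal.h>
 *	#include <string.h>
 *	#include <time.h>
 *
 *	timer_t tid;
 *	struct sigevent ev;
 *	struct itimerspec its;
 *
 *	memset(&ev, 0, sizeof(ev));
 *	ev.sigev_notify = SIGEV_SIGNAL;
 *	ev.sigev_signo = SIGALRM;
 *	timer_create(CLOCK_MONOTONIC, &ev, &tid);
 *
 *	memset(&its, 0, sizeof(its));
 *	its.it_value.tv_nsec = 250000000;
 *	its.it_interval.tv_nsec = 250000000;
 *	timer_settime(tid, 0, &its, NULL);
 */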
/*
* sys_timer_delete:
*
* System call to delete a POSIX timer.
*/
int
sys_timer_delete(struct lwp *l, const struct sys_timer_delete_args *uap,
register_t *retval)
{
/* {
syscallarg(timer_t) timerid;
} */
struct proc *p = l->l_proc;
timer_t timerid;
struct ptimers *pts;
struct itimer *it, *itn;
timerid = SCARG(uap, timerid);
pts = p->p_timers;
if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
return EINVAL;
itimer_lock();
if ((it = pts->pts_timers[timerid]) == NULL) {
itimer_unlock();
return EINVAL;
}
if (CLOCK_VIRTUAL_P(it->it_clockid)) {
if (it->it_active) {
itn = LIST_NEXT(it, it_list);
LIST_REMOVE(it, it_list);
for ( ; itn; itn = LIST_NEXT(itn, it_list))
timespecadd(&it->it_time.it_value,
&itn->it_time.it_value,
&itn->it_time.it_value);
it->it_active = false;
}
}
/* Free the timer and release the lock. */
ptimer_free(pts, timerid);
return 0;
}
/*
* sys___timer_settime50:
*
* System call to set/arm a POSIX timer.
*/
int
sys___timer_settime50(struct lwp *l,
const struct sys___timer_settime50_args *uap,
register_t *retval)
{
/* {
syscallarg(timer_t) timerid;
syscallarg(int) flags;
syscallarg(const struct itimerspec *) value;
syscallarg(struct itimerspec *) ovalue;
} */
int error;
struct itimerspec value, ovalue, *ovp = NULL;
if ((error = copyin(SCARG(uap, value), &value,
sizeof(struct itimerspec))) != 0)
return error;
if (SCARG(uap, ovalue))
ovp = &ovalue;
if ((error = dotimer_settime(SCARG(uap, timerid), &value, ovp,
SCARG(uap, flags), l->l_proc)) != 0)
return error;
if (ovp)
return copyout(&ovalue, SCARG(uap, ovalue),
sizeof(struct itimerspec));
return 0;
}
int
dotimer_settime(int timerid, struct itimerspec *value,
struct itimerspec *ovalue, int flags, struct proc *p)
{
struct timespec now;
struct itimerspec val, oval;
struct ptimers *pts;
struct itimer *it;
int error;
pts = p->p_timers;
if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
return EINVAL;
val = *value;
if ((error = itimespecfix(&val.it_value)) != 0 ||
(error = itimespecfix(&val.it_interval)) != 0)
return error;
itimer_lock();
restart:
if ((it = pts->pts_timers[timerid]) == NULL) {
itimer_unlock();
return EINVAL;
}
oval = it->it_time;
it->it_time = val;
/*
* If we've been passed a relative time for a realtime timer,
* convert it to absolute; if an absolute time for a virtual
* timer, convert it to relative and make sure we don't set it
* to zero, which would cancel the timer, or let it go
* negative, which would confuse the comparison tests.
*/
if (timespecisset(&it->it_time.it_value)) {
if (!CLOCK_VIRTUAL_P(it->it_clockid)) {
if ((flags & TIMER_ABSTIME) == 0) {
if (it->it_clockid == CLOCK_REALTIME) {
getnanotime(&now);
} else { /* CLOCK_MONOTONIC */
getnanouptime(&now);
}
timespecadd(&it->it_time.it_value, &now,
&it->it_time.it_value);
}
} else {
if ((flags & TIMER_ABSTIME) != 0) {
getnanotime(&now);
timespecsub(&it->it_time.it_value, &now,
&it->it_time.it_value);
if (!timespecisset(&it->it_time.it_value) ||
it->it_time.it_value.tv_sec < 0) {
it->it_time.it_value.tv_sec = 0;
it->it_time.it_value.tv_nsec = 1;
}
}
}
}
error = itimer_settime(it);
if (error == ERESTART) {
KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid));
goto restart;
}
KASSERT(error == 0);
itimer_unlock();
if (ovalue)
*ovalue = oval;
return 0;
}
/*
* sys___timer_gettime50:
*
* System call to return the time remaining until a POSIX timer fires.
*/
int
sys___timer_gettime50(struct lwp *l,
const struct sys___timer_gettime50_args *uap, register_t *retval)
{
/* {
syscallarg(timer_t) timerid;
syscallarg(struct itimerspec *) value;
} */
struct itimerspec its;
int error;
if ((error = dotimer_gettime(SCARG(uap, timerid), l->l_proc,
&its)) != 0)
return error;
return copyout(&its, SCARG(uap, value), sizeof(its));
}
int
dotimer_gettime(int timerid, struct proc *p, struct itimerspec *its)
{
struct itimer *it;
struct ptimers *pts;
pts = p->p_timers;
if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
return EINVAL;
itimer_lock();
if ((it = pts->pts_timers[timerid]) == NULL) {
itimer_unlock();
return EINVAL;
}
itimer_gettime(it, its);
itimer_unlock();
return 0;
}
/*
* sys_timer_getoverrun:
*
* System call to return the number of times a POSIX timer has
* expired while a notification was already pending. The counter
* is reset when a timer expires and a notification can be posted.
*/
int
sys_timer_getoverrun(struct lwp *l, const struct sys_timer_getoverrun_args *uap,
register_t *retval)
{
/* {
syscallarg(timer_t) timerid;
} */
struct proc *p = l->l_proc;
struct ptimers *pts;
int timerid;
struct itimer *it;
struct ptimer *pt;
timerid = SCARG(uap, timerid);
pts = p->p_timers;
if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
return EINVAL;
itimer_lock();
if ((it = pts->pts_timers[timerid]) == NULL) {
itimer_unlock();
return EINVAL;
}
pt = container_of(it, struct ptimer, pt_itimer);
*retval = pt->pt_poverruns;
if (*retval >= DELAYTIMER_MAX)
*retval = DELAYTIMER_MAX;
itimer_unlock();
return 0;
}
/*
* sys___getitimer50:
*
* System call to get the time remaining before a BSD timer fires.
*/
int
sys___getitimer50(struct lwp *l, const struct sys___getitimer50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */
struct proc *p = l->l_proc;
struct itimerval aitv;
int error;
memset(&aitv, 0, sizeof(aitv));
error = dogetitimer(p, SCARG(uap, which), &aitv);
if (error)
return error;
return copyout(&aitv, SCARG(uap, itv), sizeof(struct itimerval));
}
int
dogetitimer(struct proc *p, int which, struct itimerval *itvp)
{
struct ptimers *pts;
struct itimer *it;
struct itimerspec its;
if ((u_int)which > ITIMER_MONOTONIC)
return EINVAL;
itimer_lock();
pts = p->p_timers;
if (pts == NULL || (it = pts->pts_timers[which]) == NULL) {
timerclear(&itvp->it_value);
timerclear(&itvp->it_interval);
} else {
itimer_gettime(it, &its);
TIMESPEC_TO_TIMEVAL(&itvp->it_value, &its.it_value);
TIMESPEC_TO_TIMEVAL(&itvp->it_interval, &its.it_interval);
}
itimer_unlock();
return 0;
}
/*
* sys___setitimer50:
*
* System call to set/arm a BSD timer.
*/
int
sys___setitimer50(struct lwp *l, const struct sys___setitimer50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) which;
syscallarg(const struct itimerval *) itv;
syscallarg(struct itimerval *) oitv;
} */
struct proc *p = l->l_proc;
int which = SCARG(uap, which);
struct sys___getitimer50_args getargs;
const struct itimerval *itvp;
struct itimerval aitv;
int error;
itvp = SCARG(uap, itv);
if (itvp &&
(error = copyin(itvp, &aitv, sizeof(struct itimerval))) != 0)
return error;
if (SCARG(uap, oitv) != NULL) {
SCARG(&getargs, which) = which;
SCARG(&getargs, itv) = SCARG(uap, oitv);
if ((error = sys___getitimer50(l, &getargs, retval)) != 0)
return error;
}
if (itvp == 0)
return 0;
return dosetitimer(p, which, &aitv);
}
int
dosetitimer(struct proc *p, int which, struct itimerval *itvp)
{
struct timespec now;
struct ptimers *pts;
struct ptimer *spare;
struct itimer *it;
struct itlist *itl;
int error;
if ((u_int)which > ITIMER_MONOTONIC)
return EINVAL;
if (itimerfix(&itvp->it_value) || itimerfix(&itvp->it_interval))
return EINVAL;
/*
* Don't bother allocating data structures if the process just
* wants to clear the timer.
*/
spare = NULL;
pts = p->p_timers;
retry:
if (!timerisset(&itvp->it_value) && (pts == NULL ||
pts->pts_timers[which] == NULL))
return 0;
if (pts == NULL)
pts = ptimers_alloc(p);
itimer_lock();
restart:
it = pts->pts_timers[which];
if (it == NULL) {
struct ptimer *pt;
if (spare == NULL) {
itimer_unlock();
spare = kmem_zalloc(sizeof(*spare), KM_SLEEP);
goto retry;
}
pt = spare;
spare = NULL;
it = &pt->pt_itimer;
pt->pt_ev.sigev_notify = SIGEV_SIGNAL;
pt->pt_ev.sigev_value.sival_int = which;
switch (which) {
case ITIMER_REAL:
case ITIMER_MONOTONIC:
itl = NULL;
pt->pt_ev.sigev_signo = SIGALRM;
break;
case ITIMER_VIRTUAL:
itl = &pts->pts_virtual;
pt->pt_ev.sigev_signo = SIGVTALRM;
break;
case ITIMER_PROF:
itl = &pts->pts_prof;
pt->pt_ev.sigev_signo = SIGPROF;
break;
default:
panic("%s: can't happen %d", __func__, which);
}
itimer_init(it, &ptimer_itimer_ops, which, itl);
pt->pt_proc = p;
pt->pt_entry = which;
pts->pts_timers[which] = it;
}
TIMEVAL_TO_TIMESPEC(&itvp->it_value, &it->it_time.it_value);
TIMEVAL_TO_TIMESPEC(&itvp->it_interval, &it->it_time.it_interval);
error = 0;
if (timespecisset(&it->it_time.it_value)) {
/* Convert to absolute time */
/* XXX need to wrap in splclock for timecounters case? */
switch (which) {
case ITIMER_REAL:
getnanotime(&now);
if (!timespecaddok(&it->it_time.it_value, &now)) {
error = EINVAL;
goto out;
}
timespecadd(&it->it_time.it_value, &now,
&it->it_time.it_value);
break;
case ITIMER_MONOTONIC:
getnanouptime(&now);
if (!timespecaddok(&it->it_time.it_value, &now)) {
error = EINVAL;
goto out;
}
timespecadd(&it->it_time.it_value, &now,
&it->it_time.it_value);
break;
default:
break;
}
}
error = itimer_settime(it);
if (error == ERESTART) {
KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid));
goto restart;
}
KASSERT(error == 0);
out:
itimer_unlock();
if (spare != NULL)
kmem_free(spare, sizeof(*spare));
return error;
}
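/*
 * Illustrative userland sketch (not part of this file): the classic BSD
 * interface handled by dosetitimer() above - a repeating one-second
 * ITIMER_REAL timer that delivers SIGALRM.  The signal handler is
 * user-supplied and hypothetical here.
 *
 *	#include <sys/time.h>
 *	#include <signal.h>
 *
 *	struct itimerval itv;
 *
 *	signal(SIGALRM, handler);
 *	itv.it_value.tv_sec = 1;
 *	itv.it_value.tv_usec = 0;
 *	itv.it_interval = itv.it_value;
 *	setitimer(ITIMER_REAL, &itv, NULL);
 */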
/*
* ptimer_tick:
*
* Called from hardclock() to decrement per-process virtual timers.
*/
void
ptimer_tick(lwp_t *l, bool user)
{
struct ptimers *pts;
struct itimer *it;
proc_t *p;
p = l->l_proc;
if (p->p_timers == NULL)
return;
itimer_lock();
if ((pts = l->l_proc->p_timers) != NULL) {
/*
* Run current process's virtual and profile time, as needed.
*/
if (user && (it = LIST_FIRST(&pts->pts_virtual)) != NULL)
if (itimer_decr(it, tick * 1000))
(*it->it_ops->ito_fire)(it);
if ((it = LIST_FIRST(&pts->pts_prof)) != NULL)
if (itimer_decr(it, tick * 1000))
(*it->it_ops->ito_fire)(it);
}
itimer_unlock();
}
/*
* ptimer_intr:
*
* Software interrupt handler for processing per-process
* timer expiration.
*/
static void
ptimer_intr(void *cookie)
{
ksiginfo_t ksi;
struct itimer *it;
struct ptimer *pt;
proc_t *p;
mutex_enter(&proc_lock);
itimer_lock();
while ((pt = TAILQ_FIRST(&ptimer_queue)) != NULL) {
it = &pt->pt_itimer;
TAILQ_REMOVE(&ptimer_queue, pt, pt_chain);
KASSERT(pt->pt_queued);
pt->pt_queued = false;
p = pt->pt_proc;
if (p->p_timers == NULL) {
/* Process is dying. */
continue;
}
if (pt->pt_ev.sigev_notify != SIGEV_SIGNAL) {
continue;
}
if (sigismember(&p->p_sigpend.sp_set, pt->pt_ev.sigev_signo)) {
it->it_overruns++;
continue;
}
KSI_INIT(&ksi);
ksi.ksi_signo = pt->pt_ev.sigev_signo;
ksi.ksi_code = SI_TIMER;
ksi.ksi_value = pt->pt_ev.sigev_value;
pt->pt_poverruns = it->it_overruns;
it->it_overruns = 0;
itimer_unlock();
kpsignal(p, &ksi, NULL);
itimer_lock();
}
itimer_unlock();
mutex_exit(&proc_lock);
}
/* $NetBSD: subr_lwp_specificdata.c,v 1.4 2019/05/17 03:34:26 ozaki-r Exp $ */
/*-
* Copyright (c) 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#define _LWP_API_PRIVATE
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_lwp_specificdata.c,v 1.4 2019/05/17 03:34:26 ozaki-r Exp $");
#include <sys/param.h>
#include <sys/lwp.h>
#include <sys/specificdata.h>
static specificdata_domain_t lwp_specificdata_domain;
void
lwpinit_specificdata(void)
{
lwp_specificdata_domain = specificdata_domain_create();
KASSERT(lwp_specificdata_domain != NULL);
}
/*
* lwp_specific_key_create --
* Create a key for subsystem lwp-specific data.
*/
int
lwp_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
{
return (specificdata_key_create(lwp_specificdata_domain, keyp, dtor));
}
/*
* lwp_specific_key_delete --
* Delete a key for subsystem lwp-specific data.
*/
void
lwp_specific_key_delete(specificdata_key_t key)
{
specificdata_key_delete(lwp_specificdata_domain, key);
}
/*
* lwp_initspecific --
* Initialize an LWP's specificdata container.
*/
void
lwp_initspecific(struct lwp *l)
{
int error __diagused;
error = specificdata_init(lwp_specificdata_domain, &l->l_specdataref);
KASSERT(error == 0);
}
/*
* lwp_finispecific --
* Finalize an LWP's specificdata container.
*/
void
lwp_finispecific(struct lwp *l)
{
specificdata_fini(lwp_specificdata_domain, &l->l_specdataref);
}
/*
* lwp_getspecific --
* Return lwp-specific data corresponding to the specified key.
*
* Note: LWP specific data is NOT INTERLOCKED. An LWP should access
* only its OWN SPECIFIC DATA. If it is necessary to access another
* LWP's specific data, care must be taken to ensure that doing so
* would not cause internal data structure inconsistency (i.e. caller
* can guarantee that the target LWP is not inside an lwp_getspecific()
* or lwp_setspecific() call).
*/
void *
lwp_getspecific(specificdata_key_t key)
{
return (specificdata_getspecific_unlocked(lwp_specificdata_domain,
&curlwp->l_specdataref, key));
}
void *
_lwp_getspecific_by_lwp(struct lwp *l, specificdata_key_t key)
{
return (specificdata_getspecific_unlocked(lwp_specificdata_domain,
&l->l_specdataref, key));
}
/*
* lwp_setspecific --
* Set lwp-specific data corresponding to the specified key.
*/
void
lwp_setspecific(specificdata_key_t key, void *data)
{
specificdata_setspecific(lwp_specificdata_domain,
&curlwp->l_specdataref, key, data);
}
void
lwp_setspecific_by_lwp(struct lwp *l, specificdata_key_t key, void *data)
{
specificdata_setspecific(lwp_specificdata_domain,
&l->l_specdataref, key, data);
}
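/*
 * Illustrative kernel sketch (not part of the original source): how a
 * subsystem would use the interface above.  The key variable and the data
 * pointer are hypothetical.
 *
 *	static specificdata_key_t example_key;
 *	void *data;
 *	int error;
 *
 *	error = lwp_specific_key_create(&example_key, NULL);
 *	...
 *	lwp_setspecific(example_key, data);	stash per-LWP data
 *	data = lwp_getspecific(example_key);	and get it back later
 *	...
 *	lwp_specific_key_delete(example_key);	when the subsystem detaches
 */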
/* $NetBSD: uvm_pgflcache.c,v 1.6 2020/10/18 18:31:31 chs Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_pgflcache.c: page freelist cache.
*
* This implements a tiny per-CPU cache of pages that sits between the main
* page allocator and the freelists. By allocating and freeing pages in
* batch, it reduces freelist contention by an order of magnitude.
*
* The cache can be paused & resumed at runtime so that UVM_HOTPLUG,
* uvm_pglistalloc() and uvm_page_redim() can have a consistent view of the
* world.  On systems with one CPU per physical package (e.g. a uniprocessor),
* the cache is not enabled.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pgflcache.c,v 1.6 2020/10/18 18:31:31 chs Exp $");
#include "opt_uvm.h"
#include "opt_multiprocessor.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pglist.h>
#include <uvm/uvm_pgflcache.h>
/* There is no point doing any of this on a uniprocessor. */
#ifdef MULTIPROCESSOR
/*
* MAXPGS - maximum pages per color, per bucket.
* FILLPGS - number of pages to allocate at once, per color, per bucket.
*
* Why the chosen values:
*
* (1) In 2019, an average Intel system has 4kB pages and 8x L2 cache
* colors. We make the assumption that most of the time allocation activity
* will be centered around one UVM freelist, so most of the time there will
* be no more than 224kB worth of cached pages per-CPU. That's tiny, but
* enough to hugely reduce contention on the freelist locks, and give us a
* small pool of pages which if we're very lucky may have some L1/L2 cache
* locality, and do so without subtracting too much from the L2/L3 cache
* benefits of having per-package free lists in the page allocator.
*
* (2) With the chosen values on _LP64, the data structure for each color
* takes up a single cache line (64 bytes), giving it very low overhead
* even in the "miss" case.
*
* (3) We don't want to cause too much pressure by hiding away memory that
* could otherwise be put to good use.
*/
#define MAXPGS 7
#define FILLPGS 6
/* Variable size, according to # colors. */
struct pgflcache {
struct pccolor {
intptr_t count;
struct vm_page *pages[MAXPGS];
} color[1];
};
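/*
 * Illustrative arithmetic (not part of the original source): on _LP64,
 * struct pccolor is sizeof(intptr_t) + MAXPGS * sizeof(struct vm_page *)
 * = 8 + 7 * 8 = 64 bytes, i.e. exactly one cache line per color.  With
 * 4 kB pages and 8 colors, a full cache holds at most MAXPGS * 8 * 4 kB
 * = 224 kB of pages per freelist, per CPU, matching the figures in the
 * comment above.
 */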
static kmutex_t uvm_pgflcache_lock;
static int uvm_pgflcache_sem;
/*
* uvm_pgflcache_fill: fill specified freelist/color from global list
*
* => must be called at IPL_VM
* => must be called with given bucket lock held
* => must only fill from the correct bucket for this CPU
*/
void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
struct pgflbucket *pgb;
struct pgflcache *pc;
struct pccolor *pcc;
struct pgflist *head;
struct vm_page *pg;
int count;
KASSERT(mutex_owned(&uvm_freelist_locks[b].lock));
KASSERT(ucpu->pgflbucket == b);
/* If caching is off, then bail out. */
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return;
}
/* Fill only to the limit. */
pcc = &pc->color[c];
pgb = uvm.page_free[fl].pgfl_buckets[b];
head = &pgb->pgb_colors[c];
if (pcc->count >= FILLPGS) {
return;
}
/* Pull pages from the bucket until it's empty, or we are full. */
count = pcc->count;
pg = LIST_FIRST(head);
while (__predict_true(pg != NULL && count < FILLPGS)) {
KASSERT(pg->flags & PG_FREE);
KASSERT(uvm_page_get_bucket(pg) == b);
pcc->pages[count++] = pg;
pg = LIST_NEXT(pg, pageq.list);
}
/* Violate LIST abstraction to remove all pages at once. */
head->lh_first = pg;
if (__predict_true(pg != NULL)) {
pg->pageq.list.le_prev = &head->lh_first;
}
pgb->pgb_nfree -= (count - pcc->count);
CPU_COUNT(CPU_COUNT_FREEPAGES, -(count - pcc->count));
pcc->count = count;
}
/*
* uvm_pgflcache_spill: spill specified freelist/color to global list
*
* => must be called at IPL_VM
* => mark __noinline so we don't pull it into uvm_pgflcache_free()
*/
static void __noinline
uvm_pgflcache_spill(struct uvm_cpu *ucpu, int fl, int c)
{
struct pgflbucket *pgb;
struct pgfreelist *pgfl;
struct pgflcache *pc;
struct pccolor *pcc;
struct pgflist *head;
kmutex_t *lock;
int b, adj;
pc = ucpu->pgflcache[fl];
pcc = &pc->color[c];
pgfl = &uvm.page_free[fl];
b = ucpu->pgflbucket;
pgb = pgfl->pgfl_buckets[b];
head = &pgb->pgb_colors[c];
lock = &uvm_freelist_locks[b].lock;
mutex_spin_enter(lock);
for (adj = pcc->count; pcc->count != 0;) {
pcc->count--;
KASSERT(pcc->pages[pcc->count] != NULL);
KASSERT(pcc->pages[pcc->count]->flags & PG_FREE);
LIST_INSERT_HEAD(head, pcc->pages[pcc->count], pageq.list);
}
pgb->pgb_nfree += adj;
CPU_COUNT(CPU_COUNT_FREEPAGES, adj);
mutex_spin_exit(lock);
}
/*
* uvm_pgflcache_alloc: try to allocate a cached page.
*
* => must be called at IPL_VM
* => allocate only from the given freelist and given page color
*/
struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
struct pgflcache *pc;
struct pccolor *pcc;
struct vm_page *pg;
/* If caching is off, then bail out. */
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return NULL;
}
/* Very simple: if we have a page then return it. */
pcc = &pc->color[c];
if (__predict_false(pcc->count == 0)) {
return NULL;
}
pg = pcc->pages[--(pcc->count)];
KASSERT(pg != NULL);
KASSERT(pg->flags == PG_FREE);
KASSERT(uvm_page_get_freelist(pg) == fl);
KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
return pg;
}
/*
* uvm_pgflcache_free: cache a page, if possible.
*
* => must be called at IPL_VM
* => must only send pages for the correct bucket for this CPU
*/
bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{
struct pgflcache *pc;
struct pccolor *pcc;
int fl, c;
KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
/* If caching is off, then bail out. */
fl = uvm_page_get_freelist(pg);
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return false;
}
/* If the array is full spill it first, then add page to array. */
c = VM_PGCOLOR(pg);
pcc = &pc->color[c];
KASSERT((pg->flags & PG_FREE) == 0);
if (__predict_false(pcc->count == MAXPGS)) {
uvm_pgflcache_spill(ucpu, fl, c);
}
pg->flags = PG_FREE;
pcc->pages[pcc->count] = pg;
pcc->count++;
return true;
}
/*
* uvm_pgflcache_init: allocate and initialize per-CPU data structures for
* the free page cache. Don't set anything in motion - that's taken care
* of by uvm_pgflcache_resume().
*/
static void
uvm_pgflcache_init_cpu(struct cpu_info *ci)
{
struct uvm_cpu *ucpu;
size_t sz;
ucpu = ci->ci_data.cpu_uvm;
KASSERT(ucpu->pgflcachemem == NULL);
KASSERT(ucpu->pgflcache[0] == NULL);
sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
ucpu->pgflcachememsz =
(roundup2(sz * VM_NFREELIST, coherency_unit) + coherency_unit - 1);
ucpu->pgflcachemem = kmem_zalloc(ucpu->pgflcachememsz, KM_SLEEP);
}
/*
* uvm_pgflcache_fini_cpu: dump all cached pages back to global free list
* and shut down caching on the CPU. Called on each CPU in the system via
* xcall.
*/
static void
uvm_pgflcache_fini_cpu(void *arg1 __unused, void *arg2 __unused)
{
struct uvm_cpu *ucpu;
int fl, color, s;
ucpu = curcpu()->ci_data.cpu_uvm;
for (fl = 0; fl < VM_NFREELIST; fl++) {
s = splvm();
for (color = 0; color < uvmexp.ncolors; color++) {
uvm_pgflcache_spill(ucpu, fl, color);
}
ucpu->pgflcache[fl] = NULL;
splx(s);
}
}
/*
* uvm_pgflcache_pause: pause operation of the caches
*/
void
uvm_pgflcache_pause(void)
{
uint64_t where;
/* First one in starts draining. Everyone else waits. */
mutex_enter(&uvm_pgflcache_lock);
if (uvm_pgflcache_sem++ == 0) {
where = xc_broadcast(XC_HIGHPRI, uvm_pgflcache_fini_cpu,
(void *)1, NULL);
xc_wait(where);
}
mutex_exit(&uvm_pgflcache_lock);
}
/*
* uvm_pgflcache_resume: resume operation of the caches
*/
void
uvm_pgflcache_resume(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct uvm_cpu *ucpu;
uintptr_t addr;
size_t sz;
int fl;
/* Last guy out takes care of business. */
mutex_enter(&uvm_pgflcache_lock);
KASSERT(uvm_pgflcache_sem > 0);
if (uvm_pgflcache_sem-- > 1) {
mutex_exit(&uvm_pgflcache_lock);
return;
}
/*
* Make sure dependent data structure updates are remotely visible.
* Essentially this functions as a global memory barrier.
*/
xc_barrier(XC_HIGHPRI);
/*
* Then set all of the pointers in place on each CPU. As soon as
* each pointer is set, caching is operational in that dimension.
*/
sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
for (CPU_INFO_FOREACH(cii, ci)) {
ucpu = ci->ci_data.cpu_uvm;
addr = roundup2((uintptr_t)ucpu->pgflcachemem, coherency_unit);
for (fl = 0; fl < VM_NFREELIST; fl++) {
ucpu->pgflcache[fl] = (struct pgflcache *)addr;
addr += sz;
}
}
mutex_exit(&uvm_pgflcache_lock);
}
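/*
 * Illustrative sketch (not part of the original source): callers that need
 * a stable view of the freelists (e.g. uvm_pglistalloc(), per the comment
 * at the top of this file) bracket their work with pause/resume:
 *
 *	uvm_pgflcache_pause();
 *	...scan or rearrange the global freelists...
 *	uvm_pgflcache_resume();
 */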
/*
* uvm_pgflcache_start: start operation of the cache.
*
* => called once only, when init(8) is about to be started
*/
void
uvm_pgflcache_start(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
KASSERT(uvm_pgflcache_sem > 0);
/*
* There's not much point doing this if every CPU has its own
* bucket (and that includes the uniprocessor case).
*/
if (ncpu == uvm.bucketcount) {
return;
}
/* Create data structures for each CPU. */
for (CPU_INFO_FOREACH(cii, ci)) {
uvm_pgflcache_init_cpu(ci);
}
/* Kick it into action. */
uvm_pgflcache_resume();
}
/*
* uvm_pgflcache_init: set up data structures for the free page cache.
*/
void
uvm_pgflcache_init(void)
{
uvm_pgflcache_sem = 1;
mutex_init(&uvm_pgflcache_lock, MUTEX_DEFAULT, IPL_NONE);
}
#else /* MULTIPROCESSOR */
struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
return NULL;
}
bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{
return false;
}
void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
}
void
uvm_pgflcache_pause(void)
{
}
void
uvm_pgflcache_resume(void)
{
}
void
uvm_pgflcache_start(void)
{
}
void
uvm_pgflcache_init(void)
{
}
#endif /* MULTIPROCESSOR */
/* $NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $ */
/*
* Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
* from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $");
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"
#include "opt_vmswap.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/conf.h>
#include <sys/cprng.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vmem.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/kmem.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>
#include <uvm/uvm.h>
#include <miscfs/specfs/specdev.h>
#include <crypto/aes/aes.h>
#include <crypto/aes/aes_cbc.h>
/*
* uvm_swap.c: manage configuration and i/o to swap space.
*/
/*
* swap space is managed in the following way:
*
* each swap partition or file is described by a "swapdev" structure.
* each "swapdev" structure contains a "swapent" structure which contains
* information that is passed up to the user (via system calls).
*
* each swap partition is assigned a "priority" (int) which controls
* swap partition usage.
*
* the system maintains a global data structure describing all swap
* partitions/files. there is a sorted LIST of "swappri" structures
* which describe "swapdev"'s at that priority. this LIST is headed
* by the "swap_priority" global var. each "swappri" contains a
* TAILQ of "swapdev" structures at that priority.
*
* locking:
* - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
* system call and prevents the swap priority list from changing
* while we are in the middle of a system call (e.g. SWAP_STATS).
* - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
* structures including the priority list, the swapdev structures,
* and the swapmap arena.
*
* each swap device has the following info:
* - swap device in use (could be disabled, preventing future use)
* - swap enabled (allows new allocations on swap)
* - map info in /dev/drum
* - vnode pointer
* for swap files only:
* - block size
* - max byte count in buffer
* - buffer
*
* userland controls and configures swap with the swapctl(2) system call.
* the sys_swapctl performs the following operations:
* [1] SWAP_NSWAP: returns the number of swap devices currently configured
* [2] SWAP_STATS: given a pointer to an array of swapent structures
* (passed in via "arg") of a size passed in via "misc" ... we load
* the current swap config into the array. The actual work is done
* in the uvm_swap_stats() function.
* [3] SWAP_ON: given a pathname in arg (could be device or file) and a
* priority in "misc", start swapping on it.
* [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
* [5] SWAP_CTL: changes the priority of a swap device (new priority in
* "misc")
*/
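/*
* illustrative sketch (comment only, not compiled here): how a userland
* program might drive the operations listed above through swapctl(2).
* the device path and priorities below are made up for the example;
* swapctl(8) is the real consumer and adds option parsing and error
* reporting.
*
*	#include <unistd.h>
*	#include <sys/swap.h>
*
*	int n = swapctl(SWAP_NSWAP, NULL, 0);	[number of devices]
*	swapctl(SWAP_ON, "/dev/wd0b", 0);	[start swapping, priority 0]
*	swapctl(SWAP_CTL, "/dev/wd0b", 5);	[change priority to 5]
*	swapctl(SWAP_OFF, "/dev/wd0b", 0);	[stop swapping to it]
*/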
/*
* swapdev: describes a single swap partition/file
*
* note the following should be true:
* swd_inuse <= swd_nblks [number of blocks in use is <= total blocks]
* swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
*/
struct swapdev {
dev_t swd_dev; /* device id */
int swd_flags; /* flags:inuse/enable/fake */
int swd_priority; /* our priority */
int swd_nblks; /* blocks in this device */
char *swd_path; /* saved pathname of device */
int swd_pathlen; /* length of pathname */
int swd_npages; /* #pages we can use */
int swd_npginuse; /* #pages in use */
int swd_npgbad; /* #pages bad */
int swd_drumoffset; /* page0 offset in drum */
int swd_drumsize; /* #pages in drum */
blist_t swd_blist; /* blist for this swapdev */
struct vnode *swd_vp; /* backing vnode */
TAILQ_ENTRY(swapdev) swd_next; /* priority tailq */
int swd_bsize; /* blocksize (bytes) */
int swd_maxactive; /* max active i/o reqs */
struct bufq_state *swd_tab; /* buffer list */
int swd_active; /* number of active buffers */
volatile uint32_t *swd_encmap; /* bitmap of encrypted slots */
struct aesenc swd_enckey; /* AES key expanded for enc */
struct aesdec swd_deckey; /* AES key expanded for dec */
bool swd_encinit; /* true if keys initialized */
};
/*
* swap device priority entry; the list is kept sorted on `spi_priority'.
*/
struct swappri {
int spi_priority; /* priority */
TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
/* tailq of swapdevs at this priority */
LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */
};
/*
* The following two structures are used to keep track of data transfers
* on swap devices associated with regular files.
* NOTE: this code is more or less a copy of vnd.c; we use the same
* structure names here to ease porting.
*/
struct vndxfer {
struct buf *vx_bp; /* Pointer to parent buffer */
struct swapdev *vx_sdp;
int vx_error;
int vx_pending; /* # of pending aux buffers */
int vx_flags;
#define VX_BUSY 1
#define VX_DEAD 2
};
struct vndbuf {
struct buf vb_buf;
struct vndxfer *vb_xfer;
};
/*
* We keep a pool of vndbuf's and vndxfer structures.
*/
static struct pool vndxfer_pool, vndbuf_pool;
/*
* local variables
*/
static vmem_t *swapmap; /* controls the mapping of /dev/drum */
/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;
/* locks */
static kmutex_t uvm_swap_data_lock __cacheline_aligned;
static krwlock_t swap_syscall_lock;
bool uvm_swap_init_done = false;
/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;
/* tuneables */
u_int uvm_swapisfull_factor = 99;
#if VMSWAP_DEFAULT_PLAINTEXT
bool uvm_swap_encrypt = false;
#else
bool uvm_swap_encrypt = true;
#endif
/*
* prototypes
*/
static struct swapdev *swapdrum_getsdp(int);
static struct swapdev *swaplist_find(struct vnode *, bool);
static void swaplist_insert(struct swapdev *,
struct swappri *, int);
static void swaplist_trim(void);
static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);
static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);
static int uvm_swap_io(struct vm_page **, int, int, int);
static void uvm_swap_genkey(struct swapdev *);
static void uvm_swap_encryptpage(struct swapdev *, void *, int);
static void uvm_swap_decryptpage(struct swapdev *, void *, int);
static size_t
encmap_size(size_t npages)
{
struct swapdev *sdp;
const size_t bytesperword = sizeof(sdp->swd_encmap[0]);
const size_t bitsperword = NBBY * bytesperword;
const size_t nbits = npages; /* one bit for each page */
const size_t nwords = howmany(nbits, bitsperword);
const size_t nbytes = nwords * bytesperword;
return nbytes;
}
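/*
* worked example (illustrative): with NBBY == 8 and 32-bit map words,
* bitsperword is 32, so each word covers 32 pages. for npages == 100
* the function computes nwords = howmany(100, 32) = 4 and returns
* 4 * 4 = 16 bytes of bitmap.
*/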
/*
* uvm_swap_init: init the swap system data structures and locks
*
* => called at boot time from init_main.c after the filesystems
* are brought up (which happens after uvm_init())
*/
void
uvm_swap_init(void)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLED(pdhist);
/*
* first, init the swap list, its counter, and its lock.
* then get a handle on the vnode for /dev/drum by using
* its dev_t number ("swapdev", from MD conf.c).
*/
LIST_INIT(&swap_priority);
uvmexp.nswapdev = 0;
rw_init(&swap_syscall_lock);
mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
if (bdevvp(swapdev, &swapdev_vp))
panic("%s: can't get vnode for swap device", __func__);
if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
panic("%s: can't lock swap device", __func__);
if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
panic("%s: can't open swap device", __func__);
VOP_UNLOCK(swapdev_vp);
/*
* create swap block resource map to map /dev/drum. the range
* from 1 to INT_MAX allows 2 gigablocks of swap space. note
* that block 0 is reserved (used to indicate an allocation
* failure, or no allocation).
*/
swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
VM_NOSLEEP, IPL_NONE);
if (swapmap == 0) {
panic("%s: vmem_create failed", __func__);
}
pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
NULL, IPL_BIO);
pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
NULL, IPL_BIO);
uvm_swap_init_done = true;
UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}
/*
* swaplist functions: functions that operate on the list of swap
* devices on the system.
*/
/*
* swaplist_insert: insert swap device "sdp" into the global list
*
* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
* => caller must provide a newly allocated swappri structure (we will
* FREE it if we don't need it... this is to prevent allocation
* blocking here while adding swap)
*/
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
struct swappri *spp, *pspp;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
KASSERT(rw_write_held(&swap_syscall_lock));
KASSERT(mutex_owned(&uvm_swap_data_lock));
/*
* find entry at or after which to insert the new device.
*/
pspp = NULL;
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
if (priority <= spp->spi_priority)
break;
pspp = spp;
}
/*
* new priority?
*/
if (spp == NULL || spp->spi_priority != priority) {
spp = newspp; /* use newspp! */
UVMHIST_LOG(pdhist, "created new swappri = %jd",
priority, 0, 0, 0);
spp->spi_priority = priority;
TAILQ_INIT(&spp->spi_swapdev);
if (pspp)
LIST_INSERT_AFTER(pspp, spp, spi_swappri);
else
LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
} else {
/* we don't need a new priority structure, free it */
kmem_free(newspp, sizeof(*newspp));
}
/*
* priority found (or created). now insert on the priority's
* tailq list and bump the total number of swapdevs.
*/
sdp->swd_priority = priority;
TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
uvmexp.nswapdev++;
}
/*
* swaplist_find: find and optionally remove a swap device from the
* global list.
*
* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
* => we return the swapdev we found (and removed)
*/
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
struct swapdev *sdp;
struct swappri *spp;
KASSERT(rw_lock_held(&swap_syscall_lock));
KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1);
KASSERT(mutex_owned(&uvm_swap_data_lock));
/*
* search the lists for the requested vp
*/
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_vp == vp) {
if (remove) {
TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
uvmexp.nswapdev--;
}
return(sdp);
}
}
}
return (NULL);
}
/*
* swaplist_trim: scan priority list for empty priority entries and kill
* them.
*
* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
*/
static void
swaplist_trim(void)
{
struct swappri *spp, *nextspp;
KASSERT(rw_write_held(&swap_syscall_lock));
KASSERT(mutex_owned(&uvm_swap_data_lock));
LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
if (!TAILQ_EMPTY(&spp->spi_swapdev))
continue;
LIST_REMOVE(spp, spi_swappri);
kmem_free(spp, sizeof(*spp));
}
}
/*
* swapdrum_getsdp: given a page offset in /dev/drum, convert it back
* to the "swapdev" that maps that section of the drum.
*
* => each swapdev takes one big contig chunk of the drum
* => caller must hold uvm_swap_data_lock
*/
static struct swapdev *
swapdrum_getsdp(int pgno)
{
struct swapdev *sdp;
struct swappri *spp;
KASSERT(mutex_owned(&uvm_swap_data_lock));
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_flags & SWF_FAKE)
continue;
if (pgno >= sdp->swd_drumoffset &&
pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
return sdp;
}
}
}
return NULL;
}
/*
* swapdrum_sdp_is: true iff the swap device for pgno is sdp
*
* => for use in positive assertions only; result is not stable
*/
static bool __debugused
swapdrum_sdp_is(int pgno, struct swapdev *sdp)
{
bool result;
mutex_enter(&uvm_swap_data_lock);
result = swapdrum_getsdp(pgno) == sdp;
mutex_exit(&uvm_swap_data_lock);
return result;
}
void
swapsys_lock(krw_t op)
{
rw_enter(&swap_syscall_lock, op);
}
void
swapsys_unlock(void)
{
rw_exit(&swap_syscall_lock);
}
static void
swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse)
{
se->se_dev = sdp->swd_dev;
se->se_flags = sdp->swd_flags;
se->se_nblks = sdp->swd_nblks;
se->se_inuse = inuse;
se->se_priority = sdp->swd_priority;
KASSERT(sdp->swd_pathlen < sizeof(se->se_path));
strcpy(se->se_path, sdp->swd_path);
}
int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) =
(void *)enosys;
int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) =
(void *)enosys;
/*
* sys_swapctl: main entry point for swapctl(2) system call
* [with two helper functions: swap_on and swap_off]
*/
int
sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
{
/* {
syscallarg(int) cmd;
syscallarg(void *) arg;
syscallarg(int) misc;
} */
struct vnode *vp;
struct nameidata nd;
struct swappri *spp;
struct swapdev *sdp;
#define SWAP_PATH_MAX (PATH_MAX + 1)
char *userpath;
size_t len = 0;
int error;
int priority;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* we handle the non-priv NSWAP and STATS request first.
*
* SWAP_NSWAP: return number of config'd swap devices
* [can also be obtained with uvmexp sysctl]
*/
if (SCARG(uap, cmd) == SWAP_NSWAP) {
const int nswapdev = uvmexp.nswapdev;
UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
0, 0, 0);
*retval = nswapdev;
return 0;
}
userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);
/*
* ensure serialized syscall access by grabbing the swap_syscall_lock
*/
rw_enter(&swap_syscall_lock, RW_WRITER);
/*
* SWAP_STATS: get stats on current # of configured swap devs
*
* note that the swap_priority list can't change as long
* as we are holding the swap_syscall_lock. we don't want
* to grab the uvm_swap_data_lock because we may fault&sleep during
* copyout() and we don't want to be holding that lock then!
*/
switch (SCARG(uap, cmd)) {
case SWAP_STATS13:
error = (*uvm_swap_stats13)(uap, retval);
goto out;
case SWAP_STATS50:
error = (*uvm_swap_stats50)(uap, retval);
goto out;
case SWAP_STATS:
error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
NULL, sizeof(struct swapent), retval);
UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
goto out;
case SWAP_GETDUMPDEV:
error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
goto out;
default:
break;
}
/*
* all other requests require superuser privs. verify.
*/
if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
0, NULL, NULL, NULL)))
goto out;
if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
/* drop the current dump device */
dumpdev = NODEV;
dumpcdev = NODEV;
cpu_dumpconf();
goto out;
}
/*
* at this point we expect a path name in arg. we will
* use namei() to gain a vnode reference (vref), and lock
* the vnode (VOP_LOCK).
*
* XXX: a NULL arg means use the root vnode pointer (e.g. for
* miniroot)
*/
if (SCARG(uap, arg) == NULL) {
vp = rootvp; /* miniroot */
vref(vp);
if (vn_lock(vp, LK_EXCLUSIVE)) {
vrele(vp);
error = EBUSY;
goto out;
}
if (SCARG(uap, cmd) == SWAP_ON &&
copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
panic("swapctl: miniroot copy failed");
} else {
struct pathbuf *pb;
/*
* This used to allow copying in one extra byte
* (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
* This was completely pointless because if anyone
* used that extra byte namei would fail with
* ENAMETOOLONG anyway, so I've removed the excess
* logic. - dholland 20100215
*/
error = pathbuf_copyin(SCARG(uap, arg), &pb);
if (error) {
goto out;
}
if (SCARG(uap, cmd) == SWAP_ON) {
/* get a copy of the string */
pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
len = strlen(userpath) + 1;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
if ((error = namei(&nd))) {
pathbuf_destroy(pb);
goto out;
}
vp = nd.ni_vp;
pathbuf_destroy(pb);
}
/* note: "vp" is referenced and locked */
error = 0; /* assume no error */
switch(SCARG(uap, cmd)) {
case SWAP_DUMPDEV:
if (vp->v_type != VBLK) {
error = ENOTBLK;
break;
}
if (bdevsw_lookup(vp->v_rdev)) {
dumpdev = vp->v_rdev;
dumpcdev = devsw_blk2chr(dumpdev);
} else
dumpdev = NODEV;
cpu_dumpconf();
break;
case SWAP_CTL:
/*
* get new priority, remove old entry (if any) and then
* reinsert it in the correct place. finally, prune out
* any empty priority structures.
*/
priority = SCARG(uap, misc);
spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
mutex_enter(&uvm_swap_data_lock);
if ((sdp = swaplist_find(vp, true)) == NULL) {
error = ENOENT;
} else {
swaplist_insert(sdp, spp, priority);
swaplist_trim();
}
mutex_exit(&uvm_swap_data_lock);
if (error)
kmem_free(spp, sizeof(*spp));
break;
case SWAP_ON:
/*
* check for duplicates. if none found, then insert a
* dummy entry on the list to prevent someone else from
* trying to enable this device while we are working on
* it.
*/
priority = SCARG(uap, misc);
sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
sdp->swd_flags = SWF_FAKE;
sdp->swd_vp = vp;
sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
mutex_enter(&uvm_swap_data_lock);
if (swaplist_find(vp, false) != NULL) {
error = EBUSY;
mutex_exit(&uvm_swap_data_lock);
bufq_free(sdp->swd_tab);
kmem_free(sdp, sizeof(*sdp));
kmem_free(spp, sizeof(*spp));
break;
}
swaplist_insert(sdp, spp, priority);
mutex_exit(&uvm_swap_data_lock);
KASSERT(len > 0);
sdp->swd_pathlen = len;
sdp->swd_path = kmem_alloc(len, KM_SLEEP);
if (copystr(userpath, sdp->swd_path, len, 0) != 0)
panic("swapctl: copystr");
/*
* we've now got a FAKE placeholder in the swap list.
* now attempt to enable swap on it. if we fail, undo
* what we've done and kill the fake entry we just inserted.
* if swap_on is a success, it will clear the SWF_FAKE flag
*/
if ((error = swap_on(l, sdp)) != 0) {
mutex_enter(&uvm_swap_data_lock);
(void) swaplist_find(vp, true); /* kill fake entry */
swaplist_trim();
mutex_exit(&uvm_swap_data_lock);
bufq_free(sdp->swd_tab);
kmem_free(sdp->swd_path, sdp->swd_pathlen);
kmem_free(sdp, sizeof(*sdp));
break;
}
break;
case SWAP_OFF:
mutex_enter(&uvm_swap_data_lock);
if ((sdp = swaplist_find(vp, false)) == NULL) {
mutex_exit(&uvm_swap_data_lock);
error = ENXIO;
break;
}
/*
* If a device isn't in use or enabled, we
* can't stop swapping from it (again).
*/
if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
mutex_exit(&uvm_swap_data_lock);
error = EBUSY;
break;
}
/*
* do the real work.
*/
error = swap_off(l, sdp);
break;
default:
error = EINVAL;
}
/*
* done! release the ref gained by namei() and unlock.
*/
vput(vp);
out:
rw_exit(&swap_syscall_lock);
kmem_free(userpath, SWAP_PATH_MAX);
UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0);
return (error);
}
/*
* uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
* away from sys_swapctl() in order to allow COMPAT_* swapctl()
* emulation to use it directly without going through sys_swapctl().
* The problem with using sys_swapctl() there is that it involves
* copying the swapent array to the stackgap, and this array's size
* is not known at build time. Hence it would not be possible to
* ensure it would fit in the stackgap in any case.
*/
int
uvm_swap_stats(char *ptr, int misc,
void (*f)(void *, const struct swapent *), size_t len,
register_t *retval)
{
struct swappri *spp;
struct swapdev *sdp;
struct swapent sep;
int count = 0;
int error;
KASSERT(len <= sizeof(sep));
if (len == 0)
return ENOSYS;
if (misc < 0)
return EINVAL;
if (misc == 0 || uvmexp.nswapdev == 0)
return 0;
/* Make sure userland cannot exhaust kernel memory */
if ((size_t)misc > (size_t)uvmexp.nswapdev)
misc = uvmexp.nswapdev;
KASSERT(rw_lock_held(&swap_syscall_lock));
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
int inuse;
if (misc-- <= 0)
break;
inuse = btodb((uint64_t)sdp->swd_npginuse <<
PAGE_SHIFT);
memset(&sep, 0, sizeof(sep));
swapent_cvt(&sep, sdp, inuse);
if (f)
(*f)(&sep, &sep);
if ((error = copyout(&sep, ptr, len)) != 0)
return error;
ptr += len;
count++;
}
}
*retval = count;
return 0;
}
/*
* swap_on: attempt to enable a swapdev for swapping. note that the
* swapdev is already on the global list, but disabled (marked
* SWF_FAKE).
*
* => we avoid the start of the disk (to protect disk labels)
* => we also avoid the miniroot, if we are swapping to root.
* => caller should leave uvm_swap_data_lock unlocked, we may lock it
* if needed.
*/
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
struct vnode *vp;
int error, npages, nblocks, size;
long addr;
vmem_addr_t result;
struct vattr va;
dev_t dev;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* we want to enable swapping on sdp. the swd_vp contains
* the vnode we want (locked and ref'd), and the swd_dev
* contains the dev_t of the file, if it is a block device.
*/
vp = sdp->swd_vp;
dev = sdp->swd_dev;
/*
* open the swap file (mostly useful for block device files to
* let device driver know what is up).
*
* we skip the open/close for root on swap because the root
* has already been opened when root was mounted (mountroot).
*/
if (vp != rootvp) {
if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
return (error);
}
/* XXX this only works for block devices */
UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);
/*
* we now need to determine the size of the swap area. for
* block specials we can call the d_psize function.
* for normal files, we must stat [get attrs].
*
* we put the result in nblks.
* for normal files, we also want the filesystem block size
* (which we get with statfs).
*/
switch (vp->v_type) {
case VBLK:
if ((nblocks = bdev_size(dev)) == -1) {
error = ENXIO;
goto bad;
}
break;
case VREG:
if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
goto bad;
nblocks = (int)btodb(va.va_size);
sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
/*
* limit the max # of outstanding I/O requests we issue
* at any one time. take it easy on NFS servers.
*/
if (vp->v_tag == VT_NFS)
sdp->swd_maxactive = 2; /* XXX */
else
sdp->swd_maxactive = 8; /* XXX */
break;
default:
error = ENXIO;
goto bad;
}
/*
* save nblocks in a safe place and convert to pages.
*/
sdp->swd_nblks = nblocks;
npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
/*
* for block special files, we want to make sure that we leave
* the disklabel and bootblocks alone, so we arrange to skip
* over them (arbitrarily choosing to skip PAGE_SIZE bytes).
* note that because of this the "size" can be less than the
* actual number of blocks on the device.
*/
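/*
* illustrative numbers, assuming 4 KiB pages: a 512 MiB block device
* gives npages = 131072; page 0 is left alone for the label/bootblocks,
* so addr = 1 and size = 131071 usable pages. a 512 MiB regular file
* would use all 131072 pages starting at addr = 0.
*/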
if (vp->v_type == VBLK) {
/* we use pages 1 to (size - 1) [inclusive] */
size = npages - 1;
addr = 1;
} else {
/* we use pages 0 to (size - 1) [inclusive] */
size = npages;
addr = 0;
}
/*
* make sure we have enough blocks for a reasonably sized swap
* area. we want at least one page.
*/
if (size < 1) {
UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0);
error = EINVAL;
goto bad;
}
UVMHIST_LOG(pdhist, " dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0);
/*
* now we need to allocate an extent to manage this swap device
*/
sdp->swd_blist = blist_create(npages);
/* mark all except the `saved' region free. */
blist_free(sdp->swd_blist, addr, size);
/*
* allocate space for swap encryption state and mark the
* keys uninitialized so we generate them lazily
*/
sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP);
sdp->swd_encinit = false;
/*
* if the vnode we are swapping to is the root vnode
* (i.e. we are swapping to the miniroot) then we want
* to make sure we don't overwrite it. do a statfs to
* find its size and skip over it.
*/
if (vp == rootvp) {
struct mount *mp;
struct statvfs *sp;
int rootblocks, rootpages;
mp = rootvnode->v_mount;
sp = &mp->mnt_stat;
rootblocks = sp->f_blocks * btodb(sp->f_frsize);
/*
* XXX: sp->f_blocks isn't the total number of
* blocks in the filesystem, it's the number of
* data blocks. so, our rootblocks almost
* definitely underestimates the total size
* of the filesystem - how badly depends on the
* details of the filesystem type. there isn't
* an obvious way to deal with this cleanly
* and perfectly, so for now we just pad our
* rootblocks estimate with an extra 5 percent.
*/
rootblocks += (rootblocks >> 5) +
(rootblocks >> 6) +
(rootblocks >> 7);
rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
if (rootpages > size)
panic("swap_on: miniroot larger than swap?");
if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
panic("swap_on: unable to preserve miniroot");
}
size -= rootpages;
printf("Preserved %d pages of miniroot ", rootpages);
printf("leaving %d pages of swap\n", size);
}
/*
* add a ref to vp to reflect usage as a swap device.
*/
vref(vp);
/*
* now add the new swapdev to the drum and enable.
*/
error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
if (error != 0)
panic("swapdrum_add");
/*
* If this is the first regular swap create the workqueue.
* => Protected by swap_syscall_lock.
*/
if (vp->v_type != VBLK) {
if (sw_reg_count++ == 0) {
KASSERT(sw_reg_workqueue == NULL);
if (workqueue_create(&sw_reg_workqueue, "swapiod",
sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
panic("%s: workqueue_create failed", __func__);
}
}
sdp->swd_drumoffset = (int)result;
sdp->swd_drumsize = npages;
sdp->swd_npages = size;
mutex_enter(&uvm_swap_data_lock);
sdp->swd_flags &= ~SWF_FAKE; /* going live */
sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
uvmexp.swpages += size;
uvmexp.swpgavail += size;
mutex_exit(&uvm_swap_data_lock);
return (0);
/*
* failure: clean up and return error.
*/
bad:
if (sdp->swd_blist) {
blist_destroy(sdp->swd_blist);
}
if (vp != rootvp) {
(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
}
return (error);
}
/*
* swap_off: stop swapping on swapdev
*
* => swap data should be locked, we will unlock.
*/
static int
swap_off(struct lwp *l, struct swapdev *sdp)
{
int npages = sdp->swd_npages;
int error = 0;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, " dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0);
KASSERT(rw_write_held(&swap_syscall_lock));
KASSERT(mutex_owned(&uvm_swap_data_lock));
/* disable the swap area being removed */
sdp->swd_flags &= ~SWF_ENABLE;
uvmexp.swpgavail -= npages;
mutex_exit(&uvm_swap_data_lock);
/*
* the idea is to find all the pages that are paged out to this
* device, and page them all in. in uvm, swap-backed pageable
* memory can take two forms: aobjs and anons. call the
* swapoff hook for each subsystem to bring in pages.
*/
if (uao_swap_off(sdp->swd_drumoffset,
sdp->swd_drumoffset + sdp->swd_drumsize) ||
amap_swap_off(sdp->swd_drumoffset,
sdp->swd_drumoffset + sdp->swd_drumsize)) {
error = ENOMEM;
} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
error = EBUSY;
}
if (error) {
mutex_enter(&uvm_swap_data_lock);
sdp->swd_flags |= SWF_ENABLE;
uvmexp.swpgavail += npages;
mutex_exit(&uvm_swap_data_lock);
return error;
}
/*
* If this is the last regular swap destroy the workqueue.
* => Protected by swap_syscall_lock.
*/
if (sdp->swd_vp->v_type != VBLK) {
KASSERT(sw_reg_count > 0);
KASSERT(sw_reg_workqueue != NULL);
if (--sw_reg_count == 0) {
workqueue_destroy(sw_reg_workqueue);
sw_reg_workqueue = NULL;
}
}
/*
* done with the vnode.
* drop our ref on the vnode before calling VOP_CLOSE()
* so that spec_close() can tell if this is the last close.
*/
vrele(sdp->swd_vp);
if (sdp->swd_vp != rootvp) {
(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
}
mutex_enter(&uvm_swap_data_lock);
uvmexp.swpages -= npages;
uvmexp.swpginuse -= sdp->swd_npgbad;
if (swaplist_find(sdp->swd_vp, true) == NULL)
panic("%s: swapdev not in list", __func__);
swaplist_trim();
mutex_exit(&uvm_swap_data_lock);
/*
* free all resources!
*/
vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
blist_destroy(sdp->swd_blist);
bufq_free(sdp->swd_tab);
kmem_free(__UNVOLATILE(sdp->swd_encmap),
encmap_size(sdp->swd_drumsize));
explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey);
explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey);
kmem_free(sdp, sizeof(*sdp));
return (0);
}
void
uvm_swap_shutdown(struct lwp *l)
{
struct swapdev *sdp;
struct swappri *spp;
struct vnode *vp;
int error;
if (!uvm_swap_init_done || uvmexp.nswapdev == 0)
return;
printf("turning off swap...");
rw_enter(&swap_syscall_lock, RW_WRITER);
mutex_enter(&uvm_swap_data_lock);
again:
LIST_FOREACH(spp, &swap_priority, spi_swappri)
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_flags & SWF_FAKE)
continue;
if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0)
continue;
#ifdef DEBUG
printf("\nturning off swap on %s...", sdp->swd_path);
#endif
/* Have to lock and reference vnode for swap_off(). */
vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY);
vref(vp);
error = swap_off(l, sdp);
vput(vp);
mutex_enter(&uvm_swap_data_lock);
if (error) {
printf("stopping swap on %s failed "
"with error %d\n", sdp->swd_path, error);
TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
uvmexp.nswapdev--;
swaplist_trim();
}
goto again;
}
printf(" done\n");
mutex_exit(&uvm_swap_data_lock);
rw_exit(&swap_syscall_lock);
}
/*
* /dev/drum interface and i/o functions
*/
/*
* swopen: allow the initial open from uvm_swap_init() and reject all others.
*/
static int
swopen(dev_t dev, int flag, int mode, struct lwp *l)
{
static bool inited = false;
if (!inited) {
inited = true;
return 0;
}
return ENODEV;
}
/*
* swstrategy: perform I/O on the drum
*
* => we must map the i/o request from the drum to the correct swapdev.
*/
static void
swstrategy(struct buf *bp)
{
struct swapdev *sdp;
struct vnode *vp;
int pageno, bn;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* convert block number to swapdev. note that swapdev can't
* be yanked out from under us because we are holding resources
* in it (i.e. the blocks we are doing I/O on).
*/
pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(pageno);
mutex_exit(&uvm_swap_data_lock);
if (sdp == NULL) {
bp->b_error = EINVAL;
bp->b_resid = bp->b_bcount;
biodone(bp);
UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0);
return;
}
/*
* convert drum page number to block number on this swapdev.
*/
pageno -= sdp->swd_drumoffset; /* page # on swapdev */
bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
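/*
* worked example (illustrative, assuming 512-byte disk blocks and
* 4 KiB pages): a request at drum b_blkno 2048 is drum page
* dbtob(2048) >> PAGE_SHIFT = 256; if this swapdev has swd_drumoffset
* 100, pageno becomes 156 and bn = btodb(156 << PAGE_SHIFT) = 1248 on
* the swapdev.
*/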
UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd",
((bp->b_flags & B_READ) == 0) ? 1 : 0,
sdp->swd_drumoffset, bn, bp->b_bcount);
/*
* for block devices we finish up here.
* for regular files we have to do more work which we delegate
* to sw_reg_strategy().
*/
vp = sdp->swd_vp; /* swapdev vnode pointer */
switch (vp->v_type) {
default:
panic("%s: vnode type 0x%x", __func__, vp->v_type);
case VBLK:
/*
* must convert "bp" from an I/O on /dev/drum to an I/O
* on the swapdev (sdp).
*/
bp->b_blkno = bn; /* swapdev block number */
bp->b_dev = sdp->swd_dev; /* swapdev dev_t */
/*
* if we are doing a write, we have to redirect the i/o on
* drum's v_numoutput counter to the swapdevs.
*/
if ((bp->b_flags & B_READ) == 0) {
mutex_enter(bp->b_objlock);
vwakeup(bp); /* kills one 'v_numoutput' on drum */
mutex_exit(bp->b_objlock);
mutex_enter(vp->v_interlock);
vp->v_numoutput++; /* put it on swapdev */
mutex_exit(vp->v_interlock);
}
/*
* finally plug in swapdev vnode and start I/O
*/
bp->b_vp = vp;
bp->b_objlock = vp->v_interlock;
VOP_STRATEGY(vp, bp);
return;
case VREG:
/*
* delegate to sw_reg_strategy function.
*/
sw_reg_strategy(sdp, bp, bn);
return;
}
/* NOTREACHED */
}
/*
* swread: the read function for the drum (just a call to physio)
*/
/*ARGSUSED*/
static int
swread(dev_t dev, struct uio *uio, int ioflag)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}
/*
* swwrite: the write function for the drum (just a call to physio)
*/
/*ARGSUSED*/
static int
swwrite(dev_t dev, struct uio *uio, int ioflag)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}
const struct bdevsw swap_bdevsw = {
.d_open = swopen,
.d_close = noclose,
.d_strategy = swstrategy,
.d_ioctl = noioctl,
.d_dump = nodump,
.d_psize = nosize,
.d_discard = nodiscard,
.d_flag = D_OTHER
};
const struct cdevsw swap_cdevsw = {
.d_open = nullopen,
.d_close = nullclose,
.d_read = swread,
.d_write = swwrite,
.d_ioctl = noioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER,
};
/*
* sw_reg_strategy: handle swap i/o to regular files
*/
static void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
struct vnode *vp;
struct vndxfer *vnx;
daddr_t nbn;
char *addr;
off_t byteoff;
int s, off, nra, error, sz, resid;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* allocate a vndxfer head for this transfer and point it to
* our buffer.
*/
vnx = pool_get(&vndxfer_pool, PR_WAITOK);
vnx->vx_flags = VX_BUSY;
vnx->vx_error = 0;
vnx->vx_pending = 0;
vnx->vx_bp = bp;
vnx->vx_sdp = sdp;
/*
* setup for main loop where we read filesystem blocks into
* our buffer.
*/
error = 0;
bp->b_resid = bp->b_bcount; /* nothing transferred yet! */
addr = bp->b_data; /* current position in buffer */
byteoff = dbtob((uint64_t)bn);
for (resid = bp->b_resid; resid; resid -= sz) {
struct vndbuf *nbp;
/*
* translate byteoffset into block number. return values:
* vp = vnode of underlying device
* nbn = new block number (on underlying vnode dev)
* nra = num blocks we can read-ahead (excludes requested
* block)
*/
nra = 0;
error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
&vp, &nbn, &nra);
if (error == 0 && nbn == (daddr_t)-1) {
/*
* this used to just set error, but that doesn't
* do the right thing. Instead, it causes random
* memory errors. The panic() should remain until
* this condition doesn't destabilize the system.
*/
#if 1
panic("%s: swap to sparse file", __func__);
#else
error = EIO; /* failure */
#endif
}
/*
* punt if there was an error or a hole in the file.
* we must wait for any i/o ops we have already started
* to finish before returning.
*
* XXX we could deal with holes here but it would be
* a hassle (in the write case).
*/
if (error) {
s = splbio();
vnx->vx_error = error; /* pass error up */
goto out;
}
/*
* compute the size ("sz") of this transfer (in bytes).
*/
off = byteoff % sdp->swd_bsize;
sz = (1 + nra) * sdp->swd_bsize - off;
if (sz > resid)
sz = resid;
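/*
* worked example (illustrative): with swd_bsize = 8192 and
* byteoff = 20480, off = 4096; if VOP_BMAP reported nra = 1, then
* sz = (1 + 1) * 8192 - 4096 = 12288 bytes, clamped just above to
* whatever remains of the request.
*/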
UVMHIST_LOG(pdhist, "sw_reg_strategy: "
"vp %#jx/%#jx offset %#jx/%#jx",
(uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);
/*
* now get a buf structure. note that the vb_buf is
* at the front of the nbp structure so that you can
* cast pointers between the two structures easily.
*/
nbp = pool_get(&vndbuf_pool, PR_WAITOK);
buf_init(&nbp->vb_buf);
nbp->vb_buf.b_flags = bp->b_flags;
nbp->vb_buf.b_cflags = bp->b_cflags;
nbp->vb_buf.b_oflags = bp->b_oflags;
nbp->vb_buf.b_bcount = sz;
nbp->vb_buf.b_bufsize = sz;
nbp->vb_buf.b_error = 0;
nbp->vb_buf.b_data = addr;
nbp->vb_buf.b_lblkno = 0;
nbp->vb_buf.b_blkno = nbn + btodb(off);
nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
nbp->vb_buf.b_iodone = sw_reg_biodone;
nbp->vb_buf.b_vp = vp;
nbp->vb_buf.b_objlock = vp->v_interlock;
if (vp->v_type == VBLK) {
nbp->vb_buf.b_dev = vp->v_rdev;
}
nbp->vb_xfer = vnx; /* patch it back in to vnx */
/*
* Just sort by block number
*/
s = splbio();
if (vnx->vx_error != 0) {
buf_destroy(&nbp->vb_buf);
pool_put(&vndbuf_pool, nbp);
goto out;
}
vnx->vx_pending++;
/* sort it in and start I/O if we are not over our limit */
/* XXXAD locking */
bufq_put(sdp->swd_tab, &nbp->vb_buf);
sw_reg_start(sdp);
splx(s);
/*
* advance to the next I/O
*/
byteoff += sz;
addr += sz;
}
s = splbio();
out: /* Arrive here at splbio */
vnx->vx_flags &= ~VX_BUSY;
if (vnx->vx_pending == 0) {
error = vnx->vx_error;
pool_put(&vndxfer_pool, vnx);
bp->b_error = error;
biodone(bp);
}
splx(s);
}
/*
* sw_reg_start: start an I/O request on the requested swapdev
*
* => reqs are sorted by b_rawblkno (above)
*/
static void
sw_reg_start(struct swapdev *sdp)
{
struct buf *bp;
struct vnode *vp;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/* recursion control */
if ((sdp->swd_flags & SWF_BUSY) != 0)
return;
sdp->swd_flags |= SWF_BUSY;
while (sdp->swd_active < sdp->swd_maxactive) {
bp = bufq_get(sdp->swd_tab);
if (bp == NULL)
break;
sdp->swd_active++;
UVMHIST_LOG(pdhist,
"sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %#jx",
(uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno,
bp->b_bcount);
vp = bp->b_vp;
KASSERT(bp->b_objlock == vp->v_interlock);
if ((bp->b_flags & B_READ) == 0) {
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
}
VOP_STRATEGY(vp, bp);
}
sdp->swd_flags &= ~SWF_BUSY;
}
/*
* sw_reg_biodone: one of our i/o's has completed
*/
static void
sw_reg_biodone(struct buf *bp)
{
workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
}
/*
* sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
*
* => note that we can recover the vndbuf struct by casting the buf ptr
*/
static void
sw_reg_iodone(struct work *wk, void *dummy)
{
struct vndbuf *vbp = (void *)wk;
struct vndxfer *vnx = vbp->vb_xfer;
struct buf *pbp = vnx->vx_bp; /* parent buffer */
struct swapdev *sdp = vnx->vx_sdp;
int s, resid, error;
KASSERT(&vbp->vb_buf.b_work == wk);
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, " vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx",
(uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
(uintptr_t)vbp->vb_buf.b_data);
UVMHIST_LOG(pdhist, " cnt=%#jx resid=%#jx",
vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
/*
* protect vbp at splbio and update.
*/
s = splbio();
resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
pbp->b_resid -= resid;
vnx->vx_pending--;
if (vbp->vb_buf.b_error != 0) {
/* pass error upward */
error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0);
vnx->vx_error = error;
}
/*
* kill vbp structure
*/
buf_destroy(&vbp->vb_buf);
pool_put(&vndbuf_pool, vbp);
/*
* wrap up this transaction if it has run to completion or, in
* case of an error, when all auxiliary buffers have returned.
*/
if (vnx->vx_error != 0) {
/* pass error upward */
error = vnx->vx_error;
if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
pbp->b_error = error;
biodone(pbp);
pool_put(&vndxfer_pool, vnx);
}
} else if (pbp->b_resid == 0) {
KASSERT(vnx->vx_pending == 0);
if ((vnx->vx_flags & VX_BUSY) == 0) {
UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !",
(uintptr_t)pbp, vnx->vx_error, 0, 0);
biodone(pbp);
pool_put(&vndxfer_pool, vnx);
}
}
/*
* done! start next swapdev I/O if one is pending
*/
sdp->swd_active--;
sw_reg_start(sdp);
splx(s);
}
/*
* uvm_swap_alloc: allocate space on swap
*
* => allocation is done "round robin" down the priority list, as we
* allocate in a priority we "rotate" the circle queue.
* => space can be freed with uvm_swap_free
* => we return the page slot number in /dev/drum (0 == invalid slot)
* => we lock uvm_swap_data_lock
* => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
*/
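/*
* illustrative walk-through of the rotation: if swapdevs A and B are
* both enabled at priority 0 with A at the head of the tailq, the first
* allocation is taken from A and A is moved to the tail, so the next
* attempt tries B first; devices at a worse priority are consulted only
* when every device at a better priority fails or is full.
*/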
int
uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
{
struct swapdev *sdp;
struct swappri *spp;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* no swap devices configured yet? definite failure.
*/
if (uvmexp.nswapdev < 1)
return 0;
/*
* XXXJAK: BEGIN HACK
*
* blist_alloc() in subr_blist.c will panic if we try to allocate
* too many slots.
*/
if (*nslots > BLIST_MAX_ALLOC) {
if (__predict_false(lessok == false))
return 0;
*nslots = BLIST_MAX_ALLOC;
}
/* XXXJAK: END HACK */
/*
* lock data lock, convert slots into blocks, and enter loop
*/
mutex_enter(&uvm_swap_data_lock);
ReTry: /* XXXMRG */
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
uint64_t result;
/* if it's not enabled, then we can't swap from it */
if ((sdp->swd_flags & SWF_ENABLE) == 0)
continue;
if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
continue;
result = blist_alloc(sdp->swd_blist, *nslots);
if (result == BLIST_NONE) {
continue;
}
KASSERT(result < sdp->swd_drumsize);
/*
* successful allocation! now rotate the tailq.
*/
TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
sdp->swd_npginuse += *nslots;
uvmexp.swpginuse += *nslots;
mutex_exit(&uvm_swap_data_lock);
/* done! return drum slot number */
UVMHIST_LOG(pdhist,
"success! returning %jd slots starting at %jd",
*nslots, result + sdp->swd_drumoffset, 0, 0);
return (result + sdp->swd_drumoffset);
}
}
/* XXXMRG: BEGIN HACK */
if (*nslots > 1 && lessok) {
*nslots = 1;
/* XXXMRG: ugh! blist should support this for us */
goto ReTry;
}
/* XXXMRG: END HACK */
mutex_exit(&uvm_swap_data_lock);
return 0;
}
/*
* uvm_swapisfull: return true if most of available swap is allocated
* and in use. we don't count some small portion as it may be inaccessible
* to us at any given moment, for example if there is lock contention or if
* pages are busy.
*/
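/*
* worked example (illustrative): with the default uvm_swapisfull_factor
* of 99, the check below reports "full" once swpgonly * 100 / 99 reaches
* swpgavail; e.g. swpgavail = 1000 and swpgonly = 990 gives
* 990 * 100 / 99 = 1000 >= 1000, while swpgonly = 989 yields 998 and
* does not.
*/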
bool
uvm_swapisfull(void)
{
int swpgonly;
bool rv;
if (uvmexp.swpages == 0) {
return true;
}
mutex_enter(&uvm_swap_data_lock);
KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
uvm_swapisfull_factor);
rv = (swpgonly >= uvmexp.swpgavail);
mutex_exit(&uvm_swap_data_lock);
return (rv);
}
/*
* uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
*
* => we lock uvm_swap_data_lock
*/
void
uvm_swap_markbad(int startslot, int nslots)
{
struct swapdev *sdp;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
KASSERT(sdp != NULL);
/*
* we just keep track of how many pages have been marked bad
* in this device, to make everything add up in swap_off().
* we assume here that the range of slots will all be within
* one swap device.
*/
KASSERT(uvmexp.swpgonly >= nslots);
atomic_add_int(&uvmexp.swpgonly, -nslots);
sdp->swd_npgbad += nslots;
UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0);
mutex_exit(&uvm_swap_data_lock);
}
/*
* uvm_swap_free: free swap slots
*
* => this can be all or part of an allocation made by uvm_swap_alloc
* => we lock uvm_swap_data_lock
*/
void
uvm_swap_free(int startslot, int nslots)
{
struct swapdev *sdp;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots,
startslot, 0, 0);
/*
* ignore attempts to free the "bad" slot.
*/
if (startslot == SWSLOT_BAD) {
return;
}
/*
* convert drum slot offset back to sdp, free the blocks
* in the extent, and return. must hold pri lock to do
* lookup and access the extent.
*/
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
KASSERT(uvmexp.nswapdev >= 1);
KASSERT(sdp != NULL);
KASSERT(sdp->swd_npginuse >= nslots);
blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
sdp->swd_npginuse -= nslots;
uvmexp.swpginuse -= nslots;
mutex_exit(&uvm_swap_data_lock);
}
/*
* uvm_swap_put: put any number of pages into a contig place on swap
*
* => can be sync or async
*/
int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
int error;
error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
return error;
}
/*
* uvm_swap_get: get a single page from swap
*
* => usually a sync op (from fault)
*/
int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
int error;
atomic_inc_uint(&uvmexp.nswget);
KASSERT(flags & PGO_SYNCIO);
if (swslot == SWSLOT_BAD) {
return EIO;
}
error = uvm_swap_io(&page, swslot, 1, B_READ |
((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
if (error == 0) {
/*
* this page is no longer only in swap.
*/
KASSERT(uvmexp.swpgonly > 0);
atomic_dec_uint(&uvmexp.swpgonly);
}
return error;
}
/*
* uvm_swap_io: do an i/o operation to swap
*/
static int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
daddr_t startblk;
struct buf *bp;
vaddr_t kva;
int error, mapinflags;
bool write, async, swap_encrypt;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%#jx",
startslot, npages, flags, 0);
write = (flags & B_READ) == 0;
async = (flags & B_ASYNC) != 0;
swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt);
/*
* allocate a buf for the i/o.
*/
KASSERT(curlwp != uvm.pagedaemon_lwp || write);
KASSERT(curlwp != uvm.pagedaemon_lwp || async);
bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
if (bp == NULL) {
uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
return ENOMEM;
}
/*
* convert starting drum slot to block number
*/
startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
/*
* first, map the pages into the kernel.
*/
mapinflags = !write ?
UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
if (write && swap_encrypt) /* need to encrypt in-place */
mapinflags |= UVMPAGER_MAPIN_READ;
kva = uvm_pagermapin(pps, npages, mapinflags);
/*
* encrypt writes in place if requested
*/
if (write) do {
struct swapdev *sdp;
int i;
/*
* Get the swapdev so we can discriminate on the
* encryption state. There may or may not be an
* encryption key generated; we may or may not be asked
* to encrypt swap.
*
* 1. NO KEY, NO ENCRYPTION: Nothing to do.
*
* 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt,
* and mark the slots encrypted.
*
* 3. KEY, BUT NO ENCRYPTION: The slots may already be
* marked encrypted from a past life. Mark them not
* encrypted.
*
* 4. KEY, ENCRYPTION: Encrypt and mark the slots
* encrypted.
*/
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
if (!sdp->swd_encinit) {
if (!swap_encrypt) {
mutex_exit(&uvm_swap_data_lock);
break;
}
uvm_swap_genkey(sdp);
}
KASSERT(sdp->swd_encinit);
mutex_exit(&uvm_swap_data_lock);
for (i = 0; i < npages; i++) {
int s = startslot + i;
KDASSERT(swapdrum_sdp_is(s, sdp));
KASSERT(s >= sdp->swd_drumoffset);
s -= sdp->swd_drumoffset;
KASSERT(s < sdp->swd_drumsize);
if (swap_encrypt) {
uvm_swap_encryptpage(sdp,
(void *)(kva + (vsize_t)i*PAGE_SIZE), s);
atomic_or_32(&sdp->swd_encmap[s/32],
__BIT(s%32));
} else {
atomic_and_32(&sdp->swd_encmap[s/32],
~__BIT(s%32));
}
}
} while (0);
/*
* fill in the bp/sbp. we currently route our i/o through
* /dev/drum's vnode [swapdev_vp].
*/
bp->b_cflags = BC_BUSY | BC_NOCACHE;
bp->b_flags = (flags & (B_READ|B_ASYNC));
bp->b_proc = &proc0; /* XXX */
bp->b_vnbufs.le_next = NOLIST;
bp->b_data = (void *)kva;
bp->b_blkno = startblk;
bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
/*
* bump v_numoutput (counter of number of active outputs).
*/
if (write) {
mutex_enter(swapdev_vp->v_interlock);
swapdev_vp->v_numoutput++;
mutex_exit(swapdev_vp->v_interlock);
}
/*
* for async ops we must set up the iodone handler.
*/
if (async) {
bp->b_iodone = uvm_aio_aiodone;
UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
if (curlwp == uvm.pagedaemon_lwp)
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
else
BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
} else {
bp->b_iodone = NULL;
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
}
UVMHIST_LOG(pdhist,
"about to start io: data = %#jx blkno = %#jx, bcount = %jd",
(uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);
/*
* now we start the I/O, and if async, return.
*/
VOP_STRATEGY(swapdev_vp, bp);
if (async) {
/*
* Reads are always synchronous; if this changes, we
* need to add an asynchronous path for decryption.
*/
KASSERT(write);
return 0;
}
/*
* must be sync i/o. wait for it to finish
*/
error = biowait(bp);
if (error)
goto out;
/*
* decrypt reads in place if needed
*/
if (!write) do {
struct swapdev *sdp;
bool encinit;
int i;
/*
* Get the sdp. Everything about it except the encinit
* bit, saying whether the encryption key is
* initialized or not, and the encrypted bit for each
* page, is stable until all swap pages have been
* released and the device is removed.
*/
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
encinit = sdp->swd_encinit;
mutex_exit(&uvm_swap_data_lock);
if (!encinit)
/*
* If there's no encryption key, there's no way
* any of these slots can be encrypted, so
* nothing to do here.
*/
break;
for (i = 0; i < npages; i++) {
int s = startslot + i;
KDASSERT(swapdrum_sdp_is(s, sdp));
KASSERT(s >= sdp->swd_drumoffset);
s -= sdp->swd_drumoffset;
KASSERT(s < sdp->swd_drumsize);
if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) &
__BIT(s%32)) == 0)
continue;
uvm_swap_decryptpage(sdp,
(void *)(kva + (vsize_t)i*PAGE_SIZE), s);
}
} while (0);
out:
/*
* kill the pager mapping
*/
uvm_pagermapout(kva, npages);
/*
* now dispose of the buf and we're done.
*/
if (write) {
mutex_enter(swapdev_vp->v_interlock);
vwakeup(bp);
mutex_exit(swapdev_vp->v_interlock);
}
putiobuf(bp);
UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0);
return (error);
}
/*
* uvm_swap_genkey(sdp)
*
* Generate a key for swap encryption.
*/
static void
uvm_swap_genkey(struct swapdev *sdp)
{
uint8_t key[32];
KASSERT(!sdp->swd_encinit);
cprng_strong(kern_cprng, key, sizeof key, 0);
aes_setenckey256(&sdp->swd_enckey, key);
aes_setdeckey256(&sdp->swd_deckey, key);
explicit_memset(key, 0, sizeof key);
sdp->swd_encinit = true;
}
/*
* uvm_swap_encryptpage(sdp, kva, slot)
*
* Encrypt one page of data at kva for the specified slot number
* in the swap device.
*/
static void
uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot)
{
uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
/* iv := AES_k(le32enc(slot) || 0^96) */
le32enc(preiv, slot);
aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
/* *kva := AES-CBC_k(iv, *kva) */
aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv,
AES_256_NROUNDS);
explicit_memset(&iv, 0, sizeof iv);
}
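/*
* illustrative: for slot 5 the preiv above is 05 00 00 00 followed by
* twelve zero bytes; encrypting that block with the per-device key
* yields a slot-unique IV, so identical page contents written to
* different slots still produce different ciphertext.
*/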
/*
* uvm_swap_decryptpage(sdp, kva, slot)
*
* Decrypt one page of data at kva for the specified slot number
* in the swap device.
*/
static void
uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot)
{
uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
/* iv := AES_k(le32enc(slot) || 0^96) */
le32enc(preiv, slot);
aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
/* *kva := AES-CBC^{-1}_k(iv, *kva) */
aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv,
AES_256_NROUNDS);
explicit_memset(&iv, 0, sizeof iv);
}
SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt",
SYSCTL_DESCR("Encrypt data when swapped out to disk"),
NULL, 0, &uvm_swap_encrypt, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
}
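/*
* illustrative usage: the node created above appears as vm.swap_encrypt
* and can be flipped at run time, e.g. "sysctl -w vm.swap_encrypt=1";
* because the encrypted bit is tracked per slot, only pages written out
* after the change are affected.
*/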
/* $NetBSD: bufq_disksort.c,v 1.14 2017/05/04 11:03:27 kamil Exp $ */
/* NetBSD: subr_disk.c,v 1.61 2004/09/25 03:30:44 thorpej Exp */
/*-
* Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bufq_disksort.c,v 1.14 2017/05/04 11:03:27 kamil Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/module.h>
/*
* Seek sort for disks.
*
* There are actually two queues, sorted in ascending order.  The first
* queue holds those requests which are positioned after the current block;
* the second holds requests which came in after their position was passed.
* Thus we implement a one-way scan, retracting after reaching the end of
* the drive to the first request on the second queue, at which time it
* becomes the first queue.
*
* A one-way scan is natural because of the way UNIX read-ahead blocks are
* allocated.
*/
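/*
 * Illustrative example (not from the original sources): with the head
 * positioned past block 100 and the queue holding blocks 120, 180, 40,
 * 70 in that order, blocks 120 and 180 form the first queue and the
 * inversion at 180 -> 40 marks the start of the second queue (40, 70).
 * A new request for block 150 sorts between 120 and 180; one for block
 * 60 sorts between 40 and 70 in the second queue.
 */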
struct bufq_disksort {
TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */
};
static void bufq_disksort_init(struct bufq_state *);
static void bufq_disksort_put(struct bufq_state *, struct buf *);
static struct buf *bufq_disksort_get(struct bufq_state *, int);
BUFQ_DEFINE(disksort, 20, bufq_disksort_init);
static void
bufq_disksort_put(struct bufq_state *bufq, struct buf *bp)
{
struct bufq_disksort *disksort = bufq_private(bufq);
struct buf *bq, *nbq;
int sortby;
sortby = bufq->bq_flags & BUFQ_SORT_MASK;
bq = TAILQ_FIRST(&disksort->bq_head);
/*
* If the queue is empty it's easy; we just go on the end.
*/
if (bq == NULL) {
TAILQ_INSERT_TAIL(&disksort->bq_head, bp, b_actq);
return;
}
/*
* If we lie before the currently active request, then we
* must locate the second request list and add ourselves to it.
*/
if (buf_inorder(bp, bq, sortby)) {
while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
/*
* Check for an ``inversion'' in the normally ascending
* block numbers, indicating the start of the second
* request list.
*/
if (buf_inorder(nbq, bq, sortby)) {
/*
* Search the second request list for the first
* request at a larger block number. We go
* after that; if there is no such request, we
* go at the end.
*/
do {
if (buf_inorder(bp, nbq, sortby))
goto insert;
bq = nbq;
} while ((nbq =
TAILQ_NEXT(bq, b_actq)) != NULL);
goto insert; /* after last */
}
bq = nbq;
}
/*
* No inversions... we will go after the last, and
* be the first request in the second request list.
*/
goto insert;
}
/*
* Request is at/after the current request...
* sort in the first request list.
*/
while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
/*
* We want to go after the current request if there is an
* inversion after it (i.e. it is the end of the first
* request list), or if the next request is a larger cylinder
* than our request.
*/
if (buf_inorder(nbq, bq, sortby) ||
buf_inorder(bp, nbq, sortby))
goto insert;
bq = nbq;
}
/*
* Neither a second list nor a larger request... we go at the end of
* the first list, which is the same as the end of the whole shebang.
*/
insert: TAILQ_INSERT_AFTER(&disksort->bq_head, bq, bp, b_actq);
}
static struct buf *
bufq_disksort_get(struct bufq_state *bufq, int remove)
{
struct bufq_disksort *disksort = bufq_private(bufq);
struct buf *bp;
bp = TAILQ_FIRST(&disksort->bq_head);
if (bp != NULL && remove)
	TAILQ_REMOVE(&disksort->bq_head, bp, b_actq);
return (bp);
}
static struct buf *
bufq_disksort_cancel(struct bufq_state *bufq, struct buf *buf)
{
struct bufq_disksort *disksort = bufq_private(bufq);
struct buf *bq;
TAILQ_FOREACH(bq, &disksort->bq_head, b_actq) {
if (bq == buf) {
TAILQ_REMOVE(&disksort->bq_head, bq, b_actq);
return buf;
}
}
return NULL;
}
static void
bufq_disksort_fini(struct bufq_state *bufq)
{

	KASSERT(bufq->bq_private != NULL);
kmem_free(bufq->bq_private, sizeof(struct bufq_disksort));
}
static void
bufq_disksort_init(struct bufq_state *bufq)
{
struct bufq_disksort *disksort;
disksort = kmem_zalloc(sizeof(*disksort), KM_SLEEP);
bufq->bq_private = disksort;
bufq->bq_get = bufq_disksort_get;
bufq->bq_put = bufq_disksort_put;
bufq->bq_cancel = bufq_disksort_cancel;
bufq->bq_fini = bufq_disksort_fini;
TAILQ_INIT(&disksort->bq_head);
}
MODULE(MODULE_CLASS_BUFQ, bufq_disksort, NULL);
static int
bufq_disksort_modcmd(modcmd_t cmd, void *opaque)
{
switch (cmd) {
case MODULE_CMD_INIT:
return bufq_register(&bufq_strat_disksort);
case MODULE_CMD_FINI:
return bufq_unregister(&bufq_strat_disksort);
default:
return ENOTTY;
}
}
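#if 0
/*
 * Hedged usage sketch (not part of this file): how a disk driver might
 * select the "disksort" strategy through the bufq(9) interface that the
 * module above registers.  Driver glue is omitted and the whole block
 * is illustrative only.
 */
static int
example_disksort_usage(struct buf *bp)
{
	struct bufq_state *bufq;
	struct buf *next;
	int error;

	/* Create a queue sorted by raw block number, served by disksort. */
	error = bufq_alloc(&bufq, "disksort", BUFQ_SORT_RAWBLOCK);
	if (error != 0)
		return error;

	bufq_put(bufq, bp);	/* sorted insert via bufq_disksort_put() */
	next = bufq_get(bufq);	/* head of the one-way scan */
	(void)next;

	bufq_free(bufq);
	return 0;
}
#endif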
/* $NetBSD: kern_physio.c,v 1.102 2022/07/10 23:11:55 riastradh Exp $ */
/*-
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_physio.c 8.1 (Berkeley) 6/10/93
*/
/*-
* Copyright (c) 1994 Christopher G. Demetriou
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_physio.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.102 2022/07/10 23:11:55 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/once.h>
#include <sys/workqueue.h>
#include <sys/kmem.h>
#include <uvm/uvm_extern.h>
ONCE_DECL(physio_initialized);
struct workqueue *physio_workqueue;
int physio_concurrency = 16;
/* #define PHYSIO_DEBUG */
#if defined(PHYSIO_DEBUG)
#define DPRINTF(a) printf a
#else /* defined(PHYSIO_DEBUG) */
#define DPRINTF(a) /* nothing */
#endif /* defined(PHYSIO_DEBUG) */
struct physio_stat {
int ps_running;
int ps_error;
int ps_failed;
off_t ps_endoffset;
size_t ps_resid;
buf_t *ps_orig_bp;
kmutex_t ps_lock;
kcondvar_t ps_cv;
};
static void
physio_done(struct work *wk, void *dummy)
{
struct buf *bp = (void *)wk;
size_t todo = bp->b_bufsize;
size_t done = bp->b_bcount - bp->b_resid;
struct physio_stat *ps = bp->b_private;
bool is_iobuf;
KASSERT(&bp->b_work == wk);
KASSERT(bp->b_bcount <= todo);
KASSERT(bp->b_resid <= bp->b_bcount);
KASSERT((bp->b_flags & B_PHYS) != 0);
KASSERT(dummy == NULL);
vunmapbuf(bp, todo);
uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);
mutex_enter(&ps->ps_lock);
is_iobuf = (bp != ps->ps_orig_bp);
if (__predict_false(done != todo)) {
off_t endoffset = dbtob(bp->b_blkno) + done;
/*
* We got an error or hit EOM.
*
* We only care about the first one,
* i.e. the one at the lowest offset.
*/
KASSERT(ps->ps_endoffset != endoffset);
DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
__func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
bp->b_blkno, bp->b_bcount, bp->b_flags));
if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) {
DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64
" -> %" PRIu64 "\n",
__func__, ps,
ps->ps_error, bp->b_error,
ps->ps_endoffset, endoffset));
ps->ps_endoffset = endoffset;
ps->ps_error = bp->b_error;
}
ps->ps_failed++;
ps->ps_resid += todo - done;
} else {
KASSERT(bp->b_error == 0);
}
ps->ps_running--;
cv_signal(&ps->ps_cv);
mutex_exit(&ps->ps_lock);
if (is_iobuf)
putiobuf(bp);
}
static void
physio_biodone(struct buf *bp)
{
#if defined(DIAGNOSTIC)
struct physio_stat *ps = bp->b_private;
size_t todo = bp->b_bufsize;
size_t done = bp->b_bcount - bp->b_resid;
KASSERT(ps->ps_running > 0);
KASSERT(bp->b_bcount <= todo);
KASSERT(bp->b_resid <= bp->b_bcount);
if (done == todo)
	KASSERTMSG(bp->b_error == 0, "error=%d", bp->b_error);
#endif /* defined(DIAGNOSTIC) */
workqueue_enqueue(physio_workqueue, &bp->b_work, NULL);
}
static void
physio_wait(struct physio_stat *ps, int n)
{
KASSERT(mutex_owned(&ps->ps_lock));

while (ps->ps_running > n)
cv_wait(&ps->ps_cv, &ps->ps_lock);
}
static int
physio_init(void)
{
int error;
KASSERT(physio_workqueue == NULL);
error = workqueue_create(&physio_workqueue, "physiod",
physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE);
return error;
}
/*
* Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly
* from the raw device to user buffers, and bypasses the buffer cache.
*/
int
physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
void (*min_phys)(struct buf *), struct uio *uio)
{
struct iovec *iovp;
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
int i, error;
struct buf *bp = NULL;
struct physio_stat *ps;
int concurrency = physio_concurrency - 1;
int isdisk;
error = RUN_ONCE(&physio_initialized, physio_init);
if (__predict_false(error != 0)) {
return error;
}
DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
__func__, uio->uio_offset, uio->uio_resid));
flags &= B_READ | B_WRITE;
ps = kmem_zalloc(sizeof(*ps), KM_SLEEP);
/* ps->ps_running = 0; */
/* ps->ps_error = 0; */
/* ps->ps_failed = 0; */
ps->ps_orig_bp = obp;
ps->ps_endoffset = -1;
ps->ps_resid = 0;
mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&ps->ps_cv, "physio");
/* Allow concurrent I/O only for disks */
isdisk = cdev_type(dev) == D_DISK;
if (!isdisk)
concurrency = 0;
/* Make sure we have a buffer, creating one if necessary. */
if (obp != NULL) {
mutex_enter(&bufcache_lock);
/* Mark it busy, so nobody else will use it. */
while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH)
;
mutex_exit(&bufcache_lock);
concurrency = 0; /* see "XXXkludge" comment below */
}
for (i = 0; i < uio->uio_iovcnt; i++) {
bool sync = true;
iovp = &uio->uio_iov[i];
while (iovp->iov_len > 0) {
size_t todo;
vaddr_t endp;
mutex_enter(&ps->ps_lock);
if (ps->ps_failed != 0) {
goto done_locked;
}
physio_wait(ps, sync ? 0 : concurrency);
mutex_exit(&ps->ps_lock);
if (obp != NULL) {
/*
* XXXkludge
* some drivers use "obp" as an identifier.
*/
bp = obp;
} else {
bp = getiobuf(NULL, true);
bp->b_cflags |= BC_BUSY;
}
bp->b_dev = dev;
bp->b_proc = p;
bp->b_private = ps;
/*
* Mark the buffer busy for physical I/O.  Also set
* B_PHYS because it's an I/O to user memory, and
* B_RAW because B_RAW is to be "set by physio for
* raw transfers".
*/
bp->b_oflags = 0;
bp->b_cflags |= BC_BUSY;
bp->b_flags = flags | B_PHYS | B_RAW;
bp->b_iodone = physio_biodone;
/* Set up the buffer for a maximum-sized transfer. */
bp->b_blkno = btodb(uio->uio_offset);
if (isdisk) {
/*
* For disks, check that offsets are at least block
* aligned; the block addresses are used to track
* errors of finished requests.
*/
if (uio->uio_offset & (DEV_BSIZE - 1)) {
error = EINVAL;
goto done;
}
/*
* Split request into MAXPHYS chunks
*/
bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
} else {
bp->b_bcount = MIN(INT_MAX, iovp->iov_len);
}
bp->b_data = iovp->iov_base;
/*
* Call minphys to bound the transfer size,
* and remember the amount of data to transfer,
* for later comparison.
*/
(*min_phys)(bp);
todo = bp->b_bufsize = bp->b_bcount;
#if defined(DIAGNOSTIC)
if (todo > MAXPHYS)
panic("todo(%zu) > MAXPHYS; minphys broken",
todo);
#endif /* defined(DIAGNOSTIC) */
sync = false;
endp = (vaddr_t)bp->b_data + todo;
if (trunc_page(endp) != endp) {
/*
* Following requests can overlap.
* Note that uvm_vslock() does round_page.
*/
sync = true;
}
/*
* Lock the part of the user address space involved
* in the transfer.
*/
error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
(flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ);
if (error) {
goto done;
}
/*
* Beware vmapbuf(); if successful it clobbers
* b_data and saves it in b_saveaddr.
* However, vunmapbuf() restores b_data.
*/
if ((error = vmapbuf(bp, todo)) != 0) {
uvm_vsunlock(p->p_vmspace, bp->b_data, todo);
goto done;
}
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
mutex_enter(&ps->ps_lock);
ps->ps_running++;
mutex_exit(&ps->ps_lock);
/* Call strategy to start the transfer. */
(*strategy)(bp);
bp = NULL;
iovp->iov_len -= todo;
iovp->iov_base = (char *)iovp->iov_base + todo;
uio->uio_offset += todo;
uio->uio_resid -= todo;
}
}
done:
mutex_enter(&ps->ps_lock);
done_locked:
physio_wait(ps, 0);
mutex_exit(&ps->ps_lock);
KASSERT(ps->ps_failed || ps->ps_endoffset == -1);
/*
* Compute residual, for disks adjust for the
* lowest numbered block that returned an error.
*/
if (isdisk) {
if (ps->ps_failed != 0) {
off_t delta;
delta = uio->uio_offset - ps->ps_endoffset;
KASSERT(delta > 0);
uio->uio_resid += delta;
/* uio->uio_offset = ps->ps_endoffset; */
}
} else {
uio->uio_resid += ps->ps_resid;
}
if (bp != NULL && bp != obp) {
	putiobuf(bp);
}
if (error == 0) {
	error = ps->ps_error;
}
mutex_destroy(&ps->ps_lock);
cv_destroy(&ps->ps_cv);
kmem_free(ps, sizeof(*ps));
/*
* Clean up the state of the buffer. Remember if somebody wants
* it, so we can wake them up below. Also, if we had to steal it,
* give it back.
*/
if (obp != NULL) {
	KASSERT((obp->b_cflags & BC_BUSY) != 0);
/*
* If another process is waiting for the raw I/O buffer,
* wake up processes waiting to do physical I/O.
*/
mutex_enter(&bufcache_lock);
obp->b_cflags &= ~(BC_BUSY | BC_WANTED);
obp->b_flags &= ~(B_PHYS | B_RAW);
obp->b_iodone = NULL;
cv_broadcast(&obp->b_busy);
mutex_exit(&bufcache_lock);
}
DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
__func__, uio->uio_offset, uio->uio_resid));
return error;
}
/*
* A minphys() routine is called by physio() to adjust the size of each
* I/O transfer before the latter is passed to the strategy routine.
*
* This minphys() is a default that must be called to enforce limits
* that are applicable to all devices, because of limitations in the
* kernel or the hardware platform.
*/
void
minphys(struct buf *bp)
{

	if (bp->b_bcount > MAXPHYS)
		bp->b_bcount = MAXPHYS;
}
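#if 0
/*
 * Hedged usage sketch (not part of this file): the conventional way a
 * character-device read entry point hands raw I/O to physio(), letting
 * the default minphys() above bound each transfer.  "examplestrategy"
 * is a hypothetical driver strategy routine named only for illustration.
 */
static int
example_rawread(dev_t dev, struct uio *uio, int ioflag)
{

	return physio(examplestrategy, NULL, dev, B_READ, minphys, uio);
}
#endif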
/* $NetBSD: tcp_congctl.c,v 1.28 2021/07/31 20:29:37 andvar Exp $ */
/*-
* Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
* This code is derived from software contributed to The NetBSD Foundation
* by Rui Paulo.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.28 2021/07/31 20:29:37 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_tcp_debug.h"
#include "opt_tcp_congctl.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_congctl.h>
#ifdef TCP_DEBUG
#include <netinet/tcp_debug.h>
#endif
/*
* TODO:
* consider moving the actual implementations into a separate file.
*/
static void tcp_common_congestion_exp(struct tcpcb *, int, int);
static int tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static int tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_slow_retransmit(struct tcpcb *);
static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
const struct tcphdr *);
static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_congestion_exp(struct tcpcb *tp);
static int tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
const struct tcphdr *);
static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);
static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_congestion_exp(struct tcpcb *);
static void tcp_congctl_fillnames(void);
extern int tcprexmtthresh;
MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");
/* currently selected global congestion control */
char tcp_congctl_global_name[TCPCC_MAXLEN];
/* available global congestion control algorithms */
char tcp_congctl_avail[10 * TCPCC_MAXLEN];
/*
* Used to list the available congestion control algorithms.
*/
TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
TAILQ_HEAD_INITIALIZER(tcp_congctlhd);
static struct tcp_congctlent * tcp_congctl_global;
static kmutex_t tcp_congctl_mtx;
void
tcp_congctl_init(void)
{
int r __diagused;
mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);
/* Base algorithms. */
r = tcp_congctl_register("reno", &tcp_reno_ctl);
KASSERT(r == 0);
r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
KASSERT(r == 0);
r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
KASSERT(r == 0);
/* NewReno is the default. */
#ifndef TCP_CONGCTL_DEFAULT
#define TCP_CONGCTL_DEFAULT "newreno"
#endif
r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
KASSERT(r == 0);
}
/*
* Register a congestion algorithm and select it if we have none.
*/
int
tcp_congctl_register(const char *name, const struct tcp_congctl *tcc)
{
struct tcp_congctlent *ntcc, *tccp;
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
if (!strcmp(name, tccp->congctl_name)) {
/* name already registered */
return EEXIST;
}
ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO);
strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1);
ntcc->congctl_ctl = tcc;
TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
tcp_congctl_fillnames();
if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
tcp_congctl_select(NULL, name);
return 0;
}
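#if 0
/*
 * Hedged sketch (not part of this file): how an additional congestion
 * control algorithm could plug into the registry above.  "example_ctl"
 * is an assumption for illustration; it simply reuses the Reno handlers
 * declared earlier in this file.
 */
static const struct tcp_congctl example_ctl = {
	.fast_retransmit = tcp_reno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
	.newack = tcp_reno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

static void
example_congctl_attach(void)
{
	int error;

	error = tcp_congctl_register("example", &example_ctl);
	if (error == EEXIST)
		printf("example: congestion control already registered\n");
}
#endif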
int
tcp_congctl_unregister(const char *name)
{
struct tcp_congctlent *tccp, *rtccp;
unsigned int size;
rtccp = NULL;
size = 0;
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
if (!strcmp(name, tccp->congctl_name))
rtccp = tccp;
size++;
}
if (!rtccp)
return ENOENT;
if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt)
return EBUSY;
TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
free(rtccp, M_TCPCONGCTL);
tcp_congctl_fillnames();
return 0;
}
/*
* Select a congestion algorithm by name.
*/
int
tcp_congctl_select(struct tcpcb *tp, const char *name)
{
struct tcp_congctlent *tccp, *old_tccp, *new_tccp;
bool old_found, new_found;
KASSERT(name);

old_found = (tp == NULL || tp->t_congctl == NULL);
old_tccp = NULL;
new_found = false;
new_tccp = NULL;
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
	if (!old_found && tccp->congctl_ctl == tp->t_congctl) {
old_tccp = tccp;
old_found = true;
}
if (!new_found && !strcmp(name, tccp->congctl_name)) {
new_tccp = tccp;
new_found = true;
}
if (new_found && old_found) {
if (tp) {
mutex_enter(&tcp_congctl_mtx);
if (old_tccp)
	old_tccp->congctl_refcnt--;
tp->t_congctl = new_tccp->congctl_ctl;
new_tccp->congctl_refcnt++;
mutex_exit(&tcp_congctl_mtx);
} else {
tcp_congctl_global = new_tccp;
strlcpy(tcp_congctl_global_name,
new_tccp->congctl_name,
sizeof(tcp_congctl_global_name) - 1);
}
return 0;
}
}
return EINVAL;
}
void
tcp_congctl_release(struct tcpcb *tp)
{
struct tcp_congctlent *tccp;
KASSERT(tp->t_congctl);

TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
if (tccp->congctl_ctl == tp->t_congctl) {
tccp->congctl_refcnt--;
return;
}
}
}
/*
* Returns the name of a congestion algorithm.
*/
const char *
tcp_congctl_bystruct(const struct tcp_congctl *tcc)
{
struct tcp_congctlent *tccp;
KASSERT(tcc);
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
if (tccp->congctl_ctl == tcc)
return tccp->congctl_name;
return NULL;
}
static void
tcp_congctl_fillnames(void)
{
struct tcp_congctlent *tccp;
const char *delim = " ";
tcp_congctl_avail[0] = '\0';
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
strlcat(tcp_congctl_avail, tccp->congctl_name,
sizeof(tcp_congctl_avail) - 1);
if (TAILQ_NEXT(tccp, congctl_ent))
strlcat(tcp_congctl_avail, delim,
sizeof(tcp_congctl_avail) - 1);
}
}
/* ------------------------------------------------------------------------ */
/*
* Common stuff
*/
/* Window reduction (1-beta) for [New]Reno: 0.5 */
#define RENO_BETAA 1
#define RENO_BETAB 2
/* Window reduction (1-beta) for Cubic: 0.8 */
#define CUBIC_BETAA 4
#define CUBIC_BETAB 5
/* Draft Rhee Section 4.1 */
#define CUBIC_CA 4
#define CUBIC_CB 10
static void
tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
{
u_long win;
/*
* Reduce the congestion window and the slow start threshold.
*/
win = ulmin(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
if (win < 2)
win = 2;
tp->snd_ssthresh = win * tp->t_segsz;
tp->snd_recover = tp->snd_max;
tp->snd_cwnd = tp->snd_ssthresh;
/*
* When using TCP ECN, notify the peer that
* we reduced the cwnd.
*/
if (TCP_ECN_ALLOWED(tp))
tp->t_flags |= TF_ECN_SND_CWR;
}
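/*
 * Worked example (illustrative numbers, not from the original sources):
 * with snd_wnd = 65535, snd_cwnd = 29200 and t_segsz = 1460, a
 * Reno-style reduction (betaa/betab = 1/2) gives
 * win = 29200 * 1 / 2 / 1460 = 10, so snd_ssthresh and snd_cwnd both
 * become 10 * 1460 = 14600: half the old window, rounded down to a
 * whole number of segments.
 */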
/* ------------------------------------------------------------------------ */
/*
* TCP/Reno congestion control.
*/
static void
tcp_reno_congestion_exp(struct tcpcb *tp)
{
tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
}
static int
tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* Dup acks mean that packets have left the
* network (they're now cached at the receiver)
* so bump cwnd by the amount in the receiver
* to keep a constant cwnd packets in the
* network.
*
* If we are using TCP/SACK, then enter
* Fast Recovery if the receiver SACKs
* data that is tcprexmtthresh * MSS
* bytes past the last ACKed segment,
* irrespective of the number of DupAcks.
*/
tcp_seq onxt = tp->snd_nxt;
tp->t_partialacks = 0;
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
if (TCP_SACK_ENABLED(tp)) {
tp->t_dupacks = tcprexmtthresh;
tp->sack_newdata = tp->snd_nxt;
tp->snd_cwnd = tp->t_segsz;
(void) tcp_output(tp);
return 0;
}
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_segsz;
(void) tcp_output(tp);
tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
return 0;
}
static int
tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* We know we're losing at the current
* window size so do congestion avoidance
* (set ssthresh to half the current window
* and pull our congestion window back to
* the new ssthresh).
*/
tcp_reno_congestion_exp(tp);
return tcp_reno_do_fast_retransmit(tp, th);
}
static void
tcp_reno_slow_retransmit(struct tcpcb *tp)
{
u_long win;
/*
* Close the congestion window down to one segment
* (we'll open it by one segment for each ack we get).
* Since we probably have a window's worth of unacked
* data accumulated, this "slow start" keeps us from
* dumping all that data as back-to-back packets (which
* might overwhelm an intermediate gateway).
*
* There are two phases to the opening: Initially we
* open by one mss on each ack. This makes the window
* size increase exponentially with time. If the
* window is larger than the path can handle, this
* exponential growth results in dropped packet(s)
* almost immediately. To get more time between
* drops but still "push" the network to take advantage
* of improving conditions, we switch from exponential
* to linear window opening at some threshold size.
* For a threshold, we use half the current window
* size, truncated to a multiple of the mss.
*
* (the minimum cwnd that will give us exponential
* growth is 2 mss. We don't allow the threshold
* to go below this.)
*/
win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
if (win < 2)
win = 2;
/* Loss Window MUST be one segment. */
tp->snd_cwnd = tp->t_segsz;
tp->snd_ssthresh = win * tp->t_segsz;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
if (TCP_ECN_ALLOWED(tp))
tp->t_flags |= TF_ECN_SND_CWR;
}
static void
tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
const struct tcphdr *th)
{
if (tp->t_partialacks < 0) {
/*
* We were not in fast recovery. Reset the duplicate ack
* counter.
*/
tp->t_dupacks = 0;
} else {
/*
* Clamp the congestion window to the crossover point and
* exit fast recovery.
*/
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
tp->snd_fack = th->th_ack;
}
}
static void
tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* When new data is acked, open the congestion window.
*/
u_int cw = tp->snd_cwnd;
u_int incr = tp->t_segsz;
if (tcp_do_abc) {
/*
* RFC 3465 Appropriate Byte Counting (ABC)
*/
int acked = th->th_ack - tp->snd_una;
if (cw >= tp->snd_ssthresh) {
tp->t_bytes_acked += acked;
if (tp->t_bytes_acked >= cw) {
/* Time to increase the window. */
tp->t_bytes_acked -= cw;
} else {
/* No need to increase yet. */
incr = 0;
}
} else {
/*
* use 2*SMSS or 1*SMSS for the "L" param,
* depending on sysctl setting.
*
* (See RFC 3465 2.3 Choosing the Limit)
*/
u_int abc_lim;
abc_lim = (tcp_abc_aggressive == 0 ||
tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
incr = uimin(acked, abc_lim);
}
} else {
/*
* If the window gives us less than ssthresh packets
* in flight, open exponentially (segsz per packet).
* Otherwise open linearly: segsz per window
* (segsz^2 / cwnd per packet).
*/
if (cw >= tp->snd_ssthresh) {
incr = incr * incr / cw;
}
}
tp->snd_cwnd = uimin(cw + incr, TCP_MAXWIN << tp->snd_scale);
}
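/*
 * Worked example (illustrative numbers, not from the original sources):
 * in classic congestion avoidance with t_segsz = 1460 and
 * snd_cwnd = 14600, each new ack adds incr = 1460 * 1460 / 14600 = 146
 * bytes, so about one window's worth of acks grows cwnd by one segment.
 * With ABC enabled the same one-segment increase happens once
 * t_bytes_acked reaches the current window, which gives the same rate
 * per RTT while being robust against ack division.
 */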
const struct tcp_congctl tcp_reno_ctl = {
.fast_retransmit = tcp_reno_fast_retransmit,
.slow_retransmit = tcp_reno_slow_retransmit,
.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
.newack = tcp_reno_newack,
.cong_exp = tcp_reno_congestion_exp,
};
/*
* TCP/NewReno Congestion control.
*/
static int
tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
if (SEQ_LT(th->th_ack, tp->snd_high)) {
/*
* False fast retransmit after timeout.
* Do not enter fast recovery
*/
tp->t_dupacks = 0;
return 1;
}
/*
* Fast retransmit is same as reno.
*/
return tcp_reno_fast_retransmit(tp, th);
}
/*
* Implement the NewReno response to a new ack, checking for partial acks in
* fast recovery.
*/
static void
tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
{
if (tp->t_partialacks < 0) {
/*
* We were not in fast recovery. Reset the duplicate ack
* counter.
*/
tp->t_dupacks = 0;
} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
/*
* This is a partial ack. Retransmit the first unacknowledged
* segment and deflate the congestion window by the amount of
* acknowledged data. Do not exit fast recovery.
*/
tcp_seq onxt = tp->snd_nxt;
u_long ocwnd = tp->snd_cwnd;
int sack_num_segs = 1, sack_bytes_rxmt = 0;
/*
* snd_una has not yet been updated and the socket's send
* buffer has not yet drained off the ACK'd data, so we
* have to leave snd_una as it was to get the correct data
* offset in tcp_output().
*/
tp->t_partialacks++;
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
if (TCP_SACK_ENABLED(tp)) {
/*
* Partial ack handling within a sack recovery episode.
* Keeping this very simple for now. When a partial ack
* is received, force snd_cwnd to a value that will
* allow the sender to transmit no more than 2 segments.
* If necessary, a fancier scheme can be adopted at a
* later point, but for now, the goal is to prevent the
* sender from bursting a large amount of data in the
* midst of sack recovery.
*/
/*
* Send one or two segments based on how much
* new data was acked.
*/
if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
sack_num_segs = 2;
(void)tcp_sack_output(tp, &sack_bytes_rxmt);
tp->snd_cwnd = sack_bytes_rxmt +
(tp->snd_nxt - tp->sack_newdata) +
sack_num_segs * tp->t_segsz;
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
} else {
tp->snd_nxt = th->th_ack;
/*
* Set snd_cwnd to one segment beyond ACK'd offset
* snd_una is not yet updated when we're called
*/
tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
(void) tcp_output(tp);
tp->snd_cwnd = ocwnd;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
/*
* Partial window deflation. Relies on the fact that
* tp->snd_una has not been updated yet.
*/
tp->snd_cwnd -= (th->th_ack - tp->snd_una -
tp->t_segsz);
}
} else {
/*
* Complete ack. Inflate the congestion window to ssthresh
* and exit fast recovery.
*
* Window inflation should have left us with approx.
* snd_ssthresh outstanding data. But in case we
* would be inclined to send a burst, better to do
* it via the slow start mechanism.
*/
if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
+ tp->t_segsz;
else
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
tp->snd_fack = th->th_ack;
}
}
static void
tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* If we are still in fast recovery (meaning we are using
* NewReno and we have only received partial acks), do not
* inflate the window yet.
*/
if (tp->t_partialacks < 0)
tcp_reno_newack(tp, th);
}
const struct tcp_congctl tcp_newreno_ctl = {
.fast_retransmit = tcp_newreno_fast_retransmit,
.slow_retransmit = tcp_reno_slow_retransmit,
.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
.newack = tcp_newreno_newack,
.cong_exp = tcp_reno_congestion_exp,
};
/*
* CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
*/
/* Cubic prototypes */
static void tcp_cubic_update_ctime(struct tcpcb *tp);
static uint32_t tcp_cubic_diff_ctime(struct tcpcb *);
static uint32_t tcp_cubic_cbrt(uint32_t);
static ulong tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);
/* Cubic TIME functions - XXX I don't like using timevals and microuptime */
/*
* Set congestion timer to now
*/
static void
tcp_cubic_update_ctime(struct tcpcb *tp)
{
struct timeval now_timeval;
getmicrouptime(&now_timeval);
tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
now_timeval.tv_usec / 1000;
}
/*
* Milliseconds since the last congestion event
*/
static uint32_t
tcp_cubic_diff_ctime(struct tcpcb *tp)
{
struct timeval now_timeval;
getmicrouptime(&now_timeval);
return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
tp->snd_cubic_ctime;
}
/*
* Approximate cubic root
*/
#define CBRT_ROUNDS 30
static uint32_t
tcp_cubic_cbrt(uint32_t v)
{
int i, rounds = CBRT_ROUNDS;
uint64_t x = v / 3;
/* The iteration does not produce correct results for small inputs */
if (v == 0)
return 0;
else if (v < 4)
return 1;
/*
* 2097151 is the largest x for which v + 2*x^3 still fits in 64 bits;
* above it, use the slower overflow-avoiding form at the cost of
* extra rounds.
*/
if (x > 2097151)
rounds += 10;
for (i = 0; i < rounds; i++)
if (rounds == CBRT_ROUNDS)
x = (v + 2 * x * x * x) / (3 * x * x);
else
/* Avoid overflow */
x = v / (3 * x * x) + 2 * x / 3;
return (uint32_t)x;
}
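/*
 * Worked example (illustrative numbers, not from the original sources):
 * for v = 1000 the loop starts at x = 333 and the Newton step
 * x = (v + 2*x^3) / (3*x^2) settles on the exact cube root 10 well
 * within CBRT_ROUNDS iterations; v = 3 takes the early return and
 * yields 1, which is why small inputs are special-cased.
 */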
/* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
static ulong
tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
{
uint32_t K;
long tK3;
/* Section 3.1 Eq. 2 */
K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
CUBIC_CB / CUBIC_CA);
* (t-K)^3 - it is unclear why the unit of measure matters here
tK3 = (long)(ms_elapsed + rtt) - (long)K;
tK3 = tK3 * tK3 * tK3;
return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
}
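/*
 * In real-valued terms (and setting aside the unit question noted
 * above), the fixed-point constants encode C = CUBIC_CA/CUBIC_CB = 0.4
 * and a decrease factor CUBIC_BETAA/CUBIC_BETAB = 0.8, so the code
 * computes K = cbrt(Wmax * (1 - 0.8) / 0.4) and
 * W(t) = 0.4 * (t - K)^3 + Wmax, i.e. Eqs. 1 and 2 of the draft.
 * (Illustrative restatement, not from the original sources.)
 */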
static void
tcp_cubic_congestion_exp(struct tcpcb *tp)
{
/*
* Congestion - Set WMax and shrink cwnd
*/
tcp_cubic_update_ctime(tp);
/* Section 3.6 - Fast Convergence */
if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
} else {
tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
tp->snd_cubic_wmax = tp->snd_cwnd;
}
tp->snd_cubic_wmax = uimax(tp->t_segsz, tp->snd_cubic_wmax);
/* Shrink CWND */
tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
}
static int
tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
if (SEQ_LT(th->th_ack, tp->snd_high)) {
/* See newreno */
tp->t_dupacks = 0;
return 1;
}
/*
* mark WMax
*/
tcp_cubic_congestion_exp(tp);
/* Do fast retransmit */
return tcp_reno_do_fast_retransmit(tp, th);
}
static void
tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
{
uint32_t ms_elapsed, rtt;
u_long w_tcp;
/* Congestion avoidance and not in fast recovery and usable rtt */
if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
/*
* t_srtt is in units of 1/32 of a slow tick;
* converting it to ms gives
* (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
*/
(rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
ms_elapsed = tcp_cubic_diff_ctime(tp);
/* Compute W_tcp(t) */
w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
ms_elapsed / rtt / 3;
if (tp->snd_cwnd > w_tcp) {
/* Not in TCP friendly mode */
tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
tp->snd_cwnd) / tp->snd_cwnd;
} else {
/* friendly TCP mode */
tp->snd_cwnd = w_tcp;
}
/* Make sure we are within limits */
tp->snd_cwnd = uimax(tp->snd_cwnd, tp->t_segsz);
tp->snd_cwnd = uimin(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
} else {
/* Use New Reno */
tcp_newreno_newack(tp, th);
}
}
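/*
 * Worked example of the rtt conversion above (illustrative numbers,
 * not from the original sources): with PR_SLOWHZ = 2 slow ticks per
 * second, a smoothed RTT of 100 ms is stored as
 * t_srtt = 100 * PR_SLOWHZ * 32 / 1000 = 6 (truncated), and the
 * approximation recovers rtt = (6 << 5) / 2 = 96 ms, close to the
 * exact 6 / 32 * 500 = 93.75 ms.
 */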
static void
tcp_cubic_slow_retransmit(struct tcpcb *tp)
{
/* Timeout - Mark new congestion */
tcp_cubic_congestion_exp(tp);
/* Loss Window MUST be one segment. */
tp->snd_cwnd = tp->t_segsz;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
if (TCP_ECN_ALLOWED(tp))
tp->t_flags |= TF_ECN_SND_CWR;
}
const struct tcp_congctl tcp_cubic_ctl = {
.fast_retransmit = tcp_cubic_fast_retransmit,
.slow_retransmit = tcp_cubic_slow_retransmit,
.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
.newack = tcp_cubic_newack,
.cong_exp = tcp_cubic_congestion_exp,
};
/* $NetBSD: udp6_usrreq.c,v 1.154 2022/11/04 09:01:53 ozaki-r Exp $ */
/* $KAME: udp6_usrreq.c,v 1.86 2001/05/27 17:33:00 itojun Exp $ */
/* $KAME: udp6_output.c,v 1.43 2001/10/15 09:19:52 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)udp_var.h 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: udp6_usrreq.c,v 1.154 2022/11/04 09:01:53 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_inet_csum.h"
#include "opt_ipsec.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/domain.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/in_offload.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/udp_private.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/udp6_var.h>
#include <netinet6/udp6_private.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/scope6_var.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/esp.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#endif
#include "faith.h"
#if defined(NFAITH) && NFAITH > 0
#include <net/if_faith.h>
#endif
/*
* UDP protocol implementation.
* Per RFC 768, August, 1980.
*/
extern struct inpcbtable udbtable;
percpu_t *udp6stat_percpu;
/* UDP on IP6 parameters */
static int udp6_sendspace = 9216; /* really max datagram size */
static int udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6));
/* 40 1K datagrams */
static void udp6_notify(struct inpcb *, int);
static void sysctl_net_inet6_udp6_setup(struct sysctllog **);
#ifdef IPSEC
static int udp6_espinudp(struct mbuf **, int);
#endif
#ifdef UDP_CSUM_COUNTERS
#include <sys/device.h>
struct evcnt udp6_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "udp6", "hwcsum bad");
struct evcnt udp6_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "udp6", "hwcsum ok");
struct evcnt udp6_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "udp6", "hwcsum data");
struct evcnt udp6_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "udp6", "swcsum");
EVCNT_ATTACH_STATIC(udp6_hwcsum_bad);
EVCNT_ATTACH_STATIC(udp6_hwcsum_ok);
EVCNT_ATTACH_STATIC(udp6_hwcsum_data);
EVCNT_ATTACH_STATIC(udp6_swcsum);
#define UDP_CSUM_COUNTER_INCR(ev) (ev)->ev_count++
#else
#define UDP_CSUM_COUNTER_INCR(ev) /* nothing */
#endif
void
udp6_init(void)
{
sysctl_net_inet6_udp6_setup(NULL);
udp6stat_percpu = percpu_alloc(sizeof(uint64_t) * UDP6_NSTATS);
udp_init_common();
}
/*
* Notify a UDP user of an asynchronous error;
* just wake up so that they can collect error status.
*/
static void
udp6_notify(struct inpcb *inp, int errno)
{
inp->inp_socket->so_error = errno;
sorwakeup(inp->inp_socket);
sowwakeup(inp->inp_socket);
}
void *
udp6_ctlinput(int cmd, const struct sockaddr *sa, void *d)
{
struct udphdr uh;
struct ip6_hdr *ip6;
const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa;
struct mbuf *m;
int off;
void *cmdarg;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void (*notify)(struct inpcb *, int) = udp6_notify;
struct udp_portonly {
u_int16_t uh_sport;
u_int16_t uh_dport;
} *uhp;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
if (PRC_IS_REDIRECT(cmd))
notify = in6pcb_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (cmd == PRC_MSGSIZE) {
/* special code is present, see below */
notify = in6pcb_rtchange;
}
else if (inet6ctlerrmap[cmd] == 0)
return NULL;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
off = 0;
}
if (ip6) {
/* check if we can safely examine src and dst ports */
if (m->m_pkthdr.len < off + sizeof(*uhp)) {
if (cmd == PRC_MSGSIZE)
icmp6_mtudisc_update((struct ip6ctlparam *)d, 0);
return NULL;
}
memset(&uh, 0, sizeof(uh));
m_copydata(m, off, sizeof(*uhp), (void *)&uh);
if (cmd == PRC_MSGSIZE) {
int valid = 0;
/*
* Check to see if we have a valid UDP socket
* corresponding to the address in the ICMPv6 message
* payload.
*/
if (in6pcb_lookup(&udbtable, &sa6->sin6_addr,
uh.uh_dport, (const struct in6_addr *)&sa6_src->sin6_addr,
uh.uh_sport, 0, 0))
valid++;
#if 0
/*
* As the use of sendto(2) is fairly popular,
* we may want to allow non-connected pcb too.
* But it could be too weak against attacks...
* We should at least check if the local address (= s)
* is really ours.
*/
else if (in6pcb_lookup_bound(&udbtable, &sa6->sin6_addr,
uh.uh_dport, 0))
valid++;
#endif
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
/*
* Regardless of whether we called
* icmp6_mtudisc_update(), we need to call
* in6pcb_notify(), to notify path MTU change
* to the userland (RFC3542), because some
* unconnected sockets may share the same
* destination and want to know the path MTU.
*/
}
(void)in6pcb_notify(&udbtable, sa, uh.uh_dport,
sin6tocsa(sa6_src), uh.uh_sport, cmd, cmdarg,
notify);
} else {
(void)in6pcb_notify(&udbtable, sa, 0,
sin6tocsa(sa6_src), 0, cmd, cmdarg, notify);
}
return NULL;
}
int
udp6_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int s;
int error = 0;
struct inpcb *inp;
int family;
int optval;
family = so->so_proto->pr_domain->dom_family;
s = splsoftnet();
switch (family) {
#ifdef INET
case PF_INET:
if (sopt->sopt_level != IPPROTO_UDP) {
	error = ip_ctloutput(op, so, sopt);
goto end;
}
break;
#endif
#ifdef INET6
case PF_INET6:
if (sopt->sopt_level != IPPROTO_UDP) {
	error = ip6_ctloutput(op, so, sopt);
goto end;
}
break;
#endif
default:
error = EAFNOSUPPORT;
goto end;
}
switch (op) {
case PRCO_SETOPT:
inp = sotoinpcb(so);
switch (sopt->sopt_name) {
case UDP_ENCAP:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch(optval) {
case 0:
inp->inp_flags &= ~IN6P_ESPINUDP;
break;
case UDP_ENCAP_ESPINUDP:
inp->inp_flags |= IN6P_ESPINUDP;
break;
default:
error = EINVAL;
break;
}
break;
default:
error = ENOPROTOOPT;
break;
}
break;
default:
error = EINVAL;
break;
}
end:
splx(s);
return error;
}
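#if 0
/*
 * Hedged userland sketch (not part of this kernel file): enabling
 * ESP-in-UDP decapsulation on a UDP socket through the UDP_ENCAP
 * option handled in udp6_ctloutput() above.  Illustration only.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udp.h>

static int
example_enable_espinudp(int s)
{
	int opt = UDP_ENCAP_ESPINUDP;

	/* Level is IPPROTO_UDP, matching the sopt_level check above. */
	return setsockopt(s, IPPROTO_UDP, UDP_ENCAP, &opt, sizeof(opt));
}
#endif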
static void
udp6_sendup(struct mbuf *m, int off /* offset of data portion */,
struct sockaddr *src, struct socket *so)
{
struct mbuf *opts = NULL;
struct mbuf *n;
struct inpcb *inp;
KASSERT(so != NULL);
KASSERT(so->so_proto->pr_domain->dom_family == AF_INET6);
inp = sotoinpcb(so);
KASSERT(inp != NULL);
#if defined(IPSEC)
if (ipsec_used && ipsec_in_reject(m, inp)) {
if ((n = m_copypacket(m, M_DONTWAIT)) != NULL)
icmp6_error(n, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_ADMIN, 0);
return;
}
#endif
if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) {
if (inp->inp_flags & IN6P_CONTROLOPTS ||
SOOPT_TIMESTAMP(inp->inp_socket->so_options)) {
struct ip6_hdr *ip6 = mtod(n, struct ip6_hdr *);
ip6_savecontrol(inp, &opts, ip6, n);
}
m_adj(n, off);
if (sbappendaddr(&so->so_rcv, src, n, opts) == 0) {
m_freem(n);
if (opts)
m_freem(opts);
UDP6_STATINC(UDP6_STAT_FULLSOCK);
soroverflow(so);
} else
sorwakeup(so);
}
}
int
udp6_realinput(int af, struct sockaddr_in6 *src, struct sockaddr_in6 *dst,
struct mbuf **mp, int off)
{
u_int16_t sport, dport;
int rcvcnt;
struct in6_addr src6, *dst6;
const struct in_addr *dst4;
struct inpcb *inp;
struct mbuf *m = *mp;
rcvcnt = 0;
off += sizeof(struct udphdr); /* now, offset of payload */
if (af != AF_INET && af != AF_INET6)
goto bad;
if (src->sin6_family != AF_INET6 || dst->sin6_family != AF_INET6)
goto bad;
src6 = src->sin6_addr;
if (sa6_recoverscope(src) != 0) {
/* XXX: should be impossible. */
goto bad;
}
sport = src->sin6_port;
dport = dst->sin6_port;
dst4 = (struct in_addr *)&dst->sin6_addr.s6_addr[12];
dst6 = &dst->sin6_addr;
if (IN6_IS_ADDR_MULTICAST(dst6) ||
(af == AF_INET && IN_MULTICAST(dst4->s_addr))) {
/*
* Deliver a multicast or broadcast datagram to *all* sockets
* for which the local and remote addresses and ports match
* those of the incoming datagram. This allows more than
* one process to receive multi/broadcasts on the same port.
* (This really ought to be done for unicast datagrams as
* well, but that would cause problems with existing
* applications that open both address-specific sockets and
* a wildcard socket listening to the same port -- they would
* end up receiving duplicates of every unicast datagram.
* Those applications open the multiple sockets to overcome an
* inadequacy of the UDP socket interface, but for backwards
* compatibility we avoid the problem here rather than
* fixing the interface. Maybe 4.5BSD will remedy this?)
*/
/*
* KAME note: traditionally we dropped udpiphdr from mbuf here.
* we need udpiphdr for IPsec processing so we do that later.
*/
/*
* Locate pcb(s) for datagram.
*/
TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET6)
continue;
if (inp->inp_lport != dport)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) {
if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp),
dst6))
continue;
} else {
if (IN6_IS_ADDR_V4MAPPED(dst6) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY))
continue;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) {
if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp),
&src6) || inp->inp_fport != sport)
continue;
} else {
if (IN6_IS_ADDR_V4MAPPED(&src6) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY))
continue;
}
udp6_sendup(m, off, sin6tosa(src), inp->inp_socket);
rcvcnt++;
/*
* Don't look for additional matches if this one does
* not have either the SO_REUSEPORT or SO_REUSEADDR
* socket options set. This heuristic avoids searching
* through all pcbs in the common case of a non-shared
* port. It assumes that an application will never
* clear these options after setting them.
*/
if ((inp->inp_socket->so_options &
(SO_REUSEPORT|SO_REUSEADDR)) == 0)
break;
}
} else {
/*
* Locate pcb for datagram.
*/
inp = in6pcb_lookup(&udbtable, &src6, sport, dst6,
dport, 0, 0);
if (inp == NULL) {
UDP_STATINC(UDP_STAT_PCBHASHMISS);
inp = in6pcb_lookup_bound(&udbtable, dst6, dport, 0);
if (inp == NULL)
return rcvcnt;
}
#ifdef IPSEC
/* Handle ESP over UDP */
if (inp->inp_flags & IN6P_ESPINUDP) {
switch (udp6_espinudp(mp, off)) {
case -1: /* Error, m was freed */
rcvcnt = -1;
goto bad;
case 1: /* ESP over UDP */
rcvcnt++;
goto bad;
case 0: /* plain UDP */
default: /* Unexpected */
/*
* Normal UDP processing will take place,
* m may have changed.
*/
m = *mp;
break;
}
}
#endif
if (inp->inp_overudp_cb != NULL) {
int ret;
ret = inp->inp_overudp_cb(mp, off, inp->inp_socket,
sin6tosa(src), inp->inp_overudp_arg);
switch (ret) {
case -1: /* Error, m was freed */
rcvcnt = -1;
goto bad;
case 1: /* Foo over UDP */
KASSERT(*mp == NULL);
rcvcnt++;
goto bad;
case 0: /* plain UDP */
default: /* Unexpected */
/*
* Normal UDP processing will take place,
* m may have changed.
*/
break;
}
}
udp6_sendup(m, off, sin6tosa(src), inp->inp_socket);
rcvcnt++;
}
bad:
return rcvcnt;
}
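/*
 * Illustrative userland sketch (not part of the kernel sources): the
 * multicast branch above delivers a copy of the datagram to every PCB
 * bound to the destination port, but stops after the first match unless
 * that socket has SO_REUSEPORT or SO_REUSEADDR set.  The hypothetical
 * helper below shows the socket setup that lets two listeners share a
 * multicast port; EXAMPLE_PORT and the function name are assumptions,
 * not taken from the sources.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>

#define EXAMPLE_PORT	9999	/* hypothetical port */

static int
open_shared_udp6_listener(void)
{
	struct sockaddr_in6 sin6;
	int s, on = 1;

	s = socket(AF_INET6, SOCK_DGRAM, 0);
	if (s == -1)
		return -1;
	/* Without SO_REUSEPORT the PCB walk above stops at the first match. */
	(void)setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_len = sizeof(sin6);
	sin6.sin6_port = htons(EXAMPLE_PORT);
	sin6.sin6_addr = in6addr_any;
	if (bind(s, (struct sockaddr *)&sin6, sizeof(sin6)) == -1) {
		close(s);
		return -1;
	}
	/* Each listener would also join the group with IPV6_JOIN_GROUP. */
	return s;
}
#endif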
int
udp6_input_checksum(struct mbuf *m, const struct udphdr *uh, int off, int len)
{
/*
* XXX it's better to record and check if this mbuf is
* already checked.
*/
if (__predict_false((m->m_flags & M_LOOP) && !udp_do_loopback_cksum)) {
goto good;
}
if (uh->uh_sum == 0) {
UDP6_STATINC(UDP6_STAT_NOSUM);
goto bad;
}
switch (m->m_pkthdr.csum_flags &
((m_get_rcvif_NOMPSAFE(m)->if_csum_flags_rx & M_CSUM_UDPv6) |
M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
case M_CSUM_UDPv6|M_CSUM_TCP_UDP_BAD:
UDP_CSUM_COUNTER_INCR(&udp6_hwcsum_bad);
UDP6_STATINC(UDP6_STAT_BADSUM);
goto bad;
#if 0 /* notyet */
case M_CSUM_UDPv6|M_CSUM_DATA:
#endif
case M_CSUM_UDPv6:
/* Checksum was okay. */
UDP_CSUM_COUNTER_INCR(&udp6_hwcsum_ok);
break;
default:
/*
* Need to compute it ourselves. Maybe skip checksum
* on loopback interfaces.
*/
UDP_CSUM_COUNTER_INCR(&udp6_swcsum);
if (in6_cksum(m, IPPROTO_UDP, off, len) != 0) {
UDP6_STATINC(UDP6_STAT_BADSUM);
goto bad;
}
}
good:
return 0;
bad:
return -1;
}
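/*
 * Illustrative sketch (not from the sources): the switch above keys on
 * the packet's csum_flags masked with the receive interface's UDPv6
 * offload capability.  The hypothetical helper below restates the three
 * outcomes -- hardware flagged bad, hardware already verified, or fall
 * back to a software in6_cksum() -- for a given flag combination; the
 * enum and function names are assumptions made for illustration only.
 */
#if 0
enum example_csum_verdict {
	EXAMPLE_CSUM_HW_BAD,	/* hardware flagged a bad checksum */
	EXAMPLE_CSUM_HW_OK,	/* hardware already verified it */
	EXAMPLE_CSUM_SOFTWARE	/* must run in6_cksum() in software */
};

static enum example_csum_verdict
example_classify_csum(int pkt_csum_flags, int ifp_rx_csum_flags)
{
	int key = pkt_csum_flags &
	    ((ifp_rx_csum_flags & M_CSUM_UDPv6) |
	    M_CSUM_TCP_UDP_BAD | M_CSUM_DATA);

	if (key == (M_CSUM_UDPv6 | M_CSUM_TCP_UDP_BAD))
		return EXAMPLE_CSUM_HW_BAD;
	if (key == M_CSUM_UDPv6)
		return EXAMPLE_CSUM_HW_OK;
	return EXAMPLE_CSUM_SOFTWARE;
}
#endif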
int
udp6_input(struct mbuf **mp, int *offp, int proto)
{
struct mbuf *m = *mp;
int off = *offp;
struct sockaddr_in6 src, dst;
struct ip6_hdr *ip6;
struct udphdr *uh;
u_int32_t plen, ulen;
ip6 = mtod(m, struct ip6_hdr *);
#if defined(NFAITH) && 0 < NFAITH
if (faithprefix(&ip6->ip6_dst)) {
/* send icmp6 host unreach? */
m_freem(m);
return IPPROTO_DONE;
}
#endif
UDP6_STATINC(UDP6_STAT_IPACKETS);
/* Check for jumbogram is done in ip6_input. We can trust pkthdr.len. */
plen = m->m_pkthdr.len - off;
IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(struct udphdr));
if (uh == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return IPPROTO_DONE;
}
/*
* Enforce alignment requirements that are violated in
* some cases, see kern/50766 for details.
*/
if (ACCESSIBLE_POINTER(uh, struct udphdr) == 0) {
m = m_copyup(m, off + sizeof(struct udphdr), 0);
if (m == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return IPPROTO_DONE;
}
ip6 = mtod(m, struct ip6_hdr *);
uh = (struct udphdr *)(mtod(m, char *) + off);
}
KASSERT(ACCESSIBLE_POINTER(uh, struct udphdr));
ulen = ntohs((u_short)uh->uh_ulen);
/*
* RFC2675 section 4: jumbograms will have 0 in the UDP header field,
* iff payload length > 0xffff.
*/
if (ulen == 0 && plen > 0xffff)
ulen = plen;
if (plen != ulen) {
UDP6_STATINC(UDP6_STAT_BADLEN);
goto bad;
}
/* destination port of 0 is illegal, based on RFC768. */
if (uh->uh_dport == 0)
goto bad;
/*
* Checksum extended UDP header and data. Maybe skip checksum
* on loopback interfaces.
*/
if (udp6_input_checksum(m, uh, off, ulen))
goto bad;
/*
* Construct source and dst sockaddrs.
*/
memset(&src, 0, sizeof(src));
src.sin6_family = AF_INET6;
src.sin6_len = sizeof(struct sockaddr_in6);
src.sin6_addr = ip6->ip6_src;
src.sin6_port = uh->uh_sport;
memset(&dst, 0, sizeof(dst));
dst.sin6_family = AF_INET6;
dst.sin6_len = sizeof(struct sockaddr_in6);
dst.sin6_addr = ip6->ip6_dst;
dst.sin6_port = uh->uh_dport;
if (udp6_realinput(AF_INET6, &src, &dst, &m, off) == 0) {
if (m->m_flags & M_MCAST) {
UDP6_STATINC(UDP6_STAT_NOPORTMCAST);
goto bad;
}
UDP6_STATINC(UDP6_STAT_NOPORT);
icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0);
m = NULL;
}
bad:
if (m)
m_freem(m);
return IPPROTO_DONE;
}
int
udp6_output(struct inpcb * const inp, struct mbuf *m,
struct sockaddr_in6 * const addr6, struct mbuf * const control,
struct lwp * const l)
{
u_int32_t ulen = m->m_pkthdr.len;
u_int32_t plen = sizeof(struct udphdr) + ulen;
struct ip6_hdr *ip6;
struct udphdr *udp6;
struct in6_addr _laddr, *laddr, *faddr;
struct in6_addr laddr_mapped; /* XXX ugly */
struct sockaddr_in6 *sin6 = NULL;
struct ifnet *oifp = NULL;
int scope_ambiguous = 0;
u_int16_t fport;
int error = 0;
struct ip6_pktopts *optp = NULL;
struct ip6_pktopts opt;
int af = AF_INET6, hlen = sizeof(struct ip6_hdr);
#ifdef INET
struct ip *ip;
struct udpiphdr *ui;
int flags = 0;
#endif
struct sockaddr_in6 tmp;
if (addr6) {
sin6 = addr6;
if (sin6->sin6_len != sizeof(*sin6)) {
error = EINVAL;
goto release;
}
if (sin6->sin6_family != AF_INET6) {
error = EAFNOSUPPORT;
goto release;
}
/* protect *sin6 from overwrites */
tmp = *sin6;
sin6 = &tmp;
/*
* Applications should provide a proper zone ID, or the use of
* default zone IDs should be enabled. Unfortunately, some
* applications do not behave as they should, so we need a
* workaround. Even if an appropriate ID is not determined,
* we'll see if we can determine the outgoing interface. If we
* can, determine the zone ID based on the interface below.
*/
if (sin6->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0)
goto release;
}
if (control) {
if (__predict_false(l == NULL)) {
panic("%s: control but no lwp", __func__);
}
if ((error = ip6_setpktopts(control, &opt,
in6p_outputopts(inp), l->l_cred, IPPROTO_UDP)) != 0)
goto release;
optp = &opt;
} else
optp = in6p_outputopts(inp);
if (sin6) {
/*
* Slightly different than v4 version in that we call
* in6_selectsrc and in6pcb_set_port to fill in the local
* address and port rather than inpcb_connect. inpcb_connect
* sets inp_faddr which causes EISCONN below to be hit on
* subsequent sendto.
*/
if (sin6->sin6_port == 0) {
error = EADDRNOTAVAIL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) {
/* how about ::ffff:0.0.0.0 case? */
error = EISCONN;
goto release;
}
faddr = &sin6->sin6_addr;
fport = sin6->sin6_port; /* allow 0 port */
if (IN6_IS_ADDR_V4MAPPED(faddr)) {
if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
/*
* I believe we should explicitly discard the
* packet when mapped addresses are disabled,
* rather than send the packet as an IPv6 one.
* If we chose the latter approach, the packet
* might be sent out on the wire based on the
* default route, the situation which we'd
* probably want to avoid.
* (20010421 jinmei@kame.net)
*/
error = EINVAL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) &&
!IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) {
/*
* when remote addr is an IPv4-mapped address,
* local addr should not be an IPv6 address,
* since you cannot determine how to map IPv6
* source address to IPv4.
*/
error = EINVAL;
goto release;
}
af = AF_INET;
}
if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
struct psref psref;
int bound = curlwp_bind();
error = in6_selectsrc(sin6, optp,
in6p_moptions(inp),
&inp->inp_route,
&in6p_laddr(inp), &oifp, &psref, &_laddr);
if (error)
laddr = NULL;
else
laddr = &_laddr;
if (oifp && scope_ambiguous &&
(error = in6_setscope(&sin6->sin6_addr,
oifp, NULL))) {
if_put(oifp, &psref);
curlwp_bindx(bound);
goto release;
}
if_put(oifp, &psref);
curlwp_bindx(bound);
} else {
/*
* XXX: freebsd[34] does not have in_selectsrc, but
* we can omit the whole part because freebsd4 calls
* udp_output() directly in this case, and thus we'll
* never see this path.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) {
struct sockaddr_in sin_dst;
struct in_addr ina;
struct in_ifaddr *ia4;
struct psref _psref;
int bound;
memcpy(&ina, &faddr->s6_addr[12], sizeof(ina));
sockaddr_in_init(&sin_dst, &ina, 0);
bound = curlwp_bind();
ia4 = in_selectsrc(&sin_dst, &inp->inp_route,
inp->inp_socket->so_options, NULL,
&error, &_psref);
if (ia4 == NULL) {
curlwp_bindx(bound);
if (error == 0)
error = EADDRNOTAVAIL;
goto release;
}
memset(&laddr_mapped, 0, sizeof(laddr_mapped));
laddr_mapped.s6_addr16[5] = 0xffff; /* ugly */
memcpy(&laddr_mapped.s6_addr[12],
&IA_SIN(ia4)->sin_addr,
sizeof(IA_SIN(ia4)->sin_addr));
ia4_release(ia4, &_psref);
curlwp_bindx(bound);
laddr = &laddr_mapped;
} else {
laddr = &in6p_laddr(inp); /* XXX */
}
}
if (laddr == NULL) {
if (error == 0)
error = EADDRNOTAVAIL;
goto release;
}
if (inp->inp_lport == 0) {
/*
* Craft a sockaddr_in6 for the local endpoint. Use the
* "any" as a base, set the address, and recover the
* scope.
*/
struct sockaddr_in6 lsin6 =
*((const struct sockaddr_in6 *)inp->inp_socket->so_proto->pr_domain->dom_sa_any);
lsin6.sin6_addr = *laddr;
error = sa6_recoverscope(&lsin6);
if (error)
goto release;
error = in6pcb_set_port(&lsin6, inp, l);
if (error) {
in6p_laddr(inp) = in6addr_any;
goto release;
}
}
} else {
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) {
error = ENOTCONN;
goto release;
}
if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
/*
* XXX: this case would happen when the
* application sets the V6ONLY flag after
* connecting the foreign address.
* Such applications should be fixed,
* so we bark here.
*/
log(LOG_INFO, "udp6_output: IPV6_V6ONLY "
"option was set for a connected socket\n");
error = EINVAL;
goto release;
} else
af = AF_INET;
}
laddr = &in6p_laddr(inp);
faddr = &in6p_faddr(inp);
fport = inp->inp_fport;
}
if (af == AF_INET)
hlen = sizeof(struct ip);
/*
* Calculate data length and get a mbuf
* for UDP and IP6 headers.
*/
M_PREPEND(m, hlen + sizeof(struct udphdr), M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto release;
}
/*
* Stuff checksum and output datagram.
*/
udp6 = (struct udphdr *)(mtod(m, char *) + hlen);
udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */
udp6->uh_dport = fport;
if (plen <= 0xffff)
udp6->uh_ulen = htons((u_int16_t)plen);
else
udp6->uh_ulen = 0;
udp6->uh_sum = 0;
switch (af) {
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
#if 0 /* ip6_plen will be filled in ip6_output. */
ip6->ip6_plen = htons((u_int16_t)plen);
#endif
ip6->ip6_nxt = IPPROTO_UDP;
ip6->ip6_hlim = in6pcb_selecthlim_rt(inp);
ip6->ip6_src = *laddr;
ip6->ip6_dst = *faddr;
udp6->uh_sum = in6_cksum_phdr(laddr, faddr,
htonl(plen), htonl(IPPROTO_UDP));
m->m_pkthdr.csum_flags = M_CSUM_UDPv6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
UDP6_STATINC(UDP6_STAT_OPACKETS);
error = ip6_output(m, optp, &inp->inp_route, 0,
in6p_moptions(inp), inp, NULL);
break;
case AF_INET:
#ifdef INET
/* can't transmit jumbogram over IPv4 */
if (plen > 0xffff) {
error = EMSGSIZE;
goto release;
}
ip = mtod(m, struct ip *);
ui = (struct udpiphdr *)ip;
memset(ui->ui_x1, 0, sizeof(ui->ui_x1));
ui->ui_pr = IPPROTO_UDP;
ui->ui_len = htons(plen);
memcpy(&ui->ui_src, &laddr->s6_addr[12], sizeof(ui->ui_src));
ui->ui_ulen = ui->ui_len;
flags = (inp->inp_socket->so_options &
(SO_DONTROUTE | SO_BROADCAST));
memcpy(&ui->ui_dst, &faddr->s6_addr[12], sizeof(ui->ui_dst));
udp6->uh_sum = in_cksum(m, hlen + plen);
if (udp6->uh_sum == 0)
udp6->uh_sum = 0xffff;
ip->ip_len = htons(hlen + plen);
ip->ip_ttl = in6pcb_selecthlim(inp, NULL); /* XXX */
ip->ip_tos = 0; /* XXX */
UDP_STATINC(UDP_STAT_OPACKETS);
error = ip_output(m, NULL, &inp->inp_route, flags /* XXX */,
inp->inp_moptions, NULL);
break;
#else
error = EAFNOSUPPORT;
goto release;
#endif
}
goto releaseopt;
release:
m_freem(m);
releaseopt:
if (control) {
if (optp == &opt)
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
return (error);
}
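/*
 * Illustrative userland sketch (not part of the sources): sending to a
 * v4-mapped destination such as ::ffff:192.0.2.1 on an AF_INET6 socket
 * with IPV6_V6ONLY disabled takes the AF_INET branch of udp6_output()
 * above, which emits a plain IPv4 UDP packet.  The address and port
 * below are documentation examples, not values from the sources.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

static int
send_to_v4mapped(int s)
{
	struct sockaddr_in6 dst;
	int off = 0;
	static const char msg[] = "hello";

	/* Allow v4-mapped use; with IPV6_V6ONLY set udp6_output() fails with EINVAL. */
	(void)setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &off, sizeof(off));
	memset(&dst, 0, sizeof(dst));
	dst.sin6_family = AF_INET6;
	dst.sin6_len = sizeof(dst);
	dst.sin6_port = htons(9999);		/* hypothetical port */
	inet_pton(AF_INET6, "::ffff:192.0.2.1", &dst.sin6_addr);
	return sendto(s, msg, sizeof(msg) - 1, 0,
	    (const struct sockaddr *)&dst, sizeof(dst)) == -1 ? -1 : 0;
}
#endif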
static int
udp6_attach(struct socket *so, int proto)
{
struct inpcb *inp;
int s, error;
KASSERT(sotoinpcb(so) == NULL);
sosetlock(so);
error = soreserve(so, udp6_sendspace, udp6_recvspace);
if (error) {
return error;
}
/*
* MAPPED_ADDR implementation spec:
* Always attach for IPv6, and only when necessary for IPv4.
*/
s = splsoftnet();
error = inpcb_create(so, &udbtable);
splx(s);
if (error) {
return error;
}
inp = sotoinpcb(so);
in6p_cksum(inp) = -1; /* just to be sure */
KASSERT(solocked(so));
return 0;
}
static void
udp6_detach(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
int s;
KASSERT(solocked(so));
KASSERT(inp != NULL);
s = splsoftnet();
inpcb_destroy(inp);
splx(s);
}
static int
udp6_accept(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_bind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
int error = 0;
int s;
KASSERT(solocked(so));
KASSERT(inp != NULL);
s = splsoftnet();
error = in6pcb_bind(inp, sin6, l);
splx(s);
return error;
}
static int
udp6_listen(struct socket *so, struct lwp *l)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_connect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
int error = 0;
int s;
KASSERT(solocked(so));
KASSERT(inp != NULL);
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)))
return EISCONN;
s = splsoftnet();
error = in6pcb_connect(inp, (struct sockaddr_in6 *)nam, l);
splx(s);
if (error == 0)
soisconnected(so);
return error;
}
static int
udp6_connect2(struct socket *so, struct socket *so2)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_disconnect(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
int s;
KASSERT(solocked(so));
KASSERT(inp != NULL);
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)))
return ENOTCONN;
s = splsoftnet();
in6pcb_disconnect(inp);
memset((void *)&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp)));
splx(s);
so->so_state &= ~SS_ISCONNECTED; /* XXX */
in6pcb_set_state(inp, INP_BOUND); /* XXX */
return 0;
}
static int
udp6_shutdown(struct socket *so)
{
int s;
s = splsoftnet();
socantsendmore(so);
splx(s);
return 0;
}
static int
udp6_abort(struct socket *so)
{
int s;
KASSERT(solocked(so));
KASSERT(sotoinpcb(so) != NULL);
s = splsoftnet();
soisdisconnected(so);
inpcb_destroy(sotoinpcb(so));
splx(s);
return 0;
}
static int
udp6_ioctl(struct socket *so, u_long cmd, void *addr6, struct ifnet *ifp)
{
/*
* MAPPED_ADDR implementation info:
* Mapped address support for PRU_CONTROL is not necessary,
* because the typical users of PRU_CONTROL (ifconfig and the
* like) do not associate any address with their socket. The
* socket family is then only a hint about the address family
* being controlled, especially when fetching addresses from
* the kernel. So an AF_INET socket must be used to control
* AF_INET addresses, and an AF_INET6 socket for AF_INET6 ones.
*/
return in6_control(so, cmd, addr6, ifp);
}
static int
udp6_stat(struct socket *so, struct stat *ub)
{
KASSERT(solocked(so));
/* stat: don't bother with a blocksize */
return 0;
}
static int
udp6_peeraddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotoinpcb(so) != NULL);
KASSERT(nam != NULL);
in6pcb_fetch_peeraddr(sotoinpcb(so), (struct sockaddr_in6 *)nam);
return 0;
}
static int
udp6_sockaddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotoinpcb(so) != NULL);
KASSERT(nam != NULL);
in6pcb_fetch_sockaddr(sotoinpcb(so), (struct sockaddr_in6 *)nam);
return 0;
}
static int
udp6_rcvd(struct socket *so, int flags, struct lwp *l)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_recvoob(struct socket *so, struct mbuf *m, int flags)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
int error = 0;
int s;
KASSERT(solocked(so));
KASSERT(inp != NULL);
KASSERT(m != NULL);
s = splsoftnet();
error = udp6_output(inp, m, (struct sockaddr_in6 *)nam, control, l);
splx(s);
return error;
}
static int
udp6_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control)
{
KASSERT(solocked(so));
m_freem(m);
m_freem(control);
return EOPNOTSUPP;
}
static int
udp6_purgeif(struct socket *so, struct ifnet *ifp)
{
mutex_enter(softnet_lock);
in6pcb_purgeif0(&udbtable, ifp);
#ifdef NET_MPSAFE
mutex_exit(softnet_lock);
#endif
in6_purgeif(ifp);
#ifdef NET_MPSAFE
mutex_enter(softnet_lock);
#endif
in6pcb_purgeif(&udbtable, ifp);
mutex_exit(softnet_lock);
return 0;
}
static int
sysctl_net_inet6_udp6_stats(SYSCTLFN_ARGS)
{
return (NETSTAT_SYSCTL(udp6stat_percpu, UDP6_NSTATS));
}
static void
sysctl_net_inet6_udp6_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "udp6",
SYSCTL_DESCR("UDPv6 related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "sendspace",
SYSCTL_DESCR("Default UDP send buffer size"),
NULL, 0, &udp6_sendspace, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_SENDSPACE,
CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "recvspace",
SYSCTL_DESCR("Default UDP receive buffer size"),
NULL, 0, &udp6_recvspace, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_RECVSPACE,
CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "do_loopback_cksum",
SYSCTL_DESCR("Perform UDP checksum on loopback"),
NULL, 0, &udp_do_loopback_cksum, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_LOOPBACKCKSUM,
CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "pcblist",
SYSCTL_DESCR("UDP protocol control block list"),
sysctl_inpcblist, 0, &udbtable, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, CTL_CREATE,
CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("UDPv6 statistics"),
sysctl_net_inet6_udp6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_STATS,
CTL_EOL);
}
void
udp6_statinc(u_int stat)
{
KASSERT(stat < UDP6_NSTATS);
UDP6_STATINC(stat);
}
#ifdef IPSEC
/*
* Returns:
* 1 if the packet was processed
* 0 if normal UDP processing should take place
* -1 if an error occurred and m was freed
*/
static int
udp6_espinudp(struct mbuf **mp, int off)
{
const size_t skip = sizeof(struct udphdr);
size_t len;
void *data;
size_t minlen;
int ip6hdrlen;
struct ip6_hdr *ip6;
struct m_tag *tag;
struct udphdr *udphdr;
u_int16_t sport, dport;
struct mbuf *m = *mp;
uint32_t *marker;
/*
* Collapse the mbuf chain if the first mbuf is too short.
* The longest case is: UDP + non-ESP marker + ESP.
*/
minlen = off + sizeof(u_int64_t) + sizeof(struct esp);
if (minlen > m->m_pkthdr.len)
minlen = m->m_pkthdr.len;
if (m->m_len < minlen) {
if ((*mp = m_pullup(m, minlen)) == NULL) {
return -1;
}
m = *mp;
}
len = m->m_len - off;
data = mtod(m, char *) + off;
/* Ignore keepalive packets */
if ((len == 1) && (*(unsigned char *)data == 0xff)) {
m_freem(m);
*mp = NULL; /* avoid any further processing by caller ... */
return 1;
}
/* Handle Non-ESP marker (32bit). If zero, then IKE. */
marker = (uint32_t *)data;
if (len <= sizeof(uint32_t))
return 0;
if (marker[0] == 0)
return 0;
/*
* Get the UDP ports. They are handled in network
* order everywhere in IPSEC_NAT_T code.
*/
udphdr = (struct udphdr *)((char *)data - skip);
sport = udphdr->uh_sport;
dport = udphdr->uh_dport;
/*
* Remove the UDP header (and possibly the non ESP marker)
* IPv6 header length is ip6hdrlen
* Before:
* <---- off --->
* +-----+------+-----+
* | IP6 | UDP | ESP |
* +-----+------+-----+
* <-skip->
* After:
* +-----+-----+
* | IP6 | ESP |
* +-----+-----+
* <-skip->
*/
ip6hdrlen = off - sizeof(struct udphdr);
memmove(mtod(m, char *) + skip, mtod(m, void *), ip6hdrlen);
m_adj(m, skip);
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - skip);
ip6->ip6_nxt = IPPROTO_ESP;
/*
* We have modified the packet - it is now ESP, so we should not
* return to UDP processing ...
*
* Add a PACKET_TAG_IPSEC_NAT_T_PORTS tag to remember
* the source UDP port. This is required if we want
* to select the right SPD for multiple hosts behind
* the same NAT.
*/
if ((tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
sizeof(sport) + sizeof(dport), M_DONTWAIT)) == NULL) {
m_freem(m);
return -1;
}
((u_int16_t *)(tag + 1))[0] = sport;
((u_int16_t *)(tag + 1))[1] = dport;
m_tag_prepend(m, tag);
if (ipsec_used)
ipsec6_common_input(&m, &ip6hdrlen, IPPROTO_ESP);
else
m_freem(m);
/* We handled it, it shouldn't be handled by UDP */
*mp = NULL; /* avoid free by caller ... */
return 1;
}
#endif /* IPSEC */
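/*
 * Illustrative sketch (not from the sources): udp6_espinudp() above
 * distinguishes three payload types on a NAT-T socket -- a one-byte 0xff
 * keepalive, a packet carrying a zero non-ESP marker (treated as plain
 * UDP, i.e. IKE), and a real ESP packet.  The hypothetical classifier
 * below restates that decision for a plain byte buffer; all names here
 * are illustrative assumptions.
 */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <string.h>

enum example_natt_kind {
	EXAMPLE_NATT_KEEPALIVE,		/* one byte, 0xff: drop silently */
	EXAMPLE_NATT_PLAIN_UDP,		/* zero non-ESP marker or short: IKE */
	EXAMPLE_NATT_ESP		/* anything else: hand to ESP input */
};

static enum example_natt_kind
example_classify_natt(const uint8_t *payload, size_t len)
{
	uint32_t marker;

	if (len == 1 && payload[0] == 0xff)
		return EXAMPLE_NATT_KEEPALIVE;
	if (len <= sizeof(marker))
		return EXAMPLE_NATT_PLAIN_UDP;	/* too short for ESP */
	memcpy(&marker, payload, sizeof(marker));
	if (marker == 0)
		return EXAMPLE_NATT_PLAIN_UDP;	/* non-ESP marker present */
	return EXAMPLE_NATT_ESP;
}
#endif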
PR_WRAP_USRREQS(udp6)
#define udp6_attach udp6_attach_wrapper
#define udp6_detach udp6_detach_wrapper
#define udp6_accept udp6_accept_wrapper
#define udp6_bind udp6_bind_wrapper
#define udp6_listen udp6_listen_wrapper
#define udp6_connect udp6_connect_wrapper
#define udp6_connect2 udp6_connect2_wrapper
#define udp6_disconnect udp6_disconnect_wrapper
#define udp6_shutdown udp6_shutdown_wrapper
#define udp6_abort udp6_abort_wrapper
#define udp6_ioctl udp6_ioctl_wrapper
#define udp6_stat udp6_stat_wrapper
#define udp6_peeraddr udp6_peeraddr_wrapper
#define udp6_sockaddr udp6_sockaddr_wrapper
#define udp6_rcvd udp6_rcvd_wrapper
#define udp6_recvoob udp6_recvoob_wrapper
#define udp6_send udp6_send_wrapper
#define udp6_sendoob udp6_sendoob_wrapper
#define udp6_purgeif udp6_purgeif_wrapper
const struct pr_usrreqs udp6_usrreqs = {
.pr_attach = udp6_attach,
.pr_detach = udp6_detach,
.pr_accept = udp6_accept,
.pr_bind = udp6_bind,
.pr_listen = udp6_listen,
.pr_connect = udp6_connect,
.pr_connect2 = udp6_connect2,
.pr_disconnect = udp6_disconnect,
.pr_shutdown = udp6_shutdown,
.pr_abort = udp6_abort,
.pr_ioctl = udp6_ioctl,
.pr_stat = udp6_stat,
.pr_peeraddr = udp6_peeraddr,
.pr_sockaddr = udp6_sockaddr,
.pr_rcvd = udp6_rcvd,
.pr_recvoob = udp6_recvoob,
.pr_send = udp6_send,
.pr_sendoob = udp6_sendoob,
.pr_purgeif = udp6_purgeif,
};
/* $NetBSD: scope6.c,v 1.23 2020/06/16 17:12:18 maxv Exp $ */
/* $KAME$ */
/*
* Copyright (C) 2000 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scope6.c,v 1.23 2020/06/16 17:12:18 maxv Exp $");
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/queue.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>
#include <netinet6/scope6_var.h>
#ifdef ENABLE_DEFAULT_SCOPE
int ip6_use_defzone = 1;
#else
int ip6_use_defzone = 0;
#endif
static struct scope6_id sid_default;
#define SID(ifp) \
((ifp)->if_afdata[AF_INET6] == NULL ? NULL : \
((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->scope6_id)
void
scope6_init(void)
{
memset(&sid_default, 0, sizeof(sid_default));
}
struct scope6_id *
scope6_ifattach(struct ifnet *ifp)
{
struct scope6_id *sid;
sid = malloc(sizeof(*sid), M_IFADDR, M_WAITOK | M_ZERO);
/*
* XXX: IPV6_ADDR_SCOPE_xxx macros are not standard.
* Should we rather hardcode here?
*/
sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index;
sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index;
#ifdef MULTI_SCOPE
/* by default, we don't care about scope boundary for these scopes. */
sid->s6id_list[IPV6_ADDR_SCOPE_SITELOCAL] = 1;
sid->s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL] = 1;
#endif
return sid;
}
void
scope6_ifdetach(struct scope6_id *sid)
{
free(sid, M_IFADDR);
}
/*
* Get a scope of the address. Interface-local, link-local, site-local
* or global.
*/
int
in6_addrscope(const struct in6_addr *addr)
{
int scope;
if (addr->s6_addr[0] == 0xfe) {
scope = addr->s6_addr[1] & 0xc0;
switch (scope) {
case 0x80:
return IPV6_ADDR_SCOPE_LINKLOCAL;
case 0xc0:
return IPV6_ADDR_SCOPE_SITELOCAL;
default:
return IPV6_ADDR_SCOPE_GLOBAL; /* just in case */
}
}
if (addr->s6_addr[0] == 0xff) {
scope = addr->s6_addr[1] & 0x0f;
/*
* Because of other scope values, such as reserved ones,
* simply returning the raw scope value doesn't work here.
*/
switch (scope) {
case IPV6_ADDR_SCOPE_INTFACELOCAL:
return IPV6_ADDR_SCOPE_INTFACELOCAL;
case IPV6_ADDR_SCOPE_LINKLOCAL:
return IPV6_ADDR_SCOPE_LINKLOCAL;
case IPV6_ADDR_SCOPE_SITELOCAL:
return IPV6_ADDR_SCOPE_SITELOCAL;
default:
return IPV6_ADDR_SCOPE_GLOBAL;
}
}
if (memcmp(&in6addr_loopback, addr, sizeof(*addr) - 1) == 0) {
if (addr->s6_addr[15] == 1) /* loopback */
return IPV6_ADDR_SCOPE_LINKLOCAL;
if (addr->s6_addr[15] == 0) {
/*
* Regard the unspecified addresses as global,
* since it has no ambiguity.
* XXX: not sure if it's correct...
*/
return IPV6_ADDR_SCOPE_GLOBAL;
}
}
return IPV6_ADDR_SCOPE_GLOBAL;
}
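/*
 * Illustrative userland sketch (not from the sources): a minimal mirror
 * of the classification above, applied to textbook addresses.  fe80::1
 * is link-local, fec0::1 site-local, ff02::1 a link-local multicast
 * group, ::1 link-local (loopback) and 2001:db8::1 global.  The helper
 * name and sample addresses are assumptions for illustration only.
 */
#if 0
#include <netinet/in.h>
#include <arpa/inet.h>

static const char *
example_scope_name(const char *txt)
{
	struct in6_addr a;

	if (inet_pton(AF_INET6, txt, &a) != 1)
		return "invalid";
	if (a.s6_addr[0] == 0xfe && (a.s6_addr[1] & 0xc0) == 0x80)
		return "link-local";
	if (a.s6_addr[0] == 0xfe && (a.s6_addr[1] & 0xc0) == 0xc0)
		return "site-local";
	if (a.s6_addr[0] == 0xff)
		return (a.s6_addr[1] & 0x0f) == 0x02 ?
		    "link-local multicast" : "other multicast scope";
	if (IN6_IS_ADDR_LOOPBACK(&a))
		return "link-local (loopback)";
	return "global";
}
#endif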
uint32_t
scope6_addr2default(const struct in6_addr *addr)
{
uint32_t id;
/*
* special case: The loopback address should be considered as
* link-local, but there's no ambiguity in the syntax.
*/
if (IN6_IS_ADDR_LOOPBACK(addr))
return 0;
/*
* XXX: 32-bit read is atomic on all our platforms, is it OK
* not to lock here?
*/
id = sid_default.s6id_list[in6_addrscope(addr)];
return id;
}
/*
* Validate the specified scope zone ID in the sin6_scope_id field. If the ID
* is unspecified (=0), needs to be specified, and the default zone ID can be
* used, the default value will be used.
* This routine then generates the kernel-internal form: if the scope of the
* address is interface-local or link-local, embed the interface index in the
* address.
*/
int
sa6_embedscope(struct sockaddr_in6 *sin6, int defaultok)
{
struct ifnet *ifp;
uint32_t zoneid;
if ((zoneid = sin6->sin6_scope_id) == 0 && defaultok)
zoneid = scope6_addr2default(&sin6->sin6_addr);
if (zoneid != 0 && (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))) {
int s;
/*
* At this moment, we only check interface-local and
* link-local scope IDs, and use interface indices as the
* zone IDs assuming a one-to-one mapping between interfaces
* and links.
*/
s = pserialize_read_enter();
ifp = if_byindex(zoneid);
if (ifp == NULL) {
pserialize_read_exit(s);
return ENXIO;
}
pserialize_read_exit(s);
/* XXX assignment to 16bit from 32bit variable */
sin6->sin6_addr.s6_addr16[1] = htons(zoneid & 0xffff);
sin6->sin6_scope_id = 0;
}
return 0;
}
struct sockaddr *
sockaddr_in6_externalize(struct sockaddr *dst, socklen_t socklen,
const struct sockaddr *src)
{
struct sockaddr_in6 *sin6;
sin6 = satosin6(sockaddr_copy(dst, socklen, src));
if (sin6 == NULL || sa6_recoverscope(sin6) != 0)
return NULL;
return dst;
}
/*
* generate standard sockaddr_in6 from embedded form.
*/
int
sa6_recoverscope(struct sockaddr_in6 *sin6)
{
uint32_t zoneid;
char ip6buf[INET6_ADDRSTRLEN];
if (sin6->sin6_scope_id != 0) {
log(LOG_NOTICE,
"%s: assumption failure (non 0 ID): %s%%%d\n", __func__,
IN6_PRINT(ip6buf, &sin6->sin6_addr), sin6->sin6_scope_id);
/* XXX: proceed anyway... */
}
if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr)) {
/*
* KAME assumption: link id == interface id
*/
zoneid = ntohs(sin6->sin6_addr.s6_addr16[1]);
if (zoneid) {
int s = pserialize_read_enter();
if (!if_byindex(zoneid)) {
pserialize_read_exit(s);
return ENXIO;
}
pserialize_read_exit(s);
sin6->sin6_addr.s6_addr16[1] = 0;
sin6->sin6_scope_id = zoneid;
}
}
return 0;
}
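/*
 * Illustrative sketch (not from the sources): the kernel-internal form
 * embeds the zone ID of a link-local address in the third and fourth
 * address bytes and clears sin6_scope_id; sa6_recoverscope() above
 * reverses that.  The snippet below walks a hypothetical fe80::1%2
 * sockaddr through both directions by hand; the zone ID value 2 is just
 * an example interface index.
 */
#if 0
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static void
example_scope_roundtrip(void)
{
	struct sockaddr_in6 sin6;

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_len = sizeof(sin6);
	inet_pton(AF_INET6, "fe80::1", &sin6.sin6_addr);
	sin6.sin6_scope_id = 2;			/* hypothetical if_index */

	/* Embed: what sa6_embedscope() does for a link-local scope. */
	sin6.sin6_addr.s6_addr[2] = (sin6.sin6_scope_id >> 8) & 0xff;
	sin6.sin6_addr.s6_addr[3] = sin6.sin6_scope_id & 0xff;
	sin6.sin6_scope_id = 0;

	/* Recover: what sa6_recoverscope() does. */
	sin6.sin6_scope_id = (sin6.sin6_addr.s6_addr[2] << 8) |
	    sin6.sin6_addr.s6_addr[3];
	sin6.sin6_addr.s6_addr[2] = 0;
	sin6.sin6_addr.s6_addr[3] = 0;
}
#endif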
int
in6_setzoneid(struct in6_addr *in6, uint32_t zoneid)
{
if (IN6_IS_SCOPE_EMBEDDABLE(in6))
in6->s6_addr16[1] = htons(zoneid & 0xffff); /* XXX */
return 0;
}
/*
* Determine the appropriate scope zone ID for in6 and ifp. If ret_id is
* non NULL, it is set to the zone ID. If the zone ID needs to be embedded
* in the in6_addr structure, in6 will be modified.
*/
int
in6_setscope(struct in6_addr *in6, const struct ifnet *ifp, uint32_t *ret_id)
{
int scope;
uint32_t zoneid = 0;
const struct scope6_id *sid = SID(ifp);
if (sid == NULL) {
log(LOG_NOTICE, "%s: no scope id for %s\n", __func__,
if_name(ifp));
return EINVAL;
}
/*
* special case: the loopback address can only belong to a loopback
* interface.
*/
if (IN6_IS_ADDR_LOOPBACK(in6)) {
if (!(ifp->if_flags & IFF_LOOPBACK)) {
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_NOTICE, "%s: can't set scope for not loopback "
"interface %s and loopback address %s\n",
__func__, if_name(ifp), IN6_PRINT(ip6buf, in6));
return EINVAL;
} else {
if (ret_id != NULL)
*ret_id = 0; /* there's no ambiguity */
return 0;
}
}
scope = in6_addrscope(in6);
switch (scope) {
case IPV6_ADDR_SCOPE_INTFACELOCAL: /* should be interface index */
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL];
break;
case IPV6_ADDR_SCOPE_LINKLOCAL:
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL];
break;
case IPV6_ADDR_SCOPE_SITELOCAL:
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_SITELOCAL];
break;
case IPV6_ADDR_SCOPE_ORGLOCAL:
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL];
break;
default:
zoneid = 0; /* XXX: treat as global. */
break;
}
if (ret_id != NULL)
*ret_id = zoneid;
return in6_setzoneid(in6, zoneid);
}
const char *
in6_getscopename(const struct in6_addr *addr)
{
switch (in6_addrscope(addr)) {
case IPV6_ADDR_SCOPE_INTFACELOCAL:
return "interface";
#if IPV6_ADDR_SCOPE_INTFACELOCAL != IPV6_ADDR_SCOPE_NODELOCAL
case IPV6_ADDR_SCOPE_NODELOCAL:
return "node";
#endif
case IPV6_ADDR_SCOPE_LINKLOCAL:
return "link";
case IPV6_ADDR_SCOPE_SITELOCAL:
return "site";
case IPV6_ADDR_SCOPE_ORGLOCAL:
return "organization";
case IPV6_ADDR_SCOPE_GLOBAL:
return "global";
default:
return "unknown";
}
}
/*
* Just clear the embedded scope identifier. Return 0 if the original address
* is intact; return non-zero if the address is modified.
*/
int
in6_clearscope(struct in6_addr *in6)
{
int modified = 0;
if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) {
if (in6->s6_addr16[1] != 0)
modified = 1;
in6->s6_addr16[1] = 0;
}
return modified;
}
/* $NetBSD: kern_stub.c,v 1.50 2020/08/01 02:04:55 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_xxx.c 8.3 (Berkeley) 3/29/95
*/
/*
* Stubs for system calls and facilities not included in the system.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_stub.c,v 1.50 2020/08/01 02:04:55 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ktrace.h"
#include "opt_sysv.h"
#include "opt_modular.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/fstypes.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/ktrace.h>
#include <sys/intr.h>
#include <sys/cpu.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/userconf.h>
bool default_bus_space_is_equal(bus_space_tag_t, bus_space_tag_t);
bool default_bus_space_handle_is_equal(bus_space_tag_t, bus_space_handle_t,
bus_space_handle_t);
/*
* SYSV Semaphores, Shared Memory, Message Queues
*/
#ifndef MODULAR
#ifndef SYSVMSG
__strong_alias(msgctl1,enosys);
#endif
#ifndef SYSVSHM
__strong_alias(shmctl1,enosys);
#endif
#ifndef SYSVSEM
__strong_alias(semctl1,enosys);
#endif
#endif
/*
* ktrace stubs. ktruser() goes to enosys as we want to fail the syscall,
* but not kill the process: utrace() is a debugging feature.
*/
#ifndef KTRACE
__strong_alias(ktr_csw,nullop); /* Probes */
__strong_alias(ktr_emul,nullop);
__strong_alias(ktr_geniov,nullop);
__strong_alias(ktr_genio,nullop);
__strong_alias(ktr_mibio,nullop);
__strong_alias(ktr_namei,nullop);
__strong_alias(ktr_namei2,nullop);
__strong_alias(ktr_psig,nullop);
__strong_alias(ktr_syscall,nullop);
__strong_alias(ktr_sysret,nullop);
__strong_alias(ktr_kuser,nullop);
__strong_alias(ktr_mib,nullop);
__strong_alias(ktr_execarg,nullop);
__strong_alias(ktr_execenv,nullop);
__strong_alias(ktr_execfd,nullop);
__strong_alias(sys_fktrace,sys_nosys); /* Syscalls */
__strong_alias(sys_ktrace,sys_nosys);
__strong_alias(sys_utrace,sys_nosys);
int ktrace_on; /* Misc */
__strong_alias(ktruser,enosys);
__strong_alias(ktr_point,nullop);
#endif /* KTRACE */
__weak_alias(device_register, voidop);
__weak_alias(device_register_post_config, voidop);
__weak_alias(spldebug_start, voidop);
__weak_alias(spldebug_stop, voidop);
__weak_alias(machdep_init,nullop);
__weak_alias(pci_chipset_tag_create, eopnotsupp);
__weak_alias(pci_chipset_tag_destroy, voidop);
__weak_alias(bus_space_reserve, eopnotsupp);
__weak_alias(bus_space_reserve_subregion, eopnotsupp);
__weak_alias(bus_space_release, voidop);
__weak_alias(bus_space_reservation_map, eopnotsupp);
__weak_alias(bus_space_reservation_unmap, voidop);
__weak_alias(bus_dma_tag_create, eopnotsupp);
__weak_alias(bus_dma_tag_destroy, voidop);
__weak_alias(bus_space_tag_create, eopnotsupp);
__weak_alias(bus_space_tag_destroy, voidop);
__strict_weak_alias(bus_space_is_equal, default_bus_space_is_equal);
__strict_weak_alias(bus_space_handle_is_equal,
default_bus_space_handle_is_equal);
__weak_alias(userconf_bootinfo, voidop);
__weak_alias(userconf_init, voidop);
__weak_alias(userconf_prompt, voidop);
__weak_alias(kobj_renamespace, nullop);
__weak_alias(interrupt_get_count, nullop);
__weak_alias(interrupt_get_assigned, voidop);
__weak_alias(interrupt_get_available, voidop);
__weak_alias(interrupt_get_devname, voidop);
__weak_alias(interrupt_construct_intrids, nullret);
__weak_alias(interrupt_destruct_intrids, voidop);
__weak_alias(interrupt_distribute, eopnotsupp);
__weak_alias(interrupt_distribute_handler, eopnotsupp);
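/*
 * Illustrative sketch (not part of this file): the weak aliases above
 * only take effect when no other object defines the symbol.  A port or
 * driver overrides a stub simply by providing a strong definition, as
 * in the hypothetical machine-dependent device_register() below, which
 * replaces the weak voidop alias; device_t comes from <sys/device.h>.
 */
#if 0
/* In a port's machine-dependent code; overrides the weak alias. */
void
device_register(device_t dev, void *aux)
{
	/* e.g. remember the boot device so setroot() can find it later */
}
#endif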
/*
* Scheduler activations system calls. These need to remain until libc's
* major version is bumped.
*/
__strong_alias(sys_sa_register,sys_nosys);
__strong_alias(sys_sa_stacks,sys_nosys);
__strong_alias(sys_sa_enable,sys_nosys);
__strong_alias(sys_sa_setconcurrency,sys_nosys);
__strong_alias(sys_sa_yield,sys_nosys);
__strong_alias(sys_sa_preempt,sys_nosys);
__strong_alias(sys_sa_unblockyield,sys_nosys);
/*
* Stubs for compat_netbsd32.
*/
__strong_alias(dosa_register,sys_nosys);
__strong_alias(sa_stacks1,sys_nosys);
/*
* Stubs for drivers. See sys/conf.h.
*/
__strong_alias(devenodev,enodev);
__strong_alias(deveopnotsupp,eopnotsupp);
__strong_alias(devnullop,nullop);
__strong_alias(ttyenodev,enodev);
__strong_alias(ttyvenodev,voidop);
__strong_alias(ttyvnullop,nullop);
/*
* Stubs for architectures that do not support kernel preemption.
*/
#ifndef __HAVE_PREEMPTION
bool
cpu_kpreempt_enter(uintptr_t where, int s)
{
return false;
}
void
cpu_kpreempt_exit(uintptr_t where)
{
}
bool
cpu_kpreempt_disabled(void)
{
return true;
}
#else
# ifndef MULTIPROCESSOR
# error __HAVE_PREEMPTION requires MULTIPROCESSOR
# endif
#endif /* !__HAVE_PREEMPTION */
int
sys_nosys(struct lwp *l, const void *v, register_t *retval)
{
mutex_enter(&proc_lock);
psignal(l->l_proc, SIGSYS);
mutex_exit(&proc_lock);
return ENOSYS;
}
/*
* Unsupported device function (e.g. writing to read-only device).
*/
int
enodev(void)
{
return (ENODEV);
}
/*
* Unconfigured device function; driver not configured.
*/
int
enxio(void)
{
return (ENXIO);
}
/*
* Unsupported ioctl function.
*/
int
enoioctl(void)
{
return (ENOTTY);
}
/*
* Unsupported system function.
* This is used for an otherwise-reasonable operation
* that is not supported by the current system binary.
*/
int
enosys(void)
{
return (ENOSYS);
}
/*
* Return error for operation not supported
* on a specific object or file type.
*/
int
eopnotsupp(void)
{
return (EOPNOTSUPP);
}
/*
* Generic null operation, void return value.
*/
void
voidop(void)
{
}
/*
* Generic null operation, always returns success.
*/
int
nullop(void *v)
{
return (0);
}
/*
* Generic null operation, always returns null.
*/
void *
nullret(void)
{
return (NULL);
}
bool
default_bus_space_handle_is_equal(bus_space_tag_t t,
bus_space_handle_t h1, bus_space_handle_t h2)
{
return memcmp(&h1, &h2, sizeof(h1)) == 0;
}
bool
default_bus_space_is_equal(bus_space_tag_t t1, bus_space_tag_t t2)
{
return memcmp(&t1, &t2, sizeof(t1)) == 0;
}
/* Stubs for architectures with no kernel FPU access. */
__weak_alias(kthread_fpu_enter_md, voidop);
__weak_alias(kthread_fpu_exit_md, voidop);
/* $NetBSD: subr_autoconf.c,v 1.314 2023/07/18 11:57:37 riastradh Exp $ */
/*
* Copyright (c) 1996, 2000 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the
* NetBSD Project. See http://www.NetBSD.org/ for
* information about NetBSD.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* --(license Id: LICENSE.proto,v 1.1 2000/06/13 21:40:26 cgd Exp )--
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratories.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Header: subr_autoconf.c,v 1.12 93/02/01 19:31:48 torek Exp (LBL)
*
* @(#)subr_autoconf.c 8.3 (Berkeley) 5/17/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_autoconf.c,v 1.314 2023/07/18 11:57:37 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "drvctl.h"
#endif
#include <sys/param.h>
#include <sys/device.h>
#include <sys/device_impl.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/kthread.h>
#include <sys/buf.h>
#include <sys/dirent.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/unistd.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/callout.h>
#include <sys/devmon.h>
#include <sys/cpu.h>
#include <sys/sysctl.h>
#include <sys/stdarg.h>
#include <sys/localcount.h>
#include <sys/disk.h>
#include <sys/rndsource.h>
#include <machine/limits.h>
/*
* Autoconfiguration subroutines.
*/
/*
* Device autoconfiguration timings are mixed into the entropy pool.
*/
static krndsource_t rnd_autoconf_source;
/*
* ioconf.c exports exactly two names: cfdata and cfroots. All system
* devices and drivers are found via these tables.
*/
extern struct cfdata cfdata[];
extern const short cfroots[];
/*
* List of all cfdriver structures. We use this to detect duplicates
* when other cfdrivers are loaded.
*/
struct cfdriverlist allcfdrivers = LIST_HEAD_INITIALIZER(&allcfdrivers);
extern struct cfdriver * const cfdriver_list_initial[];
/*
* Initial list of cfattach's.
*/
extern const struct cfattachinit cfattachinit[];
/*
* List of cfdata tables. We always have one such list -- the one
* built statically when the kernel was configured.
*/
struct cftablelist allcftables = TAILQ_HEAD_INITIALIZER(allcftables);
static struct cftable initcftable;
#define ROOT ((device_t)NULL)
struct matchinfo {
cfsubmatch_t fn;
device_t parent;
const int *locs;
void *aux;
struct cfdata *match;
int pri;
};
struct alldevs_foray {
int af_s;
struct devicelist af_garbage;
};
/*
* Internal version of the cfargs structure; all versions are
* canonicalized to this.
*/
struct cfargs_internal {
union {
cfsubmatch_t submatch;/* submatch function (direct config) */
cfsearch_t search; /* search function (indirect config) */
};
const char * iattr; /* interface attribute */
const int * locators; /* locators array */
devhandle_t devhandle; /* devhandle_t (by value) */
};
static char *number(char *, int);
static void mapply(struct matchinfo *, cfdata_t);
static void config_devdelete(device_t);
static void config_devunlink(device_t, struct devicelist *);
static void config_makeroom(int, struct cfdriver *);
static void config_devlink(device_t);
static void config_alldevs_enter(struct alldevs_foray *);
static void config_alldevs_exit(struct alldevs_foray *);
static void config_add_attrib_dict(device_t);
static device_t config_attach_internal(device_t, cfdata_t, void *,
cfprint_t, const struct cfargs_internal *);
static void config_collect_garbage(struct devicelist *);
static void config_dump_garbage(struct devicelist *);
static void pmflock_debug(device_t, const char *, int);
static device_t deviter_next1(deviter_t *);
static void deviter_reinit(deviter_t *);
struct deferred_config {
TAILQ_ENTRY(deferred_config) dc_queue;
device_t dc_dev;
void (*dc_func)(device_t);
};
TAILQ_HEAD(deferred_config_head, deferred_config);
static struct deferred_config_head deferred_config_queue =
TAILQ_HEAD_INITIALIZER(deferred_config_queue);
static struct deferred_config_head interrupt_config_queue =
TAILQ_HEAD_INITIALIZER(interrupt_config_queue);
static int interrupt_config_threads = 8;
static struct deferred_config_head mountroot_config_queue =
TAILQ_HEAD_INITIALIZER(mountroot_config_queue);
static int mountroot_config_threads = 2;
static lwp_t **mountroot_config_lwpids;
static size_t mountroot_config_lwpids_size;
bool root_is_mounted = false;
static void config_process_deferred(struct deferred_config_head *, device_t);
/* Hooks to finalize configuration once all real devices have been found. */
struct finalize_hook {
TAILQ_ENTRY(finalize_hook) f_list;
int (*f_func)(device_t);
device_t f_dev;
};
static TAILQ_HEAD(, finalize_hook) config_finalize_list =
TAILQ_HEAD_INITIALIZER(config_finalize_list);
static int config_finalize_done;
/* list of all devices */
static struct devicelist alldevs = TAILQ_HEAD_INITIALIZER(alldevs);
static kmutex_t alldevs_lock __cacheline_aligned;
static devgen_t alldevs_gen = 1;
static int alldevs_nread = 0;
static int alldevs_nwrite = 0;
static bool alldevs_garbage = false;
static struct devicelist config_pending =
TAILQ_HEAD_INITIALIZER(config_pending);
static kmutex_t config_misc_lock;
static kcondvar_t config_misc_cv;
static bool detachall = false;
#define STREQ(s1, s2) \
(*(s1) == *(s2) && strcmp((s1), (s2)) == 0)
static bool config_initialized = false; /* config_init() has been called. */
static int config_do_twiddle;
static callout_t config_twiddle_ch;
static void sysctl_detach_setup(struct sysctllog **);
int no_devmon_insert(const char *, prop_dictionary_t);
int (*devmon_insert_vec)(const char *, prop_dictionary_t) = no_devmon_insert;
typedef int (*cfdriver_fn)(struct cfdriver *);
static int
frob_cfdrivervec(struct cfdriver * const *cfdriverv,
cfdriver_fn drv_do, cfdriver_fn drv_undo,
const char *style, bool dopanic)
{
void (*pr)(const char *, ...) __printflike(1, 2) =
dopanic ? panic : printf;
int i, error = 0, e2 __diagused;
for (i = 0; cfdriverv[i] != NULL; i++) {
if ((error = drv_do(cfdriverv[i])) != 0) {
pr("configure: `%s' driver %s failed: %d",
cfdriverv[i]->cd_name, style, error);
goto bad;
}
}
KASSERT(error == 0);
return 0;
bad:
printf("\n");
for (i--; i >= 0; i--) {
e2 = drv_undo(cfdriverv[i]);
KASSERT(e2 == 0);
}
return error;
}
typedef int (*cfattach_fn)(const char *, struct cfattach *);
static int
frob_cfattachvec(const struct cfattachinit *cfattachv,
cfattach_fn att_do, cfattach_fn att_undo,
const char *style, bool dopanic)
{
const struct cfattachinit *cfai = NULL;
void (*pr)(const char *, ...) __printflike(1, 2) =
dopanic ? panic : printf;
int j = 0, error = 0, e2 __diagused;
for (cfai = &cfattachv[0]; cfai->cfai_name != NULL; cfai++) {
for (j = 0; cfai->cfai_list[j] != NULL; j++) {
if ((error = att_do(cfai->cfai_name,
cfai->cfai_list[j])) != 0) {
pr("configure: attachment `%s' "
"of `%s' driver %s failed: %d",
cfai->cfai_list[j]->ca_name,
cfai->cfai_name, style, error);
goto bad;
}
}
}
KASSERT(error == 0);
return 0;
bad:
/*
* Roll back in reverse order. It is not clear that the ordering
* matters much, but do it anyway. The nested loops make the
* rollback look a little like integration by parts.
*/
printf("\n");
if (cfai) {
bool last;
for (last = false; last == false; ) {
if (cfai == &cfattachv[0])
last = true;
for (j--; j >= 0; j--) {
e2 = att_undo(cfai->cfai_name,
cfai->cfai_list[j]);
KASSERT(e2 == 0);
}
if (!last) {
cfai--;
for (j = 0; cfai->cfai_list[j] != NULL; j++)
;
}
}
}
return error;
}
/*
* Initialize the autoconfiguration data structures. Normally this
* is done by configure(), but some platforms need to do this very
* early (to e.g. initialize the console).
*/
void
config_init(void)
{
KASSERT(config_initialized == false);
mutex_init(&alldevs_lock, MUTEX_DEFAULT, IPL_VM);
mutex_init(&config_misc_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&config_misc_cv, "cfgmisc");
callout_init(&config_twiddle_ch, CALLOUT_MPSAFE);
frob_cfdrivervec(cfdriver_list_initial,
config_cfdriver_attach, NULL, "bootstrap", true);
frob_cfattachvec(cfattachinit,
config_cfattach_attach, NULL, "bootstrap", true);
initcftable.ct_cfdata = cfdata;
TAILQ_INSERT_TAIL(&allcftables, &initcftable, ct_list);
rnd_attach_source(&rnd_autoconf_source, "autoconf", RND_TYPE_UNKNOWN,
RND_FLAG_COLLECT_TIME);
config_initialized = true;
}
/*
* Init or fini drivers and attachments. Either all or none
* are processed (via rollback). It would be nice if this were
* atomic to outside consumers, but with the current state of
* locking ...
*/
int
config_init_component(struct cfdriver * const *cfdriverv,
const struct cfattachinit *cfattachv, struct cfdata *cfdatav)
{
int error;
KERNEL_LOCK(1, NULL);
if ((error = frob_cfdrivervec(cfdriverv,
config_cfdriver_attach, config_cfdriver_detach, "init", false))!= 0)
goto out;
if ((error = frob_cfattachvec(cfattachv,
config_cfattach_attach, config_cfattach_detach,
"init", false)) != 0) {
frob_cfdrivervec(cfdriverv,
config_cfdriver_detach, NULL, "init rollback", true);
goto out;
}
if ((error = config_cfdata_attach(cfdatav, 1)) != 0) {
frob_cfattachvec(cfattachv,
config_cfattach_detach, NULL, "init rollback", true);
frob_cfdrivervec(cfdriverv,
config_cfdriver_detach, NULL, "init rollback", true);
goto out;
}
/* Success! */
error = 0;
out: KERNEL_UNLOCK_ONE(NULL);
return error;
}
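/*
 * Illustrative sketch (not from the sources): a modular driver typically
 * wires config_init_component()/config_fini_component() into its module
 * command handler so that its cfdriver, cfattach and cfdata tables are
 * attached and rolled back as a unit.  The mydrv_* array names below are
 * hypothetical; real modules usually pass tables generated from their
 * ioconf description.
 */
#if 0
static int
mydrv_modcmd(modcmd_t cmd, void *opaque)
{
	switch (cmd) {
	case MODULE_CMD_INIT:
		return config_init_component(mydrv_cfdrivers,
		    mydrv_cfattachinit, mydrv_cfdata);
	case MODULE_CMD_FINI:
		return config_fini_component(mydrv_cfdrivers,
		    mydrv_cfattachinit, mydrv_cfdata);
	default:
		return ENOTTY;
	}
}
#endif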
int
config_fini_component(struct cfdriver * const *cfdriverv,
const struct cfattachinit *cfattachv, struct cfdata *cfdatav)
{
int error;
KERNEL_LOCK(1, NULL);
if ((error = config_cfdata_detach(cfdatav)) != 0)
goto out;
if ((error = frob_cfattachvec(cfattachv,
config_cfattach_detach, config_cfattach_attach,
"fini", false)) != 0) {
if (config_cfdata_attach(cfdatav, 0) != 0)
panic("config_cfdata fini rollback failed");
goto out;
}
if ((error = frob_cfdrivervec(cfdriverv,
config_cfdriver_detach, config_cfdriver_attach,
"fini", false)) != 0) {
frob_cfattachvec(cfattachv,
config_cfattach_attach, NULL, "fini rollback", true);
if (config_cfdata_attach(cfdatav, 0) != 0)
panic("config_cfdata fini rollback failed");
goto out;
}
/* Success! */
error = 0;
out: KERNEL_UNLOCK_ONE(NULL);
return error;
}
void
config_init_mi(void)
{
if (!config_initialized)
config_init();
sysctl_detach_setup(NULL);
}
void
config_deferred(device_t dev)
{
KASSERT(KERNEL_LOCKED_P());
config_process_deferred(&deferred_config_queue, dev);
config_process_deferred(&interrupt_config_queue, dev);
config_process_deferred(&mountroot_config_queue, dev);
}
static void
config_interrupts_thread(void *cookie)
{
struct deferred_config *dc;
device_t dev;
mutex_enter(&config_misc_lock);
while ((dc = TAILQ_FIRST(&interrupt_config_queue)) != NULL) {
TAILQ_REMOVE(&interrupt_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
dev = dc->dc_dev;
(*dc->dc_func)(dev);
if (!device_pmf_is_registered(dev))
aprint_debug_dev(dev,
"WARNING: power management not supported\n");
config_pending_decr(dev);
kmem_free(dc, sizeof(*dc));
mutex_enter(&config_misc_lock);
}
mutex_exit(&config_misc_lock);
kthread_exit(0);
}
void
config_create_interruptthreads(void)
{
int i;
for (i = 0; i < interrupt_config_threads; i++) {
(void)kthread_create(PRI_NONE, 0/*XXXSMP */, NULL,
config_interrupts_thread, NULL, NULL, "configintr");
}
}
static void
config_mountroot_thread(void *cookie)
{
struct deferred_config *dc;
mutex_enter(&config_misc_lock);
while ((dc = TAILQ_FIRST(&mountroot_config_queue)) != NULL) {
TAILQ_REMOVE(&mountroot_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
(*dc->dc_func)(dc->dc_dev);
kmem_free(dc, sizeof(*dc));
mutex_enter(&config_misc_lock);
}
mutex_exit(&config_misc_lock);
kthread_exit(0);
}
void
config_create_mountrootthreads(void)
{
int i;
if (!root_is_mounted)
root_is_mounted = true;
mountroot_config_lwpids_size = sizeof(mountroot_config_lwpids) *
mountroot_config_threads;
mountroot_config_lwpids = kmem_alloc(mountroot_config_lwpids_size,
KM_NOSLEEP);
KASSERT(mountroot_config_lwpids);
for (i = 0; i < mountroot_config_threads; i++) {
mountroot_config_lwpids[i] = 0;
(void)kthread_create(PRI_NONE, KTHREAD_MUSTJOIN/* XXXSMP */,
NULL, config_mountroot_thread, NULL,
&mountroot_config_lwpids[i],
"configroot");
}
}
void
config_finalize_mountroot(void)
{
int i, error;
for (i = 0; i < mountroot_config_threads; i++) {
if (mountroot_config_lwpids[i] == 0)
continue;
error = kthread_join(mountroot_config_lwpids[i]);
if (error)
printf("%s: thread %x joined with error %d\n",
__func__, i, error);
}
kmem_free(mountroot_config_lwpids, mountroot_config_lwpids_size);
}
/*
* Announce device attach/detach to userland listeners.
*/
int
no_devmon_insert(const char *name, prop_dictionary_t p)
{
return ENODEV;
}
static void
devmon_report_device(device_t dev, bool isattach)
{
prop_dictionary_t ev, dict = device_properties(dev);
const char *parent;
const char *what;
const char *where;
device_t pdev = device_parent(dev);
/* If currently no drvctl device, just return */
if (devmon_insert_vec == no_devmon_insert)
return;
ev = prop_dictionary_create();
if (ev == NULL)
return;
what = (isattach ? "device-attach" : "device-detach");
parent = (pdev == NULL ? "root" : device_xname(pdev));
if (prop_dictionary_get_string(dict, "location", &where)) {
prop_dictionary_set_string(ev, "location", where);
aprint_debug("ev: %s %s at %s in [%s]\n",
what, device_xname(dev), parent, where);
}
if (!prop_dictionary_set_string(ev, "device", device_xname(dev)) ||
!prop_dictionary_set_string(ev, "parent", parent)) {
prop_object_release(ev);
return;
}
if ((*devmon_insert_vec)(what, ev) != 0)
prop_object_release(ev);
}
/*
* Add a cfdriver to the system.
*/
int
config_cfdriver_attach(struct cfdriver *cd)
{
struct cfdriver *lcd;
/* Make sure this driver isn't already in the system. */
LIST_FOREACH(lcd, &allcfdrivers, cd_list) {
if (STREQ(lcd->cd_name, cd->cd_name))
return EEXIST;
}
LIST_INIT(&cd->cd_attach);
LIST_INSERT_HEAD(&allcfdrivers, cd, cd_list);
return 0;
}
/*
* Remove a cfdriver from the system.
*/
int
config_cfdriver_detach(struct cfdriver *cd)
{
struct alldevs_foray af;
int i, rc = 0;
config_alldevs_enter(&af);
/* Make sure there are no active instances. */
for (i = 0; i < cd->cd_ndevs; i++) {
if (cd->cd_devs[i] != NULL) {
rc = EBUSY;
break;
}
}
config_alldevs_exit(&af);
if (rc != 0)
return rc;
/* ...and no attachments loaded. */
if (LIST_EMPTY(&cd->cd_attach) == 0)
return EBUSY;
LIST_REMOVE(cd, cd_list);
KASSERT(cd->cd_devs == NULL);
return 0;
}
/*
* Look up a cfdriver by name.
*/
struct cfdriver *
config_cfdriver_lookup(const char *name)
{
struct cfdriver *cd;
LIST_FOREACH(cd, &allcfdrivers, cd_list) {
if (STREQ(cd->cd_name, name))
return cd;
}
return NULL;
}
/*
* Add a cfattach to the specified driver.
*/
int
config_cfattach_attach(const char *driver, struct cfattach *ca)
{
struct cfattach *lca;
struct cfdriver *cd;
cd = config_cfdriver_lookup(driver);
if (cd == NULL)
return ESRCH;
/* Make sure this attachment isn't already on this driver. */
LIST_FOREACH(lca, &cd->cd_attach, ca_list) {
if (STREQ(lca->ca_name, ca->ca_name))
return EEXIST;
}
LIST_INSERT_HEAD(&cd->cd_attach, ca, ca_list);
return 0;
}
/*
* Remove a cfattach from the specified driver.
*/
int
config_cfattach_detach(const char *driver, struct cfattach *ca)
{
struct alldevs_foray af;
struct cfdriver *cd;
device_t dev;
int i, rc = 0;
cd = config_cfdriver_lookup(driver);
if (cd == NULL)
return ESRCH;
config_alldevs_enter(&af);
/* Make sure there are no active instances. */
for (i = 0; i < cd->cd_ndevs; i++) {
if ((dev = cd->cd_devs[i]) == NULL)
continue;
if (dev->dv_cfattach == ca) {
rc = EBUSY;
break;
}
}
config_alldevs_exit(&af);
if (rc != 0)
return rc;
LIST_REMOVE(ca, ca_list);
return 0;
}
/*
* Look up a cfattach by name.
*/
static struct cfattach *
config_cfattach_lookup_cd(struct cfdriver *cd, const char *atname)
{
struct cfattach *ca;
LIST_FOREACH(ca, &cd->cd_attach, ca_list) {
if (STREQ(ca->ca_name, atname))
return ca;
}
return NULL;
}
/*
* Look up a cfattach by driver/attachment name.
*/
struct cfattach *
config_cfattach_lookup(const char *name, const char *atname)
{
struct cfdriver *cd;
cd = config_cfdriver_lookup(name);
if (cd == NULL)
return NULL;
return config_cfattach_lookup_cd(cd, atname);
}
/*
* Apply the matching function and choose the best. This is used
* a few times and we want to keep the code small.
*/
static void
mapply(struct matchinfo *m, cfdata_t cf)
{
int pri;
if (m->fn != NULL) {
pri = (*m->fn)(m->parent, cf, m->locs, m->aux);
} else {
pri = config_match(m->parent, cf, m->aux);
}
if (pri > m->pri) {
m->match = cf;
m->pri = pri;
}
}
int
config_stdsubmatch(device_t parent, cfdata_t cf, const int *locs, void *aux)
{
const struct cfiattrdata *ci;
const struct cflocdesc *cl;
int nlocs, i;
ci = cfiattr_lookup(cfdata_ifattr(cf), parent->dv_cfdriver);
KASSERT(ci);
nlocs = ci->ci_loclen;
KASSERT(!nlocs || locs);
for (i = 0; i < nlocs; i++) {
cl = &ci->ci_locdesc[i];
if (cl->cld_defaultstr != NULL &&
cf->cf_loc[i] == cl->cld_default)
continue;
if (cf->cf_loc[i] == locs[i])
continue;
return 0;
}
return config_match(parent, cf, aux);
}
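/*
 * Illustrative sketch of how config_stdsubmatch() is typically wired up:
 * an indirect-config bus driver passes it as the submatch hook when it
 * announces a child, so the locators from config(5) are compared against
 * the ones the bus discovered.  The "mybus" names, attachment-args
 * structure and locator array below are hypothetical, not part of this
 * file:
 *
 *	struct mybus_attach_args maa = { ... };
 *	int locs[MYBUSCF_NLOCS] = { ... };
 *
 *	config_found(self, &maa, mybus_print,
 *	    CFARGS(.submatch = config_stdsubmatch,
 *		   .iattr = "mybus",
 *		   .locators = locs));
 */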
/*
* Helper function: check whether the driver supports the interface attribute
* and return its descriptor structure.
*/
static const struct cfiattrdata *
cfdriver_get_iattr(const struct cfdriver *cd, const char *ia)
{
const struct cfiattrdata * const *cpp;
if (cd->cd_attrs == NULL)
return 0;
for (cpp = cd->cd_attrs; *cpp; cpp++) {
if (STREQ((*cpp)->ci_name, ia)) {
/* Match. */
return *cpp;
}
}
return 0;
}
static int __diagused
cfdriver_iattr_count(const struct cfdriver *cd)
{
const struct cfiattrdata * const *cpp;
int i;
if (cd->cd_attrs == NULL)
return 0;
for (i = 0, cpp = cd->cd_attrs; *cpp; cpp++) {
i++;
}
return i;
}
/*
* Lookup an interface attribute description by name.
* If the driver is given, consider only its supported attributes.
*/
const struct cfiattrdata *
cfiattr_lookup(const char *name, const struct cfdriver *cd)
{
const struct cfdriver *d;
const struct cfiattrdata *ia;
if (cd)
return cfdriver_get_iattr(cd, name);
LIST_FOREACH(d, &allcfdrivers, cd_list) {
ia = cfdriver_get_iattr(d, name);
if (ia)
return ia;
}
return 0;
}
/*
* Determine if `parent' is a potential parent for a device spec based
* on `cfp'.
*/
static int
cfparent_match(const device_t parent, const struct cfparent *cfp)
{
struct cfdriver *pcd;
/* We don't match root nodes here. */
if (cfp == NULL)
return 0;
pcd = parent->dv_cfdriver;
KASSERT(pcd != NULL);
/*
* First, ensure this parent has the correct interface
* attribute.
*/
if (!cfdriver_get_iattr(pcd, cfp->cfp_iattr))
return 0;
/*
* If no specific parent device instance was specified (i.e.
* we're attaching to the attribute only), we're done!
*/
if (cfp->cfp_parent == NULL)
return 1;
/*
* Check the parent device's name.
*/
if (STREQ(pcd->cd_name, cfp->cfp_parent) == 0)
return 0; /* not the same parent */
/*
* Make sure the unit number matches.
*/
if (cfp->cfp_unit == DVUNIT_ANY || /* wildcard */
cfp->cfp_unit == parent->dv_unit)
return 1;
/* Unit numbers don't match. */
return 0;
}
/*
* Helper for config_cfdata_attach(): check all devices to see whether any of
* them could be a parent for an attachment in the config data table passed,
* and rescan.
*/
static void
rescan_with_cfdata(const struct cfdata *cf)
{
device_t d;
const struct cfdata *cf1;
deviter_t di;
KASSERT(KERNEL_LOCKED_P());
/*
* "alldevs" is likely longer than a modules's cfdata, so make it
* the outer loop.
*/
for (d = deviter_first(&di, 0); d != NULL; d = deviter_next(&di)) {
if (!(d->dv_cfattach->ca_rescan))
continue;
for (cf1 = cf; cf1->cf_name; cf1++) {
if (!cfparent_match(d, cf1->cf_pspec))
continue;
(*d->dv_cfattach->ca_rescan)(d,
cfdata_ifattr(cf1), cf1->cf_loc);
config_deferred(d);
}
}
deviter_release(&di);
}
/*
* Attach a supplemental config data table and rescan potential
* parent devices if required.
*/
int
config_cfdata_attach(cfdata_t cf, int scannow)
{
struct cftable *ct;
KERNEL_LOCK(1, NULL);
ct = kmem_alloc(sizeof(*ct), KM_SLEEP);
ct->ct_cfdata = cf;
TAILQ_INSERT_TAIL(&allcftables, ct, ct_list);
if (scannow)
rescan_with_cfdata(cf);
KERNEL_UNLOCK_ONE(NULL);
return 0;
}
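/*
 * Illustrative sketch of a caller: a loadable driver module may hand its
 * config data table to the kernel and ask for an immediate rescan so that
 * already-present parents get a chance to attach the new children.  The
 * table name below is hypothetical (such tables are normally generated
 * from the module's config(5) fragment, and many modules go through
 * config_init_component() instead of calling this directly):
 *
 *	extern struct cfdata mydrv_cfdata[];
 *
 *	error = config_cfdata_attach(mydrv_cfdata, 1);
 */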
/*
* Helper for config_cfdata_detach: check whether a device is
* found through any attachment in the config data table.
*/
static int
dev_in_cfdata(device_t d, cfdata_t cf)
{
const struct cfdata *cf1;
for (cf1 = cf; cf1->cf_name; cf1++)
if (d->dv_cfdata == cf1)
return 1;
return 0;
}
/*
* Detach a supplemental config data table. Before removing it, detach all
* devices found through that table (and which thus keep references to it).
*/
int
config_cfdata_detach(cfdata_t cf)
{
device_t d;
int error = 0;
struct cftable *ct;
deviter_t di;
KERNEL_LOCK(1, NULL);
for (d = deviter_first(&di, DEVITER_F_RW); d != NULL;
d = deviter_next(&di)) {
if (!dev_in_cfdata(d, cf))
continue;
if ((error = config_detach(d, 0)) != 0)
break;
}
deviter_release(&di);
if (error) {
aprint_error_dev(d, "unable to detach instance\n");
goto out;
}
TAILQ_FOREACH(ct, &allcftables, ct_list) {
if (ct->ct_cfdata == cf) {
TAILQ_REMOVE(&allcftables, ct, ct_list);
kmem_free(ct, sizeof(*ct));
error = 0;
goto out;
}
}
/* not found -- shouldn't happen */
error = EINVAL;
out: KERNEL_UNLOCK_ONE(NULL);
return error;
}
/*
* Invoke the "match" routine for a cfdata entry on behalf of
* an external caller, usually a direct config "submatch" routine.
*/
int
config_match(device_t parent, cfdata_t cf, void *aux)
{
struct cfattach *ca;
KASSERT(KERNEL_LOCKED_P());
ca = config_cfattach_lookup(cf->cf_name, cf->cf_atname);
if (ca == NULL) {
/* No attachment for this entry, oh well. */
return 0;
}
return (*ca->ca_match)(parent, cf, aux);
}
/*
* Invoke the "probe" routine for a cfdata entry on behalf of
* an external caller, usually an indirect config "search" routine.
*/
int
config_probe(device_t parent, cfdata_t cf, void *aux)
{
/*
* This is currently a synonym for config_match(), but this
* is an implementation detail; "match" and "probe" routines
* have different behaviors.
*
* XXX config_probe() should return a bool, because there is
* XXX no match score for probe -- it's either there or it's
* XXX not, but some ports abuse the return value as a way
* XXX to attach "critical" devices before "non-critical"
* XXX devices.
*/
return config_match(parent, cf, aux);
}
static struct cfargs_internal *
cfargs_canonicalize(const struct cfargs * const cfargs,
struct cfargs_internal * const store)
{
struct cfargs_internal *args = store;
memset(args, 0, sizeof(*args));
/* If none specified, the all-NULL pointers are good. */
if (cfargs == NULL) {
return args;
}
/*
* Only one arguments version is recognized at this time.
*/
if (cfargs->cfargs_version != CFARGS_VERSION) {
panic("cfargs_canonicalize: unknown version %lu\n",
(unsigned long)cfargs->cfargs_version);
}
/*
* submatch and search are mutually-exclusive.
*/
if (cfargs->submatch != NULL && cfargs->search != NULL) {
panic("cfargs_canonicalize: submatch and search are "
"mutually-exclusive");
}
if (cfargs->submatch != NULL) {
args->submatch = cfargs->submatch;
} else if (cfargs->search != NULL) {
args->search = cfargs->search;
}
args->iattr = cfargs->iattr;
args->locators = cfargs->locators;
args->devhandle = cfargs->devhandle;
return args;
}
/*
* Iterate over all potential children of some device, calling the given
* function (default being the child's match function) for each one.
* Nonzero returns are matches; the highest value returned is considered
* the best match. Return the `found child' if we got a match, or NULL
* otherwise. The `aux' pointer is simply passed on through.
*
* Note that this function is designed so that it can be used to apply
* an arbitrary function to all potential children (its return value
* can be ignored).
*/
static cfdata_t
config_search_internal(device_t parent, void *aux,
const struct cfargs_internal * const args)
{
struct cftable *ct;
cfdata_t cf;
struct matchinfo m;
KASSERT(config_initialized);
KASSERTMSG((!args->iattr ||
cfdriver_get_iattr(parent->dv_cfdriver, args->iattr)),
"%s searched for child at interface attribute %s,"
" but device %s(4) has no such interface attribute in config(5)",
device_xname(parent), args->iattr,
parent->dv_cfdriver->cd_name);
KASSERTMSG((args->iattr ||
cfdriver_iattr_count(parent->dv_cfdriver) < 2),
"%s searched for child without interface attribute,"
" needed to disambiguate among the %d declared for in %s(4)"
" in config(5)",
device_xname(parent),
cfdriver_iattr_count(parent->dv_cfdriver),
parent->dv_cfdriver->cd_name);
m.fn = args->submatch; /* N.B. union */
m.parent = parent;
m.locs = args->locators;
m.aux = aux;
m.match = NULL;
m.pri = 0;
TAILQ_FOREACH(ct, &allcftables, ct_list) {
for (cf = ct->ct_cfdata; cf->cf_name; cf++) {
/* We don't match root nodes here. */
if (!cf->cf_pspec)
continue;
/*
* Skip cf if no longer eligible, otherwise scan
* through parents for one matching `parent', and
* try match function.
*/
if (cf->cf_fstate == FSTATE_FOUND)
continue;
if (cf->cf_fstate == FSTATE_DNOTFOUND ||
cf->cf_fstate == FSTATE_DSTAR)
continue;
/*
* If an interface attribute was specified,
* consider only children which attach to
* that attribute.
*/
if (args->iattr != NULL &&
!STREQ(args->iattr, cfdata_ifattr(cf)))
continue;
if (cfparent_match(parent, cf->cf_pspec))
mapply(&m, cf);
}
}
rnd_add_uint32(&rnd_autoconf_source, 0);
return m.match;
}
cfdata_t
config_search(device_t parent, void *aux, const struct cfargs *cfargs)
{
cfdata_t cf;
struct cfargs_internal store;
cf = config_search_internal(parent, aux,
cfargs_canonicalize(cfargs, &store));
return cf;
}
/*
* Find the given root device.
* This is much like config_search, but there is no parent.
* Don't bother with multiple cfdata tables; the root node
* must always be in the initial table.
*/
cfdata_t
config_rootsearch(cfsubmatch_t fn, const char *rootname, void *aux)
{
cfdata_t cf;
const short *p;
struct matchinfo m;
m.fn = fn;
m.parent = ROOT;
m.aux = aux;
m.match = NULL;
m.pri = 0;
m.locs = 0;
/*
* Look at root entries for matching name. We do not bother
* with found-state here since only one root should ever be
* searched (and it must be done first).
*/
for (p = cfroots; *p >= 0; p++) {
cf = &cfdata[*p];
if (strcmp(cf->cf_name, rootname) == 0)
mapply(&m, cf);
}
return m.match;
}
static const char * const msgs[] = {
[QUIET] = "",
[UNCONF] = " not configured\n",
[UNSUPP] = " unsupported\n",
};
/*
* The given `aux' argument describes a device that has been found
* on the given parent, but not necessarily configured. Locate the
* configuration data for that device (using the submatch function
* provided, or using candidates' cd_match configuration driver
* functions) and attach it, and return its device_t. If the device was
* not configured, call the given `print' function and return NULL.
*/
device_t
config_found_acquire(device_t parent, void *aux, cfprint_t print,
const struct cfargs * const cfargs)
{
cfdata_t cf;
struct cfargs_internal store;
const struct cfargs_internal * const args =
cfargs_canonicalize(cfargs, &store);
device_t dev;
KERNEL_LOCK(1, NULL);
cf = config_search_internal(parent, aux, args);
if (cf != NULL) {
dev = config_attach_internal(parent, cf, aux, print, args);
goto out;
}
if (print) {
if (config_do_twiddle && cold)
twiddle();
const int pret = (*print)(aux, device_xname(parent));
KASSERT(pret >= 0);
KASSERT(pret < __arraycount(msgs));
KASSERT(msgs[pret] != NULL);
aprint_normal("%s", msgs[pret]);
}
dev = NULL;
out: KERNEL_UNLOCK_ONE(NULL);
return dev;
}
/*
* config_found(parent, aux, print, cfargs)
*
* Legacy entry point for callers whose use of the returned
* device_t is not delimited by device_release.
*
* The caller is required to hold the kernel lock as a fragile
* defence against races.
*
* Callers should ignore the return value or be converted to
* config_found_acquire with a matching device_release once they
* have finished with the returned device_t.
*/
device_t
config_found(device_t parent, void *aux, cfprint_t print,
const struct cfargs * const cfargs)
{
device_t dev;
KASSERT(KERNEL_LOCKED_P());
dev = config_found_acquire(parent, aux, print, cfargs);
if (dev == NULL)
return NULL;
device_release(dev);
return dev;
}
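/*
 * Illustrative sketch of a direct-config caller: a bus driver's attach
 * routine fills in an attachment-args structure for each child it finds
 * and announces it here; autoconfiguration then matches and attaches the
 * child, or prints "not configured".  The "mybus" names below are
 * hypothetical:
 *
 *	struct mybus_attach_args maa;
 *
 *	maa.maa_addr = addr;
 *	(void)config_found(self, &maa, mybus_print,
 *	    CFARGS(.iattr = "mybus"));
 */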
/*
* As above, but for root devices.
*/
device_t
config_rootfound(const char *rootname, void *aux)
{
cfdata_t cf;
device_t dev = NULL;
KERNEL_LOCK(1, NULL);
if ((cf = config_rootsearch(NULL, rootname, aux)) != NULL)
dev = config_attach(ROOT, cf, aux, NULL, CFARGS_NONE);
else
aprint_error("root device %s not configured\n", rootname);
KERNEL_UNLOCK_ONE(NULL);
return dev;
}
/* just like sprintf(buf, "%d") except that it works from the end */
static char *
number(char *ep, int n)
{
*--ep = 0;
while (n >= 10) {
*--ep = (n % 10) + '0';
n /= 10;
}
*--ep = n + '0';
return ep;
}
/*
* Expand the size of the cd_devs array if necessary.
*
* The caller must hold alldevs_lock. config_makeroom() may release and
* re-acquire alldevs_lock, so callers should re-check conditions such
* as alldevs_nwrite == 0 and alldevs_nread == 0 when config_makeroom()
* returns.
*/
static void
config_makeroom(int n, struct cfdriver *cd)
{
int ondevs, nndevs;
device_t *osp, *nsp;
KASSERT(mutex_owned(&alldevs_lock));
alldevs_nwrite++;
/* XXX arithmetic overflow */
for (nndevs = MAX(4, cd->cd_ndevs); nndevs <= n; nndevs += nndevs)
;
while (n >= cd->cd_ndevs) {
/*
* Need to expand the array.
*/
ondevs = cd->cd_ndevs;
osp = cd->cd_devs;
/*
* Release alldevs_lock around allocation, which may
* sleep.
*/
mutex_exit(&alldevs_lock);
nsp = kmem_alloc(sizeof(device_t) * nndevs, KM_SLEEP);
mutex_enter(&alldevs_lock);
/*
* If another thread moved the array while we did
* not hold alldevs_lock, try again.
*/
if (cd->cd_devs != osp || cd->cd_ndevs != ondevs) {
mutex_exit(&alldevs_lock);
kmem_free(nsp, sizeof(device_t) * nndevs);
mutex_enter(&alldevs_lock);
continue;
}
memset(nsp + ondevs, 0, sizeof(device_t) * (nndevs - ondevs));
if (ondevs != 0)
memcpy(nsp, cd->cd_devs, sizeof(device_t) * ondevs);
cd->cd_ndevs = nndevs;
cd->cd_devs = nsp;
if (ondevs != 0) {
mutex_exit(&alldevs_lock);
kmem_free(osp, sizeof(device_t) * ondevs);
mutex_enter(&alldevs_lock);
}
}
KASSERT(mutex_owned(&alldevs_lock));
alldevs_nwrite--;
}
/*
* Put dev into the devices list.
*/
static void
config_devlink(device_t dev)
{
mutex_enter(&alldevs_lock);
KASSERT(device_cfdriver(dev)->cd_devs[dev->dv_unit] == dev);
dev->dv_add_gen = alldevs_gen;
/* It is safe to add a device to the tail of the list while
* readers and writers are in the list.
*/
TAILQ_INSERT_TAIL(&alldevs, dev, dv_list);
mutex_exit(&alldevs_lock);
}
static void
config_devfree(device_t dev)
{
KASSERT(dev->dv_flags & DVF_PRIV_ALLOC);
KASSERTMSG(dev->dv_pending == 0, "%d", dev->dv_pending);
if (dev->dv_cfattach->ca_devsize > 0)
kmem_free(dev->dv_private, dev->dv_cfattach->ca_devsize);
kmem_free(dev, sizeof(*dev));
}
/*
* Caller must hold alldevs_lock.
*/
static void
config_devunlink(device_t dev, struct devicelist *garbage)
{
struct device_garbage *dg = &dev->dv_garbage;
cfdriver_t cd = device_cfdriver(dev);
int i;
KASSERT(mutex_owned(&alldevs_lock));
KASSERTMSG(dev->dv_pending == 0, "%d", dev->dv_pending);
/* Unlink from device list. Link to garbage list. */
TAILQ_REMOVE(&alldevs, dev, dv_list);
TAILQ_INSERT_TAIL(garbage, dev, dv_list);
/* Remove from cfdriver's array. */
cd->cd_devs[dev->dv_unit] = NULL;
/*
* If the device now has no units in use, unlink its softc array.
*/
for (i = 0; i < cd->cd_ndevs; i++) {
if (cd->cd_devs[i] != NULL)
break;
}
/* Nothing found. Unlink, now. Deallocate, later. */
if (i == cd->cd_ndevs) {
dg->dg_ndevs = cd->cd_ndevs;
dg->dg_devs = cd->cd_devs;
cd->cd_devs = NULL;
cd->cd_ndevs = 0;
}
}
static void
config_devdelete(device_t dev)
{
struct device_garbage *dg = &dev->dv_garbage;
device_lock_t dvl = device_getlock(dev);
KASSERTMSG(dev->dv_pending == 0, "%d", dev->dv_pending);
if (dg->dg_devs != NULL)
kmem_free(dg->dg_devs, sizeof(device_t) * dg->dg_ndevs);
localcount_fini(dev->dv_localcount);
kmem_free(dev->dv_localcount, sizeof(*dev->dv_localcount));
cv_destroy(&dvl->dvl_cv);
mutex_destroy(&dvl->dvl_mtx);
KASSERT(dev->dv_properties != NULL);
prop_object_release(dev->dv_properties);
if (dev->dv_activity_handlers)
panic("%s with registered handlers", __func__); if (dev->dv_locators) { size_t amount = *--dev->dv_locators;
kmem_free(dev->dv_locators, amount);
}
config_devfree(dev);
}
static int
config_unit_nextfree(cfdriver_t cd, cfdata_t cf)
{
int unit = cf->cf_unit;
KASSERT(mutex_owned(&alldevs_lock));
if (unit < 0)
return -1;
if (cf->cf_fstate == FSTATE_STAR) {
for (; unit < cd->cd_ndevs; unit++)
if (cd->cd_devs[unit] == NULL)
break;
/*
* unit is now the unit of the first NULL device pointer,
* or max(cd->cd_ndevs,cf->cf_unit).
*/
} else {
if (unit < cd->cd_ndevs && cd->cd_devs[unit] != NULL)
unit = -1;
}
return unit;
}
static int
config_unit_alloc(device_t dev, cfdriver_t cd, cfdata_t cf)
{
struct alldevs_foray af;
int unit;
config_alldevs_enter(&af);
for (;;) {
unit = config_unit_nextfree(cd, cf);
if (unit == -1)
break;
if (unit < cd->cd_ndevs) {
cd->cd_devs[unit] = dev;
dev->dv_unit = unit;
break;
}
config_makeroom(unit, cd);
}
config_alldevs_exit(&af);
return unit;
}
static device_t
config_devalloc(const device_t parent, const cfdata_t cf,
const struct cfargs_internal * const args)
{
cfdriver_t cd;
cfattach_t ca;
size_t lname, lunit;
const char *xunit;
int myunit;
char num[10];
device_t dev;
void *dev_private;
const struct cfiattrdata *ia;
device_lock_t dvl;
cd = config_cfdriver_lookup(cf->cf_name);
if (cd == NULL)
return NULL;
ca = config_cfattach_lookup_cd(cd, cf->cf_atname);
if (ca == NULL)
return NULL;
/* get memory for all device vars */
KASSERT(ca->ca_flags & DVF_PRIV_ALLOC);
if (ca->ca_devsize > 0) {
dev_private = kmem_zalloc(ca->ca_devsize, KM_SLEEP);
} else {
dev_private = NULL;
}
dev = kmem_zalloc(sizeof(*dev), KM_SLEEP);
dev->dv_handle = args->devhandle;
dev->dv_class = cd->cd_class;
dev->dv_cfdata = cf;
dev->dv_cfdriver = cd;
dev->dv_cfattach = ca;
dev->dv_activity_count = 0;
dev->dv_activity_handlers = NULL;
dev->dv_private = dev_private;
dev->dv_flags = ca->ca_flags; /* inherit flags from class */
dev->dv_attaching = curlwp;
myunit = config_unit_alloc(dev, cd, cf);
if (myunit == -1) {
config_devfree(dev);
return NULL;
}
/* compute length of name and decimal expansion of unit number */
lname = strlen(cd->cd_name);
xunit = number(&num[sizeof(num)], myunit);
lunit = &num[sizeof(num)] - xunit;
if (lname + lunit > sizeof(dev->dv_xname))
panic("config_devalloc: device name too long");
dvl = device_getlock(dev);
mutex_init(&dvl->dvl_mtx, MUTEX_DEFAULT, IPL_NONE);
cv_init(&dvl->dvl_cv, "pmfsusp");
memcpy(dev->dv_xname, cd->cd_name, lname);
memcpy(dev->dv_xname + lname, xunit, lunit);
dev->dv_parent = parent;
if (parent != NULL)
dev->dv_depth = parent->dv_depth + 1;
else
dev->dv_depth = 0;
dev->dv_flags |= DVF_ACTIVE; /* always initially active */
if (args->locators) {
KASSERT(parent);	/* no locators at root */
ia = cfiattr_lookup(cfdata_ifattr(cf), parent->dv_cfdriver);
dev->dv_locators =
kmem_alloc(sizeof(int) * (ia->ci_loclen + 1), KM_SLEEP);
*dev->dv_locators++ = sizeof(int) * (ia->ci_loclen + 1);
memcpy(dev->dv_locators, args->locators,
sizeof(int) * ia->ci_loclen);
}
dev->dv_properties = prop_dictionary_create();
KASSERT(dev->dv_properties != NULL);
prop_dictionary_set_string_nocopy(dev->dv_properties,
"device-driver", dev->dv_cfdriver->cd_name);
prop_dictionary_set_uint16(dev->dv_properties,
"device-unit", dev->dv_unit);
if (parent != NULL) {
prop_dictionary_set_string(dev->dv_properties,
"device-parent", device_xname(parent));
}
dev->dv_localcount = kmem_zalloc(sizeof(*dev->dv_localcount),
KM_SLEEP);
localcount_init(dev->dv_localcount);
if (dev->dv_cfdriver->cd_attrs != NULL)
config_add_attrib_dict(dev);
return dev;
}
/*
* Create an array of device attach attributes and add it
* to the device's dv_properties dictionary.
*
* <key>interface-attributes</key>
* <array>
* <dict>
* <key>attribute-name</key>
* <string>foo</string>
* <key>locators</key>
* <array>
* <dict>
* <key>loc-name</key>
* <string>foo-loc1</string>
* </dict>
* <dict>
* <key>loc-name</key>
* <string>foo-loc2</string>
* <key>default</key>
* <string>foo-loc2-default</string>
* </dict>
* ...
* </array>
* </dict>
* ...
* </array>
*/
static void
config_add_attrib_dict(device_t dev)
{
int i, j;
const struct cfiattrdata *ci;
prop_dictionary_t attr_dict, loc_dict;
prop_array_t attr_array, loc_array;
if ((attr_array = prop_array_create()) == NULL)
return;
for (i = 0; ; i++) {
if ((ci = dev->dv_cfdriver->cd_attrs[i]) == NULL)
break;
if ((attr_dict = prop_dictionary_create()) == NULL)
break;
prop_dictionary_set_string_nocopy(attr_dict, "attribute-name",
ci->ci_name);
/* Create an array of the locator names and defaults */
if (ci->ci_loclen != 0 &&
(loc_array = prop_array_create()) != NULL) {
for (j = 0; j < ci->ci_loclen; j++) {
loc_dict = prop_dictionary_create();
if (loc_dict == NULL)
continue;
prop_dictionary_set_string_nocopy(loc_dict,
"loc-name", ci->ci_locdesc[j].cld_name);
if (ci->ci_locdesc[j].cld_defaultstr != NULL)
prop_dictionary_set_string_nocopy(
loc_dict, "default",
ci->ci_locdesc[j].cld_defaultstr);
prop_array_set(loc_array, j, loc_dict);
prop_object_release(loc_dict);
}
prop_dictionary_set_and_rel(attr_dict, "locators",
loc_array);
}
prop_array_add(attr_array, attr_dict);
prop_object_release(attr_dict);
}
if (i == 0)
prop_object_release(attr_array);
else
prop_dictionary_set_and_rel(dev->dv_properties,
"interface-attributes", attr_array);
return;
}
/*
* Attach a found device.
*
* Returns the device referenced, to be released with device_release.
*/
static device_t
config_attach_internal(device_t parent, cfdata_t cf, void *aux, cfprint_t print,
const struct cfargs_internal * const args)
{
device_t dev;
struct cftable *ct;
const char *drvname;
bool deferred;
KASSERT(KERNEL_LOCKED_P());
dev = config_devalloc(parent, cf, args);
if (!dev)
panic("config_attach: allocation of device softc failed");
/* XXX redundant - see below? */
if (cf->cf_fstate != FSTATE_STAR) {
KASSERT(cf->cf_fstate == FSTATE_NOTFOUND);
cf->cf_fstate = FSTATE_FOUND;
}
config_devlink(dev);
if (config_do_twiddle && cold)
twiddle();
else
aprint_naive("Found ");
/*
* We want the next two printfs for normal, verbose, and quiet,
* but not silent (in which case, we're twiddling, instead).
*/
if (parent == ROOT) {
aprint_naive("%s (root)", device_xname(dev));
aprint_normal("%s (root)", device_xname(dev));
} else {
aprint_naive("%s at %s", device_xname(dev),
device_xname(parent));
aprint_normal("%s at %s", device_xname(dev),
device_xname(parent));
if (print)
(void) (*print)(aux, NULL);
}
/*
* Before attaching, clobber any unfound devices that are
* otherwise identical.
* XXX code above is redundant?
*/
drvname = dev->dv_cfdriver->cd_name;
TAILQ_FOREACH(ct, &allcftables, ct_list) {
for (cf = ct->ct_cfdata; cf->cf_name; cf++) {
if (STREQ(cf->cf_name, drvname) &&
cf->cf_unit == dev->dv_unit) {
if (cf->cf_fstate == FSTATE_NOTFOUND)
cf->cf_fstate = FSTATE_FOUND;
}
}
}
device_register(dev, aux);
/* Let userland know */
devmon_report_device(dev, true);
/*
* Prevent detach until the driver's attach function, and all
* deferred actions, have finished.
*/
config_pending_incr(dev);
/*
* Prevent concurrent detach from destroying the device_t until
* the caller has released the device.
*/
device_acquire(dev);
/* Call the driver's attach function. */
(*dev->dv_cfattach->ca_attach)(parent, dev, aux);
/*
* Allow other threads to acquire references to the device now
* that the driver's attach function is done.
*/
mutex_enter(&config_misc_lock);
KASSERT(dev->dv_attaching == curlwp);
dev->dv_attaching = NULL;
cv_broadcast(&config_misc_cv);
mutex_exit(&config_misc_lock);
/*
* Synchronous parts of attach are done. Allow detach, unless
* the driver's attach function scheduled deferred actions.
*/
config_pending_decr(dev);
mutex_enter(&config_misc_lock);
deferred = (dev->dv_pending != 0);
mutex_exit(&config_misc_lock);
if (!deferred && !device_pmf_is_registered(dev))
aprint_debug_dev(dev,
"WARNING: power management not supported\n");
config_process_deferred(&deferred_config_queue, dev);
device_register_post_config(dev, aux);
rnd_add_uint32(&rnd_autoconf_source, 0);
return dev;
}
device_t
config_attach_acquire(device_t parent, cfdata_t cf, void *aux, cfprint_t print,
const struct cfargs *cfargs)
{
struct cfargs_internal store;
device_t dev;
KERNEL_LOCK(1, NULL);
dev = config_attach_internal(parent, cf, aux, print,
cfargs_canonicalize(cfargs, &store));
KERNEL_UNLOCK_ONE(NULL);
return dev;
}
/*
* config_attach(parent, cf, aux, print, cfargs)
*
* Legacy entry point for callers whose use of the returned
* device_t is not delimited by device_release.
*
* The caller is required to hold the kernel lock as a fragile
* defence against races.
*
* Callers should ignore the return value or be converted to
* config_attach_acquire with a matching device_release once they
* have finished with the returned device_t.
*/
device_t
config_attach(device_t parent, cfdata_t cf, void *aux, cfprint_t print,
const struct cfargs *cfargs)
{
device_t dev;
KASSERT(KERNEL_LOCKED_P());
dev = config_attach_acquire(parent, cf, aux, print, cfargs);
if (dev == NULL)
return NULL;
device_release(dev);
return dev;
}
/*
* As above, but for pseudo-devices. Pseudo-devices attached in this
* way are silently inserted into the device tree, and their children
* attached.
*
* Note that because pseudo-devices are attached silently, any information
* the attach routine wishes to print should be prefixed with the device
* name by the attach routine.
*/
device_t
config_attach_pseudo_acquire(cfdata_t cf, void *aux)
{
device_t dev;
KERNEL_LOCK(1, NULL);
struct cfargs_internal args = { };
dev = config_devalloc(ROOT, cf, &args);
if (!dev)
goto out;
/* XXX mark busy in cfdata */
if (cf->cf_fstate != FSTATE_STAR) {
KASSERT(cf->cf_fstate == FSTATE_NOTFOUND);
cf->cf_fstate = FSTATE_FOUND;
}
config_devlink(dev);
#if 0 /* XXXJRT not yet */
device_register(dev, NULL); /* like a root node */
#endif
/* Let userland know */
devmon_report_device(dev, true);
/*
* Prevent detach until the driver's attach function, and all
* deferred actions, have finished.
*/
config_pending_incr(dev);
/*
* Prevent concurrent detach from destroying the device_t until
* the caller has released the device.
*/
device_acquire(dev);
/* Call the driver's attach function. */
(*dev->dv_cfattach->ca_attach)(ROOT, dev, aux);
/*
* Allow other threads to acquire references to the device now
* that the driver's attach function is done.
*/
mutex_enter(&config_misc_lock);
KASSERT(dev->dv_attaching == curlwp);
dev->dv_attaching = NULL;
cv_broadcast(&config_misc_cv);
mutex_exit(&config_misc_lock);
/*
* Synchronous parts of attach are done. Allow detach, unless
* the driver's attach function scheduled deferred actions.
*/
config_pending_decr(dev);
config_process_deferred(&deferred_config_queue, dev);
out: KERNEL_UNLOCK_ONE(NULL);
return dev;
}
/*
* config_attach_pseudo(cf)
*
* Legacy entry point for callers whose use of the returned
* device_t is not delimited by device_release.
*
* The caller is required to hold the kernel lock as a fragile
* defence against races.
*
* Callers should ignore the return value or be converted to
* config_attach_pseudo_acquire with a matching device_release
* once they have finished with the returned device_t. As a
* bonus, config_attach_pseudo_acquire can pass a non-null aux
* argument into the driver's attach routine.
*/
device_t
config_attach_pseudo(cfdata_t cf)
{
device_t dev;
dev = config_attach_pseudo_acquire(cf, NULL);
if (dev == NULL)
return dev;
device_release(dev);
return dev;
}
/*
* Caller must hold alldevs_lock.
*/
static void
config_collect_garbage(struct devicelist *garbage)
{
device_t dv;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
KASSERT(mutex_owned(&alldevs_lock));
while (alldevs_nwrite == 0 && alldevs_nread == 0 && alldevs_garbage) {
TAILQ_FOREACH(dv, &alldevs, dv_list) {
if (dv->dv_del_gen != 0)
break;
}
if (dv == NULL) {
alldevs_garbage = false;
break;
}
config_devunlink(dv, garbage);
}
KASSERT(mutex_owned(&alldevs_lock));
}
static void
config_dump_garbage(struct devicelist *garbage)
{
device_t dv;
while ((dv = TAILQ_FIRST(garbage)) != NULL) {
TAILQ_REMOVE(garbage, dv, dv_list);
config_devdelete(dv);
}
}
static int
config_detach_enter(device_t dev)
{
struct lwp *l __diagused;
int error = 0;
mutex_enter(&config_misc_lock);
/*
* Wait until attach has fully completed, and until any
* concurrent detach (e.g., drvctl racing with USB event
* thread) has completed.
*
* Caller must hold alldevs_nread or alldevs_nwrite (e.g., via
* deviter) to ensure the winner of the race doesn't free the
* device leading the loser of the race into use-after-free.
*
* XXX Not all callers do this!
*/
while (dev->dv_pending || dev->dv_detaching) {
KASSERTMSG(dev->dv_detaching != curlwp,
"recursively detaching %s", device_xname(dev));
error = cv_wait_sig(&config_misc_cv, &config_misc_lock);
if (error)
goto out;
}
/*
* Attach has completed, and no other concurrent detach is
* running. Claim the device for detaching. This will cause
* all new attempts to acquire references to block.
*/
KASSERTMSG((l = dev->dv_attaching) == NULL,
"lwp %ld [%s] @ %p attaching %s",
(long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l,
device_xname(dev));
KASSERTMSG((l = dev->dv_detaching) == NULL,
"lwp %ld [%s] @ %p detaching %s",
(long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l,
device_xname(dev));
dev->dv_detaching = curlwp;
out: mutex_exit(&config_misc_lock);
return error;
}
static void
config_detach_exit(device_t dev)
{
struct lwp *l __diagused;
mutex_enter(&config_misc_lock);
KASSERTMSG(dev->dv_detaching != NULL, "not detaching %s",
device_xname(dev));
KASSERTMSG((l = dev->dv_detaching) == curlwp,
"lwp %ld [%s] @ %p detaching %s",
(long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l,
device_xname(dev));
dev->dv_detaching = NULL;
cv_broadcast(&config_misc_cv);
mutex_exit(&config_misc_lock);
}
/*
* Detach a device. Optionally forced (e.g. because of hardware
* removal) and quiet. Returns zero if successful, non-zero
* (an error code) otherwise.
*
* Note that this code wants to be run from a process context, so
* that the detach can sleep to allow processes which have a device
* open to run and unwind their stacks.
*
* Caller must hold a reference with device_acquire or
* device_lookup_acquire.
*/
int
config_detach_release(device_t dev, int flags)
{
struct alldevs_foray af;
struct cftable *ct;
cfdata_t cf;
const struct cfattach *ca;
struct cfdriver *cd;
device_t d __diagused;
int rv = 0;
KERNEL_LOCK(1, NULL);
cf = dev->dv_cfdata;
KASSERTMSG((cf == NULL || cf->cf_fstate == FSTATE_FOUND ||
cf->cf_fstate == FSTATE_STAR),
"config_detach: %s: bad device fstate: %d",
device_xname(dev), cf ? cf->cf_fstate : -1);
cd = dev->dv_cfdriver;
KASSERT(cd != NULL);
ca = dev->dv_cfattach;
KASSERT(ca != NULL);
/*
* Only one detach at a time, please -- and not until fully
* attached.
*/
rv = config_detach_enter(dev);
device_release(dev);
if (rv) {
KERNEL_UNLOCK_ONE(NULL);
return rv;
}
mutex_enter(&alldevs_lock);
if (dev->dv_del_gen != 0) {
mutex_exit(&alldevs_lock);
#ifdef DIAGNOSTIC
printf("%s: %s is already detached\n", __func__,
device_xname(dev));
#endif /* DIAGNOSTIC */
config_detach_exit(dev);
KERNEL_UNLOCK_ONE(NULL);
return ENOENT;
}
alldevs_nwrite++;
mutex_exit(&alldevs_lock);
/*
* Call the driver's .ca_detach function, unless it has none or
* we are skipping it because it's unforced shutdown time and
* the driver didn't ask to detach on shutdown.
*/
if (!detachall &&
(flags & (DETACH_SHUTDOWN|DETACH_FORCE)) == DETACH_SHUTDOWN &&
(dev->dv_flags & DVF_DETACH_SHUTDOWN) == 0) {
rv = EOPNOTSUPP;
} else if (ca->ca_detach != NULL) {
rv = (*ca->ca_detach)(dev, flags);
} else
rv = EOPNOTSUPP;
KASSERTMSG(!dev->dv_detach_done, "%s detached twice, error=%d",
device_xname(dev), rv);
/*
* If it was not possible to detach the device, then we either
* panic() (for the forced but failed case), or return an error.
*/
if (rv) {
/*
* Detach failed -- likely EOPNOTSUPP or EBUSY. Driver
* must not have called config_detach_commit.
*/
KASSERTMSG(!dev->dv_detach_committed,
"%s committed to detaching and then backed out, error=%d",
device_xname(dev), rv);
if (flags & DETACH_FORCE) {
panic("config_detach: forced detach of %s failed (%d)",
device_xname(dev), rv);
}
goto out;
}
/*
* The device has now been successfully detached.
*/
dev->dv_detach_done = true;
/*
* If .ca_detach didn't commit to detach, then do that for it.
* This wakes any pending device_lookup_acquire calls so they
* will fail.
*/
config_detach_commit(dev);
/*
* If it was possible to detach the device, ensure that the
* device is deactivated.
*/
dev->dv_flags &= ~DVF_ACTIVE; /* XXXSMP */
/*
* Wait for all device_lookup_acquire references -- mostly, for
* all attempts to open the device -- to drain. It is the
* responsibility of .ca_detach to ensure anything with open
* references will be interrupted and release them promptly,
* not block indefinitely. All new attempts to acquire
* references will fail, as config_detach_commit has arranged
* by now.
*/
mutex_enter(&config_misc_lock);
localcount_drain(dev->dv_localcount,
&config_misc_cv, &config_misc_lock);
mutex_exit(&config_misc_lock);
/* Let userland know */
devmon_report_device(dev, false);
#ifdef DIAGNOSTIC
/*
* Sanity: If you're successfully detached, you should have no
* children. (Note that because children must be attached
* after parents, we only need to search the latter part of
* the list.)
*/
mutex_enter(&alldevs_lock);
for (d = TAILQ_NEXT(dev, dv_list); d != NULL;
d = TAILQ_NEXT(d, dv_list)) {
if (d->dv_parent == dev && d->dv_del_gen == 0) {
printf("config_detach: detached device %s"
" has children %s\n", device_xname(dev),
device_xname(d));
panic("config_detach");
}
}
mutex_exit(&alldevs_lock);
#endif
/* notify the parent that the child is gone */
if (dev->dv_parent) {
device_t p = dev->dv_parent;
if (p->dv_cfattach->ca_childdetached)
(*p->dv_cfattach->ca_childdetached)(p, dev);
}
/*
* Mark cfdata to show that the unit can be reused, if possible.
*/
TAILQ_FOREACH(ct, &allcftables, ct_list) {
for (cf = ct->ct_cfdata; cf->cf_name; cf++) {
if (STREQ(cf->cf_name, cd->cd_name)) {
if (cf->cf_fstate == FSTATE_FOUND &&
cf->cf_unit == dev->dv_unit)
cf->cf_fstate = FSTATE_NOTFOUND;
}
}
}
if (dev->dv_cfdata != NULL && (flags & DETACH_QUIET) == 0)
aprint_normal_dev(dev, "detached\n");
out:
config_detach_exit(dev);
config_alldevs_enter(&af);
KASSERT(alldevs_nwrite != 0);
--alldevs_nwrite;
if (rv == 0 && dev->dv_del_gen == 0) {
if (alldevs_nwrite == 0 && alldevs_nread == 0)
config_devunlink(dev, &af.af_garbage);
else {
dev->dv_del_gen = alldevs_gen;
alldevs_garbage = true;
}
}
config_alldevs_exit(&af);
KERNEL_UNLOCK_ONE(NULL);
return rv;
}
/*
* config_detach(dev, flags)
*
* Legacy entry point for callers that have not acquired a
* reference to dev.
*
* The caller is required to hold the kernel lock as a fragile
* defence against races.
*
* Callers should be converted to use device_acquire under a lock
* taken also by .ca_childdetached to synchronize access to the
* device_t, and then config_detach_release outside the lock.
* Alternatively, most drivers detach children only in their own
* detach routines, which can be done with config_detach_children
* instead.
*/
int
config_detach(device_t dev, int flags)
{
device_acquire(dev);
return config_detach_release(dev, flags);
}
/*
* config_detach_commit(dev)
*
* Issued by a driver's .ca_detach routine to notify anyone
* waiting in device_lookup_acquire that the driver is committed
* to detaching the device, which allows device_lookup_acquire to
* wake up and fail immediately.
*
* Safe to call multiple times -- idempotent. Must be called
* during config_detach_enter/exit. Safe to use with
* device_lookup because the device is not actually removed from
* the table until after config_detach_exit.
*/
void
config_detach_commit(device_t dev)
{
struct lwp *l __diagused;
mutex_enter(&config_misc_lock);
KASSERTMSG(dev->dv_detaching != NULL, "not detaching %s",
device_xname(dev));
KASSERTMSG((l = dev->dv_detaching) == curlwp,
"lwp %ld [%s] @ %p detaching %s",
(long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l,
device_xname(dev));
dev->dv_detach_committed = true;
cv_broadcast(&config_misc_cv);
mutex_exit(&config_misc_lock);
}
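/*
 * Illustrative sketch of the intended call site: a driver's .ca_detach
 * routine calls this once it has passed the point of no return, before
 * revoking open instances, so that concurrent device_lookup_acquire
 * callers fail instead of waiting for the detach to finish.  The function
 * name below is hypothetical:
 *
 *	static int
 *	mydrv_detach(device_t self, int flags)
 *	{
 *		...decide that the detach will proceed...
 *		config_detach_commit(self);
 *		...revoke device nodes, e.g. with vdevgone(9)...
 *		return 0;
 *	}
 */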
int
config_detach_children(device_t parent, int flags)
{
device_t dv;
deviter_t di;
int error = 0;
KASSERT(KERNEL_LOCKED_P());
for (dv = deviter_first(&di, DEVITER_F_RW); dv != NULL;
dv = deviter_next(&di)) {
if (device_parent(dv) != parent)
continue;
if ((error = config_detach(dv, flags)) != 0)
break;
}
deviter_release(&di);
return error;
}
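/*
 * Illustrative sketch: most bus drivers detach their children from their
 * own detach routine before releasing bus resources.  The "mybus" name is
 * hypothetical:
 *
 *	static int
 *	mybus_detach(device_t self, int flags)
 *	{
 *		int error;
 *
 *		if ((error = config_detach_children(self, flags)) != 0)
 *			return error;
 *		...unmap registers, disestablish interrupts...
 *		return 0;
 *	}
 */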
device_t
shutdown_first(struct shutdown_state *s)
{
if (!s->initialized) {
deviter_init(&s->di, DEVITER_F_SHUTDOWN|DEVITER_F_LEAVES_FIRST);
s->initialized = true;
}
return shutdown_next(s);
}
device_t
shutdown_next(struct shutdown_state *s)
{
device_t dv;
while ((dv = deviter_next(&s->di)) != NULL && !device_is_active(dv))
;
if (dv == NULL)
s->initialized = false;
return dv;
}
bool
config_detach_all(int how)
{
static struct shutdown_state s;
device_t curdev;
bool progress = false;
int flags;
KERNEL_LOCK(1, NULL);
if ((how & (RB_NOSYNC|RB_DUMP)) != 0)
goto out;
if ((how & RB_POWERDOWN) == RB_POWERDOWN)
flags = DETACH_SHUTDOWN | DETACH_POWEROFF;
else
flags = DETACH_SHUTDOWN;
for (curdev = shutdown_first(&s); curdev != NULL;
curdev = shutdown_next(&s)) {
aprint_debug(" detaching %s, ", device_xname(curdev));
if (config_detach(curdev, flags) == 0) {
progress = true;
aprint_debug("success.");
} else
aprint_debug("failed.");
}
out: KERNEL_UNLOCK_ONE(NULL);
return progress;
}
static bool
device_is_ancestor_of(device_t ancestor, device_t descendant)
{
device_t dv;
for (dv = descendant; dv != NULL; dv = device_parent(dv)) {
if (device_parent(dv) == ancestor)
return true;
}
return false;
}
int
config_deactivate(device_t dev)
{
deviter_t di;
const struct cfattach *ca;
device_t descendant;
int s, rv = 0, oflags;
for (descendant = deviter_first(&di, DEVITER_F_ROOT_FIRST);
descendant != NULL;
descendant = deviter_next(&di)) {
if (dev != descendant &&
!device_is_ancestor_of(dev, descendant))
continue;
if ((descendant->dv_flags & DVF_ACTIVE) == 0)
continue;
ca = descendant->dv_cfattach;
oflags = descendant->dv_flags;
descendant->dv_flags &= ~DVF_ACTIVE;
if (ca->ca_activate == NULL)
continue;
s = splhigh();
rv = (*ca->ca_activate)(descendant, DVACT_DEACTIVATE);
splx(s);
if (rv != 0)
descendant->dv_flags = oflags;
}
deviter_release(&di);
return rv;
}
/*
* Defer the configuration of the specified device until all
* of its parent's devices have been attached.
*/
void
config_defer(device_t dev, void (*func)(device_t))
{
struct deferred_config *dc;
if (dev->dv_parent == NULL)
panic("config_defer: can't defer config of a root device");
dc = kmem_alloc(sizeof(*dc), KM_SLEEP);
config_pending_incr(dev);
mutex_enter(&config_misc_lock);
#ifdef DIAGNOSTIC
struct deferred_config *odc;
TAILQ_FOREACH(odc, &deferred_config_queue, dc_queue) {
if (odc->dc_dev == dev)
panic("config_defer: deferred twice");
}
#endif
dc->dc_dev = dev;
dc->dc_func = func;
TAILQ_INSERT_TAIL(&deferred_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
}
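/*
 * Illustrative sketch: a child whose configuration depends on siblings its
 * parent has not attached yet defers the rest of its attach.  The "mydrv"
 * names are hypothetical:
 *
 *	void
 *	mydrv_attach(device_t parent, device_t self, void *aux)
 *	{
 *		...early, order-independent setup...
 *		config_defer(self, mydrv_attach_deferred);
 *	}
 */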
/*
* Defer some autoconfiguration for a device until after interrupts
* are enabled.
*/
void
config_interrupts(device_t dev, void (*func)(device_t))
{
struct deferred_config *dc;
/*
* If interrupts are enabled, callback now.
*/
if (cold == 0) {
(*func)(dev);
return;
}
dc = kmem_alloc(sizeof(*dc), KM_SLEEP);
config_pending_incr(dev);
mutex_enter(&config_misc_lock);
#ifdef DIAGNOSTIC
struct deferred_config *odc;
TAILQ_FOREACH(odc, &interrupt_config_queue, dc_queue) {
if (odc->dc_dev == dev)
panic("config_interrupts: deferred twice");
}
#endif
dc->dc_dev = dev;
dc->dc_func = func;
TAILQ_INSERT_TAIL(&interrupt_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
}
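/*
 * Illustrative sketch: work that needs working interrupts (e.g. talking to
 * firmware) is pushed past cold boot.  The "mydrv" names are hypothetical:
 *
 *	void
 *	mydrv_attach(device_t parent, device_t self, void *aux)
 *	{
 *		...register access, softc setup...
 *		config_interrupts(self, mydrv_attach_intr);
 *	}
 */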
/*
* Defer some autoconfiguration for a device until after root file system
* is mounted (to load firmware etc).
*/
void
config_mountroot(device_t dev, void (*func)(device_t))
{
struct deferred_config *dc;
/*
* If root file system is mounted, callback now.
*/
if (root_is_mounted) {
(*func)(dev);
return;
}
dc = kmem_alloc(sizeof(*dc), KM_SLEEP);
mutex_enter(&config_misc_lock);
#ifdef DIAGNOSTIC
struct deferred_config *odc;
TAILQ_FOREACH(odc, &mountroot_config_queue, dc_queue) {
if (odc->dc_dev == dev)
panic("%s: deferred twice", __func__);
}
#endif
dc->dc_dev = dev;
dc->dc_func = func;
TAILQ_INSERT_TAIL(&mountroot_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
}
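/*
 * Illustrative sketch: drivers that must load firmware from the root file
 * system defer that step here.  The "mydrv" names are hypothetical:
 *
 *	void
 *	mydrv_attach(device_t parent, device_t self, void *aux)
 *	{
 *		...hardware-independent setup...
 *		config_mountroot(self, mydrv_load_firmware);
 *	}
 */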
/*
* Process a deferred configuration queue.
*/
static void
config_process_deferred(struct deferred_config_head *queue, device_t parent)
{
struct deferred_config *dc;
KASSERT(KERNEL_LOCKED_P());
mutex_enter(&config_misc_lock);
dc = TAILQ_FIRST(queue);
while (dc) {
if (parent == NULL || dc->dc_dev->dv_parent == parent) {
TAILQ_REMOVE(queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
(*dc->dc_func)(dc->dc_dev);
config_pending_decr(dc->dc_dev);
kmem_free(dc, sizeof(*dc));
mutex_enter(&config_misc_lock);
/* Restart, queue might have changed */
dc = TAILQ_FIRST(queue);
} else {
dc = TAILQ_NEXT(dc, dc_queue);
}
}
mutex_exit(&config_misc_lock);
}
/*
* Manipulate the config_pending semaphore.
*/
void
config_pending_incr(device_t dev)
{
mutex_enter(&config_misc_lock);
KASSERTMSG(dev->dv_pending < INT_MAX,
"%s: excess config_pending_incr", device_xname(dev));
if (dev->dv_pending++ == 0)
TAILQ_INSERT_TAIL(&config_pending, dev, dv_pending_list);
#ifdef DEBUG_AUTOCONF
printf("%s: %s %d\n", __func__, device_xname(dev), dev->dv_pending);
#endif
mutex_exit(&config_misc_lock);
}
void
config_pending_decr(device_t dev)
{
mutex_enter(&config_misc_lock);
KASSERTMSG(dev->dv_pending > 0,
"%s: excess config_pending_decr", device_xname(dev));
if (--dev->dv_pending == 0) {
TAILQ_REMOVE(&config_pending, dev, dv_pending_list);
cv_broadcast(&config_misc_cv);
}
#ifdef DEBUG_AUTOCONF
printf("%s: %s %d\n", __func__, device_xname(dev), dev->dv_pending);
#endif
mutex_exit(&config_misc_lock);
}
/*
* Register a "finalization" routine. Finalization routines are
* called iteratively once all real devices have been found during
* autoconfiguration, for as long as any one finalizer has done
* any work.
*/
int
config_finalize_register(device_t dev, int (*fn)(device_t))
{
struct finalize_hook *f;
int error = 0;
KERNEL_LOCK(1, NULL);
/*
* If finalization has already been done, invoke the
* callback function now.
*/
if (config_finalize_done) {
while ((*fn)(dev) != 0)
/* loop */ ;
goto out;
}
/* Ensure this isn't already on the list. */
TAILQ_FOREACH(f, &config_finalize_list, f_list) {
if (f->f_func == fn && f->f_dev == dev) {
error = EEXIST;
goto out;
}
}
f = kmem_alloc(sizeof(*f), KM_SLEEP);
f->f_func = fn;
f->f_dev = dev;
TAILQ_INSERT_TAIL(&config_finalize_list, f, f_list);
/* Success! */
error = 0;
out: KERNEL_UNLOCK_ONE(NULL);
return error;
}
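/*
 * Illustrative sketch of a finalizer: the callback returns nonzero for as
 * long as it still does work, and zero once it has nothing left to do.
 * The "mydrv" names are hypothetical:
 *
 *	static int
 *	mydrv_finalize(device_t self)
 *	{
 *		if (...nothing left to resolve...)
 *			return 0;
 *		...late binding work...
 *		return 1;
 *	}
 *
 *	config_finalize_register(self, mydrv_finalize);
 */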
void
config_finalize(void)
{
struct finalize_hook *f;
struct pdevinit *pdev;
extern struct pdevinit pdevinit[];
unsigned t0 = getticks();
int errcnt, rv;
/*
* Now that device driver threads have been created, wait for
* them to finish any deferred autoconfiguration.
*/
mutex_enter(&config_misc_lock);
while (!TAILQ_EMPTY(&config_pending)) {
const unsigned t1 = getticks();
if (t1 - t0 >= hz) {
void (*pr)(const char *, ...) __printflike(1,2);
device_t dev;
if (t1 - t0 >= 60*hz) {
pr = aprint_normal;
t0 = t1;
} else {
pr = aprint_debug;
}
(*pr)("waiting for devices:");
TAILQ_FOREACH(dev, &config_pending, dv_pending_list)
(*pr)(" %s", device_xname(dev));
(*pr)("\n");
}
(void)cv_timedwait(&config_misc_cv, &config_misc_lock,
mstohz(1000));
}
mutex_exit(&config_misc_lock);
KERNEL_LOCK(1, NULL);
/* Attach pseudo-devices. */
for (pdev = pdevinit; pdev->pdev_attach != NULL; pdev++)
(*pdev->pdev_attach)(pdev->pdev_count);
/* Run the hooks until none of them does any work. */
do {
rv = 0;
TAILQ_FOREACH(f, &config_finalize_list, f_list)
rv |= (*f->f_func)(f->f_dev);
} while (rv != 0);
config_finalize_done = 1;
/* Now free all the hooks. */
while ((f = TAILQ_FIRST(&config_finalize_list)) != NULL) {
TAILQ_REMOVE(&config_finalize_list, f, f_list);
kmem_free(f, sizeof(*f));
}
KERNEL_UNLOCK_ONE(NULL);
errcnt = aprint_get_error_count();
if ((boothowto & (AB_QUIET|AB_SILENT)) != 0 &&
(boothowto & AB_VERBOSE) == 0) {
mutex_enter(&config_misc_lock);
if (config_do_twiddle) {
config_do_twiddle = 0;
printf_nolog(" done.\n");
}
mutex_exit(&config_misc_lock);
}
if (errcnt != 0) {
printf("WARNING: %d error%s while detecting hardware; "
"check system log.\n", errcnt,
errcnt == 1 ? "" : "s");
}
}
void
config_twiddle_init(void)
{
if ((boothowto & (AB_SILENT|AB_VERBOSE)) == AB_SILENT) {
config_do_twiddle = 1;
}
callout_setfunc(&config_twiddle_ch, config_twiddle_fn, NULL);
}
void
config_twiddle_fn(void *cookie)
{
mutex_enter(&config_misc_lock);
if (config_do_twiddle) {
twiddle();
callout_schedule(&config_twiddle_ch, mstohz(100));
}
mutex_exit(&config_misc_lock);
}
static void
config_alldevs_enter(struct alldevs_foray *af)
{
TAILQ_INIT(&af->af_garbage);
mutex_enter(&alldevs_lock);
config_collect_garbage(&af->af_garbage);
}
static void
config_alldevs_exit(struct alldevs_foray *af)
{
mutex_exit(&alldevs_lock);
config_dump_garbage(&af->af_garbage);
}
/*
* device_lookup:
*
* Look up a device instance for a given driver.
*
* Caller is responsible for ensuring the device's state is
* stable, either by holding a reference already obtained with
* device_lookup_acquire or by otherwise ensuring the device is
* attached and can't be detached (e.g., holding an open device
* node and ensuring *_detach calls vdevgone).
*
* XXX Find a way to assert this.
*
* Safe for use up to and including interrupt context at IPL_VM.
* Never sleeps.
*/
device_t
device_lookup(cfdriver_t cd, int unit)
{
device_t dv;
mutex_enter(&alldevs_lock);
if (unit < 0 || unit >= cd->cd_ndevs)
dv = NULL;
else if ((dv = cd->cd_devs[unit]) != NULL && dv->dv_del_gen != 0)
dv = NULL;
mutex_exit(&alldevs_lock);
return dv;
}
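/*
 * Illustrative sketch: character-device entry points commonly map a minor
 * number to a device instance this way.  The "mydrv_cd" cfdriver name is
 * hypothetical (config(1) generates such externs as <driver>_cd):
 *
 *	extern struct cfdriver mydrv_cd;
 *
 *	device_t dv = device_lookup(&mydrv_cd, minor(devno));
 *	if (dv == NULL)
 *		return ENXIO;
 */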
/*
* device_lookup_private:
*
* Look up a softc instance for a given driver.
*/
void *
device_lookup_private(cfdriver_t cd, int unit)
{
return device_private(device_lookup(cd, unit));
}
/*
* device_lookup_acquire:
*
* Look up a device instance for a given driver, and return a
* reference to it that must be released by device_release.
*
* => If the device is still attaching, blocks until *_attach has
* returned.
*
* => If the device is detaching, blocks until *_detach has
* returned. May succeed or fail in that case, depending on
* whether *_detach has backed out (EBUSY) or committed to
* detaching.
*
* May sleep.
*/
device_t
device_lookup_acquire(cfdriver_t cd, int unit)
{
device_t dv;
ASSERT_SLEEPABLE();
/* XXX This should have a pserialized fast path -- TBD. */
mutex_enter(&config_misc_lock);
mutex_enter(&alldevs_lock);
retry: if (unit < 0 || unit >= cd->cd_ndevs ||
(dv = cd->cd_devs[unit]) == NULL ||
dv->dv_del_gen != 0 ||
dv->dv_detach_committed) {
dv = NULL;
} else {
/*
* Wait for the device to stabilize, if attaching or
* detaching. Either way we must wait for *_attach or
* *_detach to complete, and either way we must retry:
* even if detaching, *_detach might fail (EBUSY) so
* the device may still be there.
*/
if ((dv->dv_attaching != NULL && dv->dv_attaching != curlwp) ||
dv->dv_detaching != NULL) {
mutex_exit(&alldevs_lock);
cv_wait(&config_misc_cv, &config_misc_lock);
mutex_enter(&alldevs_lock);
goto retry;
}
device_acquire(dv);
}
mutex_exit(&alldevs_lock);
mutex_exit(&config_misc_lock);
return dv;
}
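/*
 * Illustrative sketch: an open routine that needs the device_t to stay
 * valid across a sleep takes a reference here and drops it with
 * device_release() when done.  The "mydrv_cd" name is hypothetical:
 *
 *	device_t dv = device_lookup_acquire(&mydrv_cd, minor(devno));
 *	if (dv == NULL)
 *		return ENXIO;
 *	...use the device, possibly sleeping...
 *	device_release(dv);
 */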
/*
* device_acquire:
*
* Acquire a reference to a device. It is the caller's
* responsibility to ensure that the device's .ca_detach routine
* cannot return before calling this. Caller must release the
* reference with device_release or config_detach_release.
*/
void
device_acquire(device_t dv)
{
/*
* No lock because the caller has promised that this can't
* change concurrently with device_acquire.
*/
KASSERTMSG(!dv->dv_detach_done, "%s",
dv == NULL ? "(null)" : device_xname(dv));
localcount_acquire(dv->dv_localcount);
}
/*
* device_release:
*
* Release a reference to a device acquired with device_acquire or
* device_lookup_acquire.
*/
void
device_release(device_t dv)
{
localcount_release(dv->dv_localcount,
&config_misc_cv, &config_misc_lock);
}
/*
* device_find_by_xname:
*
* Returns the device of the given name or NULL if it doesn't exist.
*/
device_t
device_find_by_xname(const char *name)
{
device_t dv;
deviter_t di;
for (dv = deviter_first(&di, 0); dv != NULL; dv = deviter_next(&di)) {
if (strcmp(device_xname(dv), name) == 0)
break;
}
deviter_release(&di);
return dv;
}
/*
* device_find_by_driver_unit:
*
* Returns the device of the given driver name and unit or
* NULL if it doesn't exist.
*/
device_t
device_find_by_driver_unit(const char *name, int unit)
{
struct cfdriver *cd;
if ((cd = config_cfdriver_lookup(name)) == NULL)
return NULL;
return device_lookup(cd, unit);
}
static bool
match_strcmp(const char * const s1, const char * const s2)
{
return strcmp(s1, s2) == 0;
}
static bool
match_pmatch(const char * const s1, const char * const s2)
{
return pmatch(s1, s2, NULL) == 2;
}
static bool
strarray_match_internal(const char ** const strings,
unsigned int const nstrings, const char * const str,
unsigned int * const indexp,
bool (*match_fn)(const char *, const char *))
{
unsigned int i;
if (strings == NULL || nstrings == 0) {
return false;
}
for (i = 0; i < nstrings; i++) {
if ((*match_fn)(strings[i], str)) {
*indexp = i;
return true;
}
}
return false;
}
static int
strarray_match(const char ** const strings, unsigned int const nstrings,
const char * const str)
{
unsigned int idx;
if (strarray_match_internal(strings, nstrings, str, &idx,
match_strcmp)) {
return (int)(nstrings - idx);
}
return 0;
}
static int
strarray_pmatch(const char ** const strings, unsigned int const nstrings,
const char * const pattern)
{
unsigned int idx;
if (strarray_match_internal(strings, nstrings, pattern, &idx,
match_pmatch)) {
return (int)(nstrings - idx);
}
return 0;
}
static int
device_compatible_match_strarray_internal(
const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats,
const struct device_compatible_entry **matching_entryp,
int (*match_fn)(const char **, unsigned int, const char *))
{
const struct device_compatible_entry *dce = NULL;
int rv;
if (ndevice_compats == 0 || device_compats == NULL ||
driver_compats == NULL)
return 0;
for (dce = driver_compats; dce->compat != NULL; dce++) {
rv = (*match_fn)(device_compats, ndevice_compats, dce->compat);
if (rv != 0) {
if (matching_entryp != NULL) {
*matching_entryp = dce;
}
return rv;
}
}
return 0;
}
/*
* device_compatible_match:
*
* Match a driver's "compatible" data against a device's
* "compatible" strings. Returns resulted weighted by
* which device "compatible" string was matched.
*/
int
device_compatible_match(const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_strarray_internal(device_compats,
ndevice_compats, driver_compats, NULL, strarray_match);
}
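/*
 * Illustrative sketch: a driver keeps a table of the "compatible" strings
 * it services and scores a candidate device against it in its match
 * routine.  The strings below and the assumption that DEVICE_COMPAT_EOL
 * from <sys/device.h> terminates the table are illustrative:
 *
 *	static const struct device_compatible_entry compat_data[] = {
 *		{ .compat = "acme,widget-v2" },
 *		{ .compat = "acme,widget" },
 *		DEVICE_COMPAT_EOL
 *	};
 *
 *	return device_compatible_match(device_compats, ndevice_compats,
 *	    compat_data);
 */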
/*
* device_compatible_pmatch:
*
* Like device_compatible_match(), but uses pmatch(9) to compare
* the device "compatible" strings against patterns in the
* driver's "compatible" data.
*/
int
device_compatible_pmatch(const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_strarray_internal(device_compats,
ndevice_compats, driver_compats, NULL, strarray_pmatch);
}
static int
device_compatible_match_strlist_internal(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats,
const struct device_compatible_entry **matching_entryp,
int (*match_fn)(const char *, size_t, const char *))
{
const struct device_compatible_entry *dce = NULL;
int rv;
if (device_compats == NULL || device_compatsize == 0 ||
driver_compats == NULL)
return 0;
for (dce = driver_compats; dce->compat != NULL; dce++) {
rv = (*match_fn)(device_compats, device_compatsize,
dce->compat);
if (rv != 0) {
if (matching_entryp != NULL) {
*matching_entryp = dce;
}
return rv;
}
}
return 0;
}
/*
* device_compatible_match_strlist:
*
* Like device_compatible_match(), but take the device
* "compatible" strings as an OpenFirmware-style string
* list.
*/
int
device_compatible_match_strlist(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_strlist_internal(device_compats,
device_compatsize, driver_compats, NULL, strlist_match);
}
/*
* device_compatible_pmatch_strlist:
*
* Like device_compatible_pmatch(), but take the device
* "compatible" strings as an OpenFirmware-style string
* list.
*/
int
device_compatible_pmatch_strlist(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_strlist_internal(device_compats,
device_compatsize, driver_compats, NULL, strlist_pmatch);
}
static int
device_compatible_match_id_internal(
uintptr_t const id, uintptr_t const mask, uintptr_t const sentinel_id,
const struct device_compatible_entry *driver_compats,
const struct device_compatible_entry **matching_entryp)
{
const struct device_compatible_entry *dce = NULL;
if (mask == 0)
return 0;
for (dce = driver_compats; dce->id != sentinel_id; dce++) {
if ((id & mask) == dce->id) {
if (matching_entryp != NULL) {
*matching_entryp = dce;
}
return 1;
}
}
return 0;
}
/*
* device_compatible_match_id:
*
* Like device_compatible_match(), but takes a single
* unsigned integer device ID.
*/
int
device_compatible_match_id(
uintptr_t const id, uintptr_t const sentinel_id,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_id_internal(id, (uintptr_t)-1,
sentinel_id, driver_compats, NULL);
}
/*
* device_compatible_lookup:
*
* Look up and return the device_compatible_entry, using the
* same matching criteria used by device_compatible_match().
*/
const struct device_compatible_entry *
device_compatible_lookup(const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_strarray_internal(device_compats,
ndevice_compats, driver_compats, &dce, strarray_match)) {
return dce;
}
return NULL;
}
/*
* device_compatible_plookup:
*
* Look up and return the device_compatible_entry, using the
* same matching criteria used by device_compatible_pmatch().
*/
const struct device_compatible_entry *
device_compatible_plookup(const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_strarray_internal(device_compats,
ndevice_compats, driver_compats, &dce, strarray_pmatch)) {
return dce;
}
return NULL;
}
/*
* device_compatible_lookup_strlist:
*
* Like device_compatible_lookup(), but takes the device
* "compatible" strings as an OpenFirmware-style string
* list.
*/
const struct device_compatible_entry *
device_compatible_lookup_strlist(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_strlist_internal(device_compats,
device_compatsize, driver_compats, &dce, strlist_match)) {
return dce;
}
return NULL;
}
/*
* device_compatible_plookup_strlist:
*
* Like device_compatible_plookup(), but takes the device
* "compatible" strings as an OpenFirmware-style string
* list.
*/
const struct device_compatible_entry *
device_compatible_plookup_strlist(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_strlist_internal(device_compats,
device_compatsize, driver_compats, &dce, strlist_pmatch)) {
return dce;
}
return NULL;
}
/*
* device_compatible_lookup_id:
*
* Like device_compatible_lookup(), but takes a single
* unsigned integer device ID.
*/
const struct device_compatible_entry *
device_compatible_lookup_id(
uintptr_t const id, uintptr_t const sentinel_id,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_id_internal(id, (uintptr_t)-1,
sentinel_id, driver_compats, &dce)) {
return dce;
}
return NULL;
}
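/*
 * Example (illustrative sketch only): using device_compatible_lookup()
 * to fetch per-entry driver data once a match is known.  The "acme"
 * strings, the flag value and the mydrv_* names are hypothetical.
 */
#if 0
static const struct device_compatible_entry mydrv_lookup_data[] = {
	{ .compat = "acme,widget",	.value = 0 },
	{ .compat = "acme,widget-v2",	.value = 1 },	/* e.g. "has new FIFO" */
	DEVICE_COMPAT_EOL
};

static uintptr_t
mydrv_flags_example(const char **compats, int ncompats)
{
	const struct device_compatible_entry *dce;

	dce = device_compatible_lookup(compats, ncompats, mydrv_lookup_data);
	return dce != NULL ? dce->value : 0;
}
#endif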
/*
* Power management related functions.
*/
bool
device_pmf_is_registered(device_t dev)
{
return (dev->dv_flags & DVF_POWER_HANDLERS) != 0;
}
bool
device_pmf_driver_suspend(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_DRIVER_SUSPENDED) != 0)
return true;
if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0)
return false;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_DRIVER &&
dev->dv_driver_suspend != NULL &&
!(*dev->dv_driver_suspend)(dev, qual))
return false;
dev->dv_flags |= DVF_DRIVER_SUSPENDED;
return true;
}
bool
device_pmf_driver_resume(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_DRIVER_SUSPENDED) == 0)
return true;
if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0)
return false;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_DRIVER &&
dev->dv_driver_resume != NULL &&
!(*dev->dv_driver_resume)(dev, qual))
return false;
dev->dv_flags &= ~DVF_DRIVER_SUSPENDED;
return true;
}
bool
device_pmf_driver_shutdown(device_t dev, int how)
{
if (*dev->dv_driver_shutdown != NULL &&
!(*dev->dv_driver_shutdown)(dev, how))
return false;
return true;
}
void
device_pmf_driver_register(device_t dev,
bool (*suspend)(device_t, const pmf_qual_t *),
bool (*resume)(device_t, const pmf_qual_t *),
bool (*shutdown)(device_t, int))
{
dev->dv_driver_suspend = suspend;
dev->dv_driver_resume = resume;
dev->dv_driver_shutdown = shutdown;
dev->dv_flags |= DVF_POWER_HANDLERS;
}
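/*
 * Example (illustrative sketch only): drivers normally reach this
 * helper indirectly through pmf_device_register(9) from their attach
 * routine.  The mydrv_* functions below are hypothetical.
 */
#if 0
static bool
mydrv_suspend(device_t self, const pmf_qual_t *qual)
{
	/* Quiesce the hardware here; return false to veto the suspend. */
	return true;
}

static bool
mydrv_resume(device_t self, const pmf_qual_t *qual)
{
	/* Re-initialize the hardware here. */
	return true;
}

static void
mydrv_attach_pmf_example(device_t self)
{
	if (!pmf_device_register(self, mydrv_suspend, mydrv_resume))
		aprint_error_dev(self, "couldn't establish power handler\n");
}
#endif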
void
device_pmf_driver_deregister(device_t dev)
{
device_lock_t dvl = device_getlock(dev);
dev->dv_driver_suspend = NULL;
dev->dv_driver_resume = NULL;
mutex_enter(&dvl->dvl_mtx);
dev->dv_flags &= ~DVF_POWER_HANDLERS;
while (dvl->dvl_nlock > 0 || dvl->dvl_nwait > 0) {
/* Wake a thread that waits for the lock. That
* thread will fail to acquire the lock, and then
* it will wake the next thread that waits for the
* lock, or else it will wake us.
*/
cv_signal(&dvl->dvl_cv);
pmflock_debug(dev, __func__, __LINE__);
cv_wait(&dvl->dvl_cv, &dvl->dvl_mtx);
pmflock_debug(dev, __func__, __LINE__);
}
mutex_exit(&dvl->dvl_mtx);
}
void
device_pmf_driver_child_register(device_t dev)
{
device_t parent = device_parent(dev);
if (parent == NULL || parent->dv_driver_child_register == NULL)
return;
(*parent->dv_driver_child_register)(dev);
}
void
device_pmf_driver_set_child_register(device_t dev,
void (*child_register)(device_t))
{
dev->dv_driver_child_register = child_register;
}
static void
pmflock_debug(device_t dev, const char *func, int line)
{
#ifdef PMFLOCK_DEBUG
device_lock_t dvl = device_getlock(dev);
const char *curlwp_name;
if (curlwp->l_name != NULL)
curlwp_name = curlwp->l_name;
else
curlwp_name = curlwp->l_proc->p_comm;
aprint_debug_dev(dev,
"%s.%d, %s dvl_nlock %d dvl_nwait %d dv_flags %x\n", func, line,
curlwp_name, dvl->dvl_nlock, dvl->dvl_nwait, dev->dv_flags);
#endif /* PMFLOCK_DEBUG */
}
static bool
device_pmf_lock1(device_t dev)
{
device_lock_t dvl = device_getlock(dev);
while (device_pmf_is_registered(dev) &&
dvl->dvl_nlock > 0 && dvl->dvl_holder != curlwp) {
dvl->dvl_nwait++;
pmflock_debug(dev, __func__, __LINE__);
cv_wait(&dvl->dvl_cv, &dvl->dvl_mtx);
pmflock_debug(dev, __func__, __LINE__);
dvl->dvl_nwait--;
}
if (!device_pmf_is_registered(dev)) {
pmflock_debug(dev, __func__, __LINE__);
/* We could not acquire the lock, but some other thread may
* wait for it, also. Wake that thread.
*/
cv_signal(&dvl->dvl_cv);
return false;
}
dvl->dvl_nlock++;
dvl->dvl_holder = curlwp;
pmflock_debug(dev, __func__, __LINE__);
return true;
}
bool
device_pmf_lock(device_t dev)
{
bool rc;
device_lock_t dvl = device_getlock(dev);
mutex_enter(&dvl->dvl_mtx);
rc = device_pmf_lock1(dev);
mutex_exit(&dvl->dvl_mtx);
return rc;
}
void
device_pmf_unlock(device_t dev)
{
device_lock_t dvl = device_getlock(dev);
KASSERT(dvl->dvl_nlock > 0);
mutex_enter(&dvl->dvl_mtx);
if (--dvl->dvl_nlock == 0)
dvl->dvl_holder = NULL;
cv_signal(&dvl->dvl_cv);
pmflock_debug(dev, __func__, __LINE__);
mutex_exit(&dvl->dvl_mtx);
}
device_lock_t
device_getlock(device_t dev)
{
return &dev->dv_lock;
}
void *
device_pmf_bus_private(device_t dev)
{
return dev->dv_bus_private;
}
bool
device_pmf_bus_suspend(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0)
return true;
if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0 ||
(dev->dv_flags & DVF_DRIVER_SUSPENDED) == 0)
return false;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_BUS &&
dev->dv_bus_suspend != NULL &&
!(*dev->dv_bus_suspend)(dev, qual))
return false;
dev->dv_flags |= DVF_BUS_SUSPENDED;
return true;
}
bool
device_pmf_bus_resume(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_BUS_SUSPENDED) == 0)
return true;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_BUS &&
dev->dv_bus_resume != NULL &&
!(*dev->dv_bus_resume)(dev, qual))
return false;
dev->dv_flags &= ~DVF_BUS_SUSPENDED;
return true;
}
bool
device_pmf_bus_shutdown(device_t dev, int how)
{
if (*dev->dv_bus_shutdown != NULL &&
!(*dev->dv_bus_shutdown)(dev, how))
return false;
return true;
}
void
device_pmf_bus_register(device_t dev, void *priv,
bool (*suspend)(device_t, const pmf_qual_t *),
bool (*resume)(device_t, const pmf_qual_t *),
bool (*shutdown)(device_t, int), void (*deregister)(device_t))
{
dev->dv_bus_private = priv;
dev->dv_bus_resume = resume;
dev->dv_bus_suspend = suspend;
dev->dv_bus_shutdown = shutdown;
dev->dv_bus_deregister = deregister;
}
void
device_pmf_bus_deregister(device_t dev)
{
if (dev->dv_bus_deregister == NULL)
return;
(*dev->dv_bus_deregister)(dev);
dev->dv_bus_private = NULL;
dev->dv_bus_suspend = NULL;
dev->dv_bus_resume = NULL;
dev->dv_bus_deregister = NULL;
}
void *
device_pmf_class_private(device_t dev)
{
return dev->dv_class_private;
}
bool
device_pmf_class_suspend(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_CLASS_SUSPENDED) != 0)
return true;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_CLASS &&
dev->dv_class_suspend != NULL &&
!(*dev->dv_class_suspend)(dev, qual))
return false;
dev->dv_flags |= DVF_CLASS_SUSPENDED;
return true;
}
bool
device_pmf_class_resume(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0)
return true;
if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0 ||
(dev->dv_flags & DVF_DRIVER_SUSPENDED) != 0)
return false;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_CLASS &&
dev->dv_class_resume != NULL &&
!(*dev->dv_class_resume)(dev, qual))
return false;
dev->dv_flags &= ~DVF_CLASS_SUSPENDED;
return true;
}
void
device_pmf_class_register(device_t dev, void *priv,
bool (*suspend)(device_t, const pmf_qual_t *),
bool (*resume)(device_t, const pmf_qual_t *),
void (*deregister)(device_t))
{
dev->dv_class_private = priv;
dev->dv_class_suspend = suspend;
dev->dv_class_resume = resume;
dev->dv_class_deregister = deregister;
}
void
device_pmf_class_deregister(device_t dev)
{
if (dev->dv_class_deregister == NULL)
return;
(*dev->dv_class_deregister)(dev);
dev->dv_class_private = NULL;
dev->dv_class_suspend = NULL;
dev->dv_class_resume = NULL;
dev->dv_class_deregister = NULL;
}
bool
device_active(device_t dev, devactive_t type)
{
size_t i;
if (dev->dv_activity_count == 0)
return false;
for (i = 0; i < dev->dv_activity_count; ++i) {
if (dev->dv_activity_handlers[i] == NULL)
break;
(*dev->dv_activity_handlers[i])(dev, type);
}
return true;
}
bool
device_active_register(device_t dev, void (*handler)(device_t, devactive_t))
{
void (**new_handlers)(device_t, devactive_t);
void (**old_handlers)(device_t, devactive_t);
size_t i, old_size, new_size;
int s;
old_handlers = dev->dv_activity_handlers;
old_size = dev->dv_activity_count;
KASSERT(old_size == 0 || old_handlers != NULL);
for (i = 0; i < old_size; ++i) {
KASSERT(old_handlers[i] != handler);
if (old_handlers[i] == NULL) {
old_handlers[i] = handler;
return true;
}
}
new_size = old_size + 4;
new_handlers = kmem_alloc(sizeof(void *) * new_size, KM_SLEEP);
for (i = 0; i < old_size; ++i)
new_handlers[i] = old_handlers[i];
new_handlers[old_size] = handler;
for (i = old_size+1; i < new_size; ++i)
new_handlers[i] = NULL;
s = splhigh();
dev->dv_activity_count = new_size;
dev->dv_activity_handlers = new_handlers;
splx(s);
if (old_size > 0)
kmem_free(old_handlers, sizeof(void *) * old_size);
return true;
}
void
device_active_deregister(device_t dev, void (*handler)(device_t, devactive_t))
{
void (**old_handlers)(device_t, devactive_t);
size_t i, old_size;
int s;
old_handlers = dev->dv_activity_handlers;
old_size = dev->dv_activity_count;
for (i = 0; i < old_size; ++i) {
if (old_handlers[i] == handler)
break;
if (old_handlers[i] == NULL)
return; /* XXX panic? */
}
if (i == old_size)
return; /* XXX panic? */
for (; i < old_size - 1; ++i) {
if ((old_handlers[i] = old_handlers[i + 1]) != NULL)
continue;
if (i == 0) {
s = splhigh();
dev->dv_activity_count = 0;
dev->dv_activity_handlers = NULL;
splx(s);
kmem_free(old_handlers, sizeof(void *) * old_size);
}
return;
}
old_handlers[i] = NULL;
}
/* Return true iff the device_t `dv' exists at generation `gen'. */
static bool
device_exists_at(device_t dv, devgen_t gen)
{
return (dv->dv_del_gen == 0 || dv->dv_del_gen > gen) &&
dv->dv_add_gen <= gen;
}
static bool
deviter_visits(const deviter_t *di, device_t dv)
{
return device_exists_at(dv, di->di_gen);
}
/*
* Device Iteration
*
* deviter_t: a device iterator. Holds state for a "walk" visiting
* each device_t in the device tree.
*
* deviter_init(di, flags): initialize the device iterator `di'
* to "walk" the device tree. deviter_next(di) will return
* the first device_t in the device tree, or NULL if there are
* no devices.
*
* `flags' is one or more of DEVITER_F_RW, indicating that the
* caller intends to modify the device tree by calling
* config_detach(9) on devices in the order that the iterator
* returns them; DEVITER_F_ROOT_FIRST, asking for the devices
* nearest the "root" of the device tree to be returned, first;
* DEVITER_F_LEAVES_FIRST, asking for the devices furthest from
* the root of the device tree, first; and DEVITER_F_SHUTDOWN,
* indicating both that deviter_init() should not respect any
* locks on the device tree, and that deviter_next(di) may run
* in more than one LWP before the walk has finished.
*
* Only one DEVITER_F_RW iterator may be in the device tree at
* once.
*
* DEVITER_F_SHUTDOWN implies DEVITER_F_RW.
*
* Results are undefined if the flags DEVITER_F_ROOT_FIRST and
* DEVITER_F_LEAVES_FIRST are used in combination.
*
* deviter_first(di, flags): initialize the device iterator `di'
* and return the first device_t in the device tree, or NULL
* if there are no devices. The statement
*
* dv = deviter_first(di);
*
* is shorthand for
*
* deviter_init(di);
* dv = deviter_next(di);
*
* deviter_next(di): return the next device_t in the device tree,
* or NULL if there are no more devices. deviter_next(di)
* is undefined if `di' was not initialized with deviter_init() or
* deviter_first().
*
* deviter_release(di): stops iteration (subsequent calls to
* deviter_next() will return NULL), releases any locks and
* resources held by the device iterator.
*
* Device iteration does not return device_t's in any particular
* order. An iterator will never return the same device_t twice.
* Device iteration is guaranteed to complete---i.e., if deviter_next(di)
* is called repeatedly on the same `di', it will eventually return
* NULL. It is ok to attach/detach devices during device iteration.
*/
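/*
 * Example (illustrative sketch only) of the pattern described above:
 * walk every device from the root down and print its name.
 */
#if 0
static void
example_print_all_devices(void)
{
	deviter_t di;
	device_t dv;

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di))
		printf("%s\n", device_xname(dv));
	deviter_release(&di);
}
#endif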
void
deviter_init(deviter_t *di, deviter_flags_t flags)
{
device_t dv;
memset(di, 0, sizeof(*di));
if ((flags & DEVITER_F_SHUTDOWN) != 0)
flags |= DEVITER_F_RW;
mutex_enter(&alldevs_lock);
if ((flags & DEVITER_F_RW) != 0)
alldevs_nwrite++;
else
alldevs_nread++;
di->di_gen = alldevs_gen++;
di->di_flags = flags;
switch (di->di_flags & (DEVITER_F_LEAVES_FIRST|DEVITER_F_ROOT_FIRST)) {
case DEVITER_F_LEAVES_FIRST:
TAILQ_FOREACH(dv, &alldevs, dv_list) {
if (!deviter_visits(di, dv))
continue;
di->di_curdepth = MAX(di->di_curdepth, dv->dv_depth);
}
break;
case DEVITER_F_ROOT_FIRST:
TAILQ_FOREACH(dv, &alldevs, dv_list) {
if (!deviter_visits(di, dv))
continue;
di->di_maxdepth = MAX(di->di_maxdepth, dv->dv_depth);
}
break;
default:
break;
}
deviter_reinit(di);
mutex_exit(&alldevs_lock);
}
static void
deviter_reinit(deviter_t *di)
{
KASSERT(mutex_owned(&alldevs_lock));
if ((di->di_flags & DEVITER_F_RW) != 0)
di->di_prev = TAILQ_LAST(&alldevs, devicelist);
else
di->di_prev = TAILQ_FIRST(&alldevs);
}
device_t
deviter_first(deviter_t *di, deviter_flags_t flags)
{
deviter_init(di, flags);
return deviter_next(di);
}
static device_t
deviter_next2(deviter_t *di)
{
device_t dv;
KASSERT(mutex_owned(&alldevs_lock));
dv = di->di_prev;
if (dv == NULL)
return NULL;
if ((di->di_flags & DEVITER_F_RW) != 0)
di->di_prev = TAILQ_PREV(dv, devicelist, dv_list);
else
di->di_prev = TAILQ_NEXT(dv, dv_list);
return dv;
}
static device_t
deviter_next1(deviter_t *di)
{
device_t dv;
KASSERT(mutex_owned(&alldevs_lock));
do {
dv = deviter_next2(di);
} while (dv != NULL && !deviter_visits(di, dv));
return dv;
}
device_t
deviter_next(deviter_t *di)
{
device_t dv = NULL;
mutex_enter(&alldevs_lock);
switch (di->di_flags & (DEVITER_F_LEAVES_FIRST|DEVITER_F_ROOT_FIRST)) {
case 0:
dv = deviter_next1(di);
break;
case DEVITER_F_LEAVES_FIRST:
while (di->di_curdepth >= 0) {
if ((dv = deviter_next1(di)) == NULL) {
di->di_curdepth--;
deviter_reinit(di);
} else if (dv->dv_depth == di->di_curdepth)
break;
}
break;
case DEVITER_F_ROOT_FIRST:
while (di->di_curdepth <= di->di_maxdepth) {
if ((dv = deviter_next1(di)) == NULL) {
di->di_curdepth++;
deviter_reinit(di);
} else if (dv->dv_depth == di->di_curdepth)
break;
}
break;
default:
break;
}
mutex_exit(&alldevs_lock);
return dv;
}
void
deviter_release(deviter_t *di)
{
bool rw = (di->di_flags & DEVITER_F_RW) != 0;
mutex_enter(&alldevs_lock);
if (rw)
--alldevs_nwrite;
else
--alldevs_nread;
/* XXX wake a garbage-collection thread */
mutex_exit(&alldevs_lock);
}
const char *
cfdata_ifattr(const struct cfdata *cf)
{
return cf->cf_pspec->cfp_iattr;
}
bool
ifattr_match(const char *snull, const char *t)
{
return (snull == NULL) || strcmp(snull, t) == 0;
}
void
null_childdetached(device_t self, device_t child)
{
/* do nothing */
}
static void
sysctl_detach_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_BOOL, "detachall",
SYSCTL_DESCR("Detach all devices at shutdown"),
NULL, 0, &detachall, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
/* $NetBSD: umap_subr.c,v 1.29 2014/11/09 18:08:07 maxv Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_subr.c, v 1.11 1992/05/30 10:05:43 jsp Exp
* @(#)umap_subr.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: umap_subr.c,v 1.29 2014/11/09 18:08:07 maxv Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <miscfs/specfs/specdev.h>
#include <miscfs/umapfs/umap.h>
u_long umap_findid(u_long, u_long [][2], int);
int umap_node_alloc(struct mount *, struct vnode *,
struct vnode **);
/*
* umap_findid is called by various routines in umap_vnodeops.c to
* find a user or group id in a map.
*/
u_long
umap_findid(u_long id, u_long map[][2], int nentries)
{
int i;
/* Find uid entry in map */
i = 0;
while ((i<nentries) && ((map[i][0]) != id))
i++;
if (i < nentries)
return (map[i][1]);
else
return (-1);
}
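/*
 * Example (illustrative sketch only): a two-entry map as consumed by
 * umap_findid().  Column 0 is the id looked up, column 1 is the id
 * returned; the numbers are made up.
 */
#if 0
static u_long example_umap[][2] = {
	{ 1000, 2000 },
	{ 1001, 2001 },
};

/* umap_findid(1000, example_umap, 2) == 2000; unmapped ids return -1. */
#endif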
/*
* umap_reverse_findid is called by umap_getattr() in umap_vnodeops.c to
* find a user or group id in a map, in reverse.
*/
u_long
umap_reverse_findid(u_long id, u_long map[][2], int nentries)
{
int i;
/* Find uid entry in map */
i = 0;
while ((i<nentries) && ((map[i][1]) != id))
i++;
if (i < nentries)
return (map[i][0]);
else
return (-1);
}
/* umap_mapids maps all of the ids in a credential, both user and group. */
void
umap_mapids(struct mount *v_mount, kauth_cred_t credp)
{
int i, unentries, gnentries;
uid_t uid;
gid_t gid;
u_long (*usermap)[2], (*groupmap)[2];
gid_t groups[NGROUPS];
uint16_t ngroups;
if (credp == NOCRED || credp == FSCRED)
return;
unentries = MOUNTTOUMAPMOUNT(v_mount)->info_nentries;
usermap = MOUNTTOUMAPMOUNT(v_mount)->info_mapdata;
gnentries = MOUNTTOUMAPMOUNT(v_mount)->info_gnentries;
groupmap = MOUNTTOUMAPMOUNT(v_mount)->info_gmapdata;
/* Find uid entry in map */
uid = (uid_t) umap_findid(kauth_cred_geteuid(credp), usermap, unentries);
if (uid != -1)
kauth_cred_seteuid(credp, uid);
else
kauth_cred_seteuid(credp, (uid_t)NOBODY);
#if 1
/* cr_gid is the same as cr_groups[0] in 4BSD, but not in NetBSD */
/* Find gid entry in map */
gid = (gid_t) umap_findid(kauth_cred_getegid(credp), groupmap, gnentries);
if (gid != -1)
kauth_cred_setegid(credp, gid);
else
kauth_cred_setegid(credp, NULLGROUP);
#endif
/* Now we must map each of the groups in the cr_groups structure. */
ngroups = kauth_cred_ngroups(credp);
for (i = 0; i < ngroups; i++) {
/* XXX elad: can't we just skip cases where gid == -1? */
groups[i] = kauth_cred_group(credp, i);
gid = (gid_t) umap_findid(groups[i],
groupmap, gnentries);
if (gid != -1)
groups[i] = gid;
else
groups[i] = NULLGROUP;
}
kauth_cred_setgroups(credp, groups, ngroups, -1, UIO_SYSSPACE);
}
/* $NetBSD: scsipi_base.h,v 1.24 2017/02/26 10:58:47 maya Exp $ */
/*-
* Copyright (c) 1998, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _DEV_SCSIPI_SCSIPI_BASE_H_
#define _DEV_SCSIPI_SCSIPI_BASE_H_
struct scsipi_xfer *scsipi_get_xs(struct scsipi_periph *, int);
void scsipi_put_xs(struct scsipi_xfer *);
static __inline struct scsipi_xfer *scsipi_make_xs_internal(struct scsipi_periph *,
struct scsipi_generic *, int cmdlen, u_char *data_addr,
int datalen, int retries, int timeout, struct buf *,
int flags) __unused;
static __inline struct scsipi_xfer *scsipi_make_xs_unlocked(struct scsipi_periph *,
struct scsipi_generic *, int cmdlen, u_char *data_addr,
int datalen, int retries, int timeout, struct buf *,
int flags) __unused;
static __inline struct scsipi_xfer *scsipi_make_xs_locked(struct scsipi_periph *,
struct scsipi_generic *, int cmdlen, u_char *data_addr,
int datalen, int retries, int timeout, struct buf *,
int flags) __unused;
/*
* Make a scsipi_xfer, and return a pointer to it.
*/
static __inline struct scsipi_xfer *
scsipi_make_xs_internal(struct scsipi_periph *periph, struct scsipi_generic *cmd,
int cmdlen, u_char *data_addr, int datalen, int retries, int timeout,
struct buf *bp, int flags)
{
struct scsipi_xfer *xs;
if ((xs = scsipi_get_xs(periph, flags)) == NULL)
return (NULL);
/*
* Fill out the scsipi_xfer structure. We don't know whose context
* the cmd is in, so copy it.
*/
memcpy(&xs->cmdstore, cmd, cmdlen);
xs->cmd = &xs->cmdstore;
xs->cmdlen = cmdlen;
xs->data = data_addr;
xs->datalen = datalen;
xs->xs_retries = retries;
xs->timeout = timeout;
xs->bp = bp;
return (xs);
}
static __inline struct scsipi_xfer *
scsipi_make_xs_unlocked(struct scsipi_periph *periph, struct scsipi_generic *cmd,
int cmdlen, u_char *data_addr, int datalen, int retries, int timeout,
struct buf *bp, int flags)
{
return scsipi_make_xs_internal(periph, cmd, cmdlen, data_addr,
datalen, retries, timeout, bp, flags & ~XS_CTL_NOSLEEP);
}
static __inline struct scsipi_xfer *
scsipi_make_xs_locked(struct scsipi_periph *periph, struct scsipi_generic *cmd,
int cmdlen, u_char *data_addr, int datalen, int retries, int timeout,
struct buf *bp, int flags)
{
KDASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
return scsipi_make_xs_internal(periph, cmd, cmdlen, data_addr,
datalen, retries, timeout, bp, flags | XS_CTL_NOSLEEP);
}
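/*
 * Example (illustrative sketch only, not a real consumer): building a
 * simple TEST UNIT READY xfer with scsipi_make_xs_unlocked().  The
 * retry and timeout values are arbitrary.
 */
#if 0
static struct scsipi_xfer *
example_make_tur_xs(struct scsipi_periph *periph)
{
	struct scsi_test_unit_ready cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode = SCSI_TEST_UNIT_READY;
	return scsipi_make_xs_unlocked(periph,
	    (struct scsipi_generic *)&cmd, sizeof(cmd),
	    NULL, 0, /*retries*/ 2, /*timeout*/ 10000, NULL, 0);
}
#endif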
#endif /* _DEV_SCSIPI_SCSIPI_BASE_H_ */
/* $NetBSD: subr_fault.c,v 1.2 2020/06/30 16:28:17 maxv Exp $ */
/*
* Copyright (c) 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_fault.c,v 1.2 2020/06/30 16:28:17 maxv Exp $");
#include <sys/module.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/types.h>
#include <sys/specificdata.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/ioccom.h>
#include <sys/lwp.h>
#include <sys/fault.h>
typedef struct {
volatile bool enabled;
volatile bool oneshot;
volatile unsigned long nth;
volatile unsigned long cnt;
volatile unsigned long nfaults;
} fault_t;
static fault_t fault_global __cacheline_aligned = {
.enabled = false,
.oneshot = false,
.nth = FAULT_NTH_MIN,
.cnt = 0,
.nfaults = 0
};
static kmutex_t fault_global_lock __cacheline_aligned;
static specificdata_key_t fault_lwp_key;
/* -------------------------------------------------------------------------- */
bool
fault_inject(void)
{
volatile unsigned long cnt;
fault_t *f;
if (__predict_false(cold))
return false;
if (__predict_false(atomic_load_acquire(&fault_global.enabled))) {
f = &fault_global;
} else {
f = lwp_getspecific(fault_lwp_key);
if (__predict_true(f == NULL))
return false;
if (__predict_false(!f->enabled))
return false;
}
if (atomic_load_relaxed(&f->oneshot)) {
	if (__predict_true(atomic_load_relaxed(&f->nfaults) > 0))
return false;
}
cnt = atomic_inc_ulong_nv(&f->cnt);
if (__predict_false(cnt % atomic_load_relaxed(&f->nth) == 0)) {
	atomic_inc_ulong(&f->nfaults);
return true;
}
return false;
}
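/*
 * Example (illustrative sketch only): how a kernel code path might
 * consult fault_inject() to simulate a failure.  example_alloc() is
 * hypothetical.
 */
#if 0
static void *
example_alloc(size_t len)
{
	if (fault_inject())
		return NULL;	/* pretend the allocation failed */
	return kmem_alloc(len, KM_NOSLEEP);
}
#endif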
/* -------------------------------------------------------------------------- */
static int
fault_open(dev_t dev, int flag, int mode, struct lwp *l)
{
return 0;
}
static int
fault_close(dev_t dev, int flag, int mode, struct lwp *l)
{
return 0;
}
static int
fault_ioc_enable(struct fault_ioc_enable *args)
{
fault_t *f;
if (args->mode != FAULT_MODE_NTH_ONESHOT)
return EINVAL;
if (args->nth < FAULT_NTH_MIN)
return EINVAL;
switch (args->scope) {
case FAULT_SCOPE_GLOBAL:
mutex_enter(&fault_global_lock);
if (fault_global.enabled) {
mutex_exit(&fault_global_lock);
return EEXIST;
}
fault_global.oneshot = true;
atomic_store_relaxed(&fault_global.nth, args->nth);
fault_global.cnt = 0;
fault_global.nfaults = 0;
atomic_store_release(&fault_global.enabled, true);
mutex_exit(&fault_global_lock);
break;
case FAULT_SCOPE_LWP:
f = lwp_getspecific(fault_lwp_key);
if (f != NULL) {
if (f->enabled)
return EEXIST;
} else {
f = kmem_zalloc(sizeof(*f), KM_SLEEP);
lwp_setspecific(fault_lwp_key, f);
}
f->oneshot = true;
atomic_store_relaxed(&f->nth, args->nth);
f->cnt = 0;
f->nfaults = 0;
atomic_store_release(&f->enabled, true);
break;
default:
return EINVAL;
}
return 0;
}
static int
fault_ioc_disable(struct fault_ioc_disable *args)
{
fault_t *f;
switch (args->scope) {
case FAULT_SCOPE_GLOBAL:
mutex_enter(&fault_global_lock);
if (!fault_global.enabled) {
mutex_exit(&fault_global_lock);
return ENOENT;
}
atomic_store_release(&fault_global.enabled, false);
mutex_exit(&fault_global_lock);
break;
case FAULT_SCOPE_LWP:
f = lwp_getspecific(fault_lwp_key);
if (f == NULL)
return ENOENT;
if (!f->enabled)
return ENOENT;
atomic_store_release(&f->enabled, false);
break;
default:
return EINVAL;
}
return 0;
}
static int
fault_ioc_getinfo(struct fault_ioc_getinfo *args)
{
fault_t *f;
switch (args->scope) {
case FAULT_SCOPE_GLOBAL:
args->nfaults = atomic_load_relaxed(&fault_global.nfaults);
break;
case FAULT_SCOPE_LWP:
f = lwp_getspecific(fault_lwp_key);
if (f == NULL)
return ENOENT;
args->nfaults = atomic_load_relaxed(&f->nfaults);
break;
default:
return EINVAL;
}
return 0;
}
static int
fault_ioctl(dev_t dev, u_long cmd, void *addr, int flag, struct lwp *l)
{
switch (cmd) {
case FAULT_IOC_ENABLE:
return fault_ioc_enable(addr);
case FAULT_IOC_DISABLE:
return fault_ioc_disable(addr);
case FAULT_IOC_GETINFO:
return fault_ioc_getinfo(addr);
default:
return EINVAL;
}
}
const struct cdevsw fault_cdevsw = {
.d_open = fault_open,
.d_close = fault_close,
.d_read = noread,
.d_write = nowrite,
.d_ioctl = fault_ioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
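/*
 * Example (illustrative sketch only): enabling global fault injection
 * from userland through the ioctl interface above.  The device node
 * path ("/dev/fault") and the userland headers are assumptions.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/fault.h>
#include <fcntl.h>

static int
enable_global_faults(unsigned long nth)
{
	struct fault_ioc_enable args = {
		.scope = FAULT_SCOPE_GLOBAL,
		.mode = FAULT_MODE_NTH_ONESHOT,
		.nth = nth,
	};
	int fd = open("/dev/fault", O_RDWR);

	if (fd == -1)
		return -1;
	return ioctl(fd, FAULT_IOC_ENABLE, &args);
}
#endif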
/* -------------------------------------------------------------------------- */
MODULE(MODULE_CLASS_MISC, fault, NULL);
static void
fault_lwp_free(void *arg)
{
fault_t *f = (fault_t *)arg;
if (f == NULL) {
return;
}
kmem_free(f, sizeof(*f));
}
static void
fault_init(void)
{
mutex_init(&fault_global_lock, MUTEX_DEFAULT, IPL_NONE);
lwp_specific_key_create(&fault_lwp_key, fault_lwp_free);
}
static int
fault_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
fault_init();
return 0;
case MODULE_CMD_FINI:
return EINVAL;
default:
return ENOTTY;
}
}
/* $NetBSD: tcp_syncache.c,v 1.6 2022/11/04 09:01:53 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*-
* Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
* 2011 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Coyote Point Systems, Inc.
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
* This code is derived from software contributed to The NetBSD Foundation
* by Rui Paulo.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
*/
/*
* TODO list for SYN cache stuff:
*
* Find room for a "state" field, which is needed to keep a
* compressed state for TIME_WAIT TCBs. It's been noted already
* that this is fairly important for very high-volume web and
* mail servers, which use a large number of short-lived
* connections.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.6 2022/11/04 09:01:53 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/lwp.h> /* for lwp0 */
#include <sys/cprng.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>
#include <netinet/tcp_syncache.h>
#ifdef TCP_SIGNATURE
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#endif /* IPSEC*/
#endif
static void syn_cache_timer(void *);
static struct syn_cache *
syn_cache_lookup(const struct sockaddr *, const struct sockaddr *,
struct syn_cache_head **);
static int syn_cache_respond(struct syn_cache *);
/* syn hash parameters */
#define TCP_SYN_HASH_SIZE 293
#define TCP_SYN_BUCKET_SIZE 35
static int tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
static struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];
/*
* TCP compressed state engine. Currently used to hold compressed
* state for SYN_RECEIVED.
*/
u_long syn_cache_count;
static u_int32_t syn_hash1, syn_hash2;
#define SYN_HASH(sa, sp, dp) \
((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
#define SYN_HASHALL(hash, src, dst) \
do { \
hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
((const struct sockaddr_in *)(src))->sin_port, \
((const struct sockaddr_in *)(dst))->sin_port); \
} while (/*CONSTCOND*/ 0)
#else
#define SYN_HASH6(sa, sp, dp) \
((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
(((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
& 0x7fffffff)
#define SYN_HASHALL(hash, src, dst) \
do { \
switch ((src)->sa_family) { \
case AF_INET: \
hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
((const struct sockaddr_in *)(src))->sin_port, \
((const struct sockaddr_in *)(dst))->sin_port); \
break; \
case AF_INET6: \
hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
((const struct sockaddr_in6 *)(src))->sin6_port, \
((const struct sockaddr_in6 *)(dst))->sin6_port); \
break; \
default: \
hash = 0; \
} \
} while (/*CONSTCOND*/0)
#endif /* INET6 */
static struct pool syn_cache_pool;
/*
* We don't estimate RTT with SYNs, so each packet starts with the default
* RTT and each timer step has a fixed timeout value.
*/
static inline void
syn_cache_timer_arm(struct syn_cache *sc)
{
TCPT_RANGESET(sc->sc_rxtcur,
TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
TCPTV_REXMTMAX);
callout_reset(&sc->sc_timer,
sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc);
}
#define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase)
static inline void
syn_cache_rm(struct syn_cache *sc)
{
TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
sc, sc_bucketq);
sc->sc_tp = NULL;
LIST_REMOVE(sc, sc_tpq);
tcp_syn_cache[sc->sc_bucketidx].sch_length--;
callout_stop(&sc->sc_timer);
syn_cache_count--;
}
static inline void
syn_cache_put(struct syn_cache *sc)
{
if (sc->sc_ipopts)
	(void) m_free(sc->sc_ipopts);
rtcache_free(&sc->sc_route);
sc->sc_flags |= SCF_DEAD;
if (!callout_invoking(&sc->sc_timer))
	callout_schedule(&(sc)->sc_timer, 1);
}
void
syn_cache_init(void)
{
int i;
pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
"synpl", NULL, IPL_SOFTNET);
/* Initialize the hash buckets. */
for (i = 0; i < tcp_syn_cache_size; i++)
TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
}
void
syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
{
struct syn_cache_head *scp;
struct syn_cache *sc2;
int s;
/*
* If there are no entries in the hash table, reinitialize
* the hash secrets.
*/
if (syn_cache_count == 0) {
syn_hash1 = cprng_fast32();
syn_hash2 = cprng_fast32();
}
SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
scp = &tcp_syn_cache[sc->sc_bucketidx];
/*
* Make sure that we don't overflow the per-bucket
* limit or the total cache size limit.
*/
s = splsoftnet();
if (scp->sch_length >= tcp_syn_bucket_limit) {
TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
/*
* The bucket is full. Toss the oldest element in the
* bucket. This will be the first entry in the bucket.
*/
sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
/*
* This should never happen; we should always find an
* entry in our bucket.
*/
if (sc2 == NULL)
panic("syn_cache_insert: bucketoverflow: impossible");
#endif
syn_cache_rm(sc2);
syn_cache_put(sc2); /* calls pool_put but see spl above */
} else if (syn_cache_count >= tcp_syn_cache_limit) {
struct syn_cache_head *scp2, *sce;
TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
/*
* The cache is full. Toss the oldest entry in the
* first non-empty bucket we can find.
*
* XXX We would really like to toss the oldest
* entry in the cache, but we hope that this
* condition doesn't happen very often.
*/
scp2 = scp;
if (TAILQ_EMPTY(&scp2->sch_bucket)) {
sce = &tcp_syn_cache[tcp_syn_cache_size];
for (++scp2; scp2 != scp; scp2++) {
if (scp2 >= sce)
scp2 = &tcp_syn_cache[0];
if (! TAILQ_EMPTY(&scp2->sch_bucket))
break;
}
#ifdef DIAGNOSTIC
/*
* This should never happen; we should always find a
* non-empty bucket.
*/
if (scp2 == scp)
panic("syn_cache_insert: cacheoverflow: "
"impossible");
#endif
}
sc2 = TAILQ_FIRST(&scp2->sch_bucket);
syn_cache_rm(sc2);
syn_cache_put(sc2); /* calls pool_put but see spl above */
}
/*
* Initialize the entry's timer.
*/
sc->sc_rxttot = 0;
sc->sc_rxtshift = 0;
syn_cache_timer_arm(sc);
/* Link it from tcpcb entry */
LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
/* Put it into the bucket. */
TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
scp->sch_length++;
syn_cache_count++;
TCP_STATINC(TCP_STAT_SC_ADDED);
splx(s);
}
/*
* Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
* If we have retransmitted an entry the maximum number of times, expire
* that entry.
*/
static void
syn_cache_timer(void *arg)
{
struct syn_cache *sc = arg;
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
callout_ack(&sc->sc_timer);
if (__predict_false(sc->sc_flags & SCF_DEAD)) {
TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
goto free;
}
if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
/* Drop it -- too many retransmissions. */
goto dropit;
}
/*
* Compute the total amount of time this entry has
* been on a queue. If this entry has been on longer
* than the keep alive timer would allow, expire it.
*/
sc->sc_rxttot += sc->sc_rxtcur;
if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS))
goto dropit;
TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
(void)syn_cache_respond(sc);
/* Advance the timer back-off. */
sc->sc_rxtshift++;
syn_cache_timer_arm(sc);
goto out;
dropit:
TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
syn_cache_rm(sc);
if (sc->sc_ipopts)
(void) m_free(sc->sc_ipopts);
rtcache_free(&sc->sc_route);
free:
callout_destroy(&sc->sc_timer);
pool_put(&syn_cache_pool, sc);
out:
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
}
/*
 * Remove the syn cache entries created by the specified tcb entry,
 * since it makes no sense to keep them around
 * (if there's no tcb entry, the syn cache entries will never be used).
 */
void
syn_cache_cleanup(struct tcpcb *tp)
{
struct syn_cache *sc, *nsc;
int s;
s = splsoftnet();
for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
nsc = LIST_NEXT(sc, sc_tpq);
#ifdef DIAGNOSTIC
if (sc->sc_tp != tp)
panic("invalid sc_tp in syn_cache_cleanup");
#endif
syn_cache_rm(sc);
syn_cache_put(sc); /* calls pool_put but see spl above */
}
/* just for safety */
LIST_INIT(&tp->t_sc);
splx(s);
}
/*
* Find an entry in the syn cache.
*/
static struct syn_cache *
syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
struct syn_cache_head **headp)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
u_int32_t hash;
int s;
SYN_HASHALL(hash, src, dst);
scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
*headp = scp;
s = splsoftnet();
for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
sc = TAILQ_NEXT(sc, sc_bucketq)) {
if (sc->sc_hash != hash)
continue;
if (!memcmp(&sc->sc_src, src, src->sa_len) &&
!memcmp(&sc->sc_dst, dst, dst->sa_len)) {
splx(s);
return (sc);
}
}
splx(s);
return (NULL);
}
/*
* This function gets called when we receive an ACK for a socket in the
* LISTEN state. We look up the connection in the syn cache, and if it's
* there, we pull it out of the cache and turn it into a full-blown
* connection in the SYN-RECEIVED state.
*
* The return values may not be immediately obvious, and their effects
* can be subtle, so here they are:
*
* NULL SYN was not found in cache; caller should drop the
* packet and send an RST.
*
* -1 We were unable to create the new connection, and are
* aborting it. An ACK,RST is being sent to the peer
* (unless we got screwy sequence numbers; see below),
* because the 3-way handshake has been completed. Caller
* should not free the mbuf, since we may be using it. If
* we are not, we will free it.
*
* Otherwise, the return value is a pointer to the new socket
* associated with the connection.
*/
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
struct tcphdr *th, struct socket *so, struct mbuf *m)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
struct inpcb *inp = NULL;
struct tcpcb *tp;
int s;
struct socket *oso;
s = splsoftnet();
if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
splx(s);
return NULL;
}
/*
* Verify the sequence and ack numbers. Try getting the correct
* response again.
*/
if ((th->th_ack != sc->sc_iss + 1) ||
SEQ_LEQ(th->th_seq, sc->sc_irs) ||
SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
m_freem(m);
(void)syn_cache_respond(sc);
splx(s);
return ((struct socket *)(-1));
}
/* Remove this cache entry */
syn_cache_rm(sc);
splx(s);
/*
* Ok, create the full blown connection, and set things up
* as they would have been set up if we had created the
* connection when the SYN arrived. If we can't create
* the connection, abort it.
*/
/*
* inp still has the OLD in_pcb stuff, set the
* v6-related flags on the new guy, too. This is
* done particularly for the case where an AF_INET6
* socket is bound only to a port, and a v4 connection
* comes in on that port.
* we also copy the flowinfo from the original pcb
* to the new one.
*/
oso = so;
so = sonewconn(so, true);
if (so == NULL)
goto resetandabort;
inp = sotoinpcb(so);
switch (src->sa_family) {
case AF_INET:
if (inp->inp_af == AF_INET) {
in4p_laddr(inp) = ((struct sockaddr_in *)dst)->sin_addr;
inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
inp->inp_options = ip_srcroute(m);
inpcb_set_state(inp, INP_BOUND);
if (inp->inp_options == NULL) {
inp->inp_options = sc->sc_ipopts;
sc->sc_ipopts = NULL;
}
}
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
/* IPv4 packet to AF_INET6 socket */
memset(&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp)));
in6p_laddr(inp).s6_addr16[5] = htons(0xffff);
bcopy(&((struct sockaddr_in *)dst)->sin_addr,
&in6p_laddr(inp).s6_addr32[3],
sizeof(((struct sockaddr_in *)dst)->sin_addr));
inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
intotcpcb(inp)->t_family = AF_INET;
if (sotoinpcb(oso)->inp_flags & IN6P_IPV6_V6ONLY)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
else
inp->inp_flags &= ~IN6P_IPV6_V6ONLY;
inpcb_set_state(inp, INP_BOUND);
}
#endif
break;
#ifdef INET6
case AF_INET6:
if (inp->inp_af == AF_INET6) {
in6p_laddr(inp) = ((struct sockaddr_in6 *)dst)->sin6_addr;
inp->inp_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
inpcb_set_state(inp, INP_BOUND);
}
break;
#endif
}
#ifdef INET6
if (inp && intotcpcb(inp)->t_family == AF_INET6 && sotoinpcb(oso)) {
struct inpcb *oinp = sotoinpcb(oso);
/* inherit socket options from the listening socket */
inp->inp_flags |= (oinp->inp_flags & IN6P_CONTROLOPTS);
if (inp->inp_flags & IN6P_CONTROLOPTS) {
m_freem(inp->inp_options);
inp->inp_options = NULL;
}
ip6_savecontrol(inp, &inp->inp_options,
mtod(m, struct ip6_hdr *), m);
}
#endif
/*
* Give the new socket our cached route reference.
*/
rtcache_copy(&inp->inp_route, &sc->sc_route);
rtcache_free(&sc->sc_route);
if (inp->inp_af == AF_INET) {
struct sockaddr_in sin;
memcpy(&sin, src, src->sa_len);
if (inpcb_connect(inp, &sin, &lwp0)) {
goto resetandabort;
}
}
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
struct sockaddr_in6 sin6;
memcpy(&sin6, src, src->sa_len);
if (src->sa_family == AF_INET) {
/* IPv4 packet to AF_INET6 socket */
in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
}
if (in6pcb_connect(inp, &sin6, NULL)) {
goto resetandabort;
}
}
#endif
else {
goto resetandabort;
}
tp = intotcpcb(inp);
tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
if (sc->sc_request_r_scale != 15) {
tp->requested_s_scale = sc->sc_requested_s_scale;
tp->request_r_scale = sc->sc_request_r_scale;
tp->snd_scale = sc->sc_requested_s_scale;
tp->rcv_scale = sc->sc_request_r_scale;
tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
}
if (sc->sc_flags & SCF_TIMESTAMP)
tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
tp->ts_timebase = sc->sc_timebase;
tp->t_template = tcp_template(tp);
if (tp->t_template == 0) {
tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
so = NULL;
m_freem(m);
goto abort;
}
tp->iss = sc->sc_iss;
tp->irs = sc->sc_irs;
tcp_sendseqinit(tp);
tcp_rcvseqinit(tp);
tp->t_state = TCPS_SYN_RECEIVED;
TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
TCP_STATINC(TCP_STAT_ACCEPTS);
if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
tp->t_flags |= TF_WILL_SACK;
if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
tp->t_flags |= TF_ECN_PERMIT;
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE)
tp->t_flags |= TF_SIGNATURE;
#endif
/* Initialize tp->t_ourmss before we deal with the peer's! */
tp->t_ourmss = sc->sc_ourmaxseg;
tcp_mss_from_peer(tp, sc->sc_peermaxseg);
/*
* Initialize the initial congestion window. If we
* had to retransmit the SYN,ACK, we must initialize cwnd
* to 1 segment (i.e. the Loss Window).
*/
if (sc->sc_rxtshift)
tp->snd_cwnd = tp->t_peermss;
else {
int ss = tcp_init_win;
if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp)))
ss = tcp_init_win_local;
#ifdef INET6
else if (inp->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(inp)))
ss = tcp_init_win_local;
#endif
tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
}
tcp_rmx_rtt(tp);
tp->snd_wl1 = sc->sc_irs;
tp->rcv_up = sc->sc_irs + 1;
/*
* This is what would have happened in tcp_output() when
* the SYN,ACK was sent.
*/
tp->snd_up = tp->snd_una;
tp->snd_max = tp->snd_nxt = tp->iss+1;
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
TCP_STATINC(TCP_STAT_SC_COMPLETED);
s = splsoftnet();
syn_cache_put(sc);
splx(s);
return so;
resetandabort:
(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
abort:
if (so != NULL) {
(void) soqremque(so, 1);
(void) soabort(so);
mutex_enter(softnet_lock);
}
s = splsoftnet();
syn_cache_put(sc);
splx(s);
TCP_STATINC(TCP_STAT_SC_ABORTED);
return ((struct socket *)(-1));
}
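/*
 * Example (illustrative sketch only): how a caller such as the
 * LISTEN-state ACK path in tcp_input() is expected to interpret the
 * return values documented above.  The helper name is hypothetical and
 * the real caller does considerably more work.
 */
#if 0
static struct socket *
example_complete_handshake(struct sockaddr *src, struct sockaddr *dst,
    struct tcphdr *th, struct socket *lso, struct mbuf *m)
{
	struct socket *newso;

	newso = syn_cache_get(src, dst, th, lso, m);
	if (newso == NULL) {
		/* Not in the cache: drop the packet and send an RST. */
		return NULL;
	}
	if (newso == (struct socket *)(-1)) {
		/*
		 * Found, but the new connection could not be created;
		 * syn_cache_get() may still own the mbuf, so it must
		 * not be freed here.
		 */
		return NULL;
	}
	return newso;	/* the new connection's socket */
}
#endif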
/*
* This function is called when we get a RST for a
* non-existent connection, so that we can see if the
* connection is in the syn cache. If it is, zap it.
*/
void
syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
int s = splsoftnet();
if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
splx(s);
return;
}
if (SEQ_LT(th->th_seq, sc->sc_irs) ||
SEQ_GT(th->th_seq, sc->sc_irs+1)) {
splx(s);
return;
}
syn_cache_rm(sc);
TCP_STATINC(TCP_STAT_SC_RESET);
syn_cache_put(sc); /* calls pool_put but see spl above */
splx(s);
}
void
syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
struct tcphdr *th)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
int s;
s = splsoftnet();
if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
splx(s);
return;
}
/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
if (ntohl(th->th_seq) != sc->sc_iss) {
splx(s);
return;
}
/*
* If we've retransmitted 3 times and this is our second error,
* we remove the entry. Otherwise, we allow it to continue on.
* This prevents us from incorrectly nuking an entry during a
* spurious network outage.
*
* See tcp_notify().
*/
if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
sc->sc_flags |= SCF_UNREACH;
splx(s);
return;
}
syn_cache_rm(sc);
TCP_STATINC(TCP_STAT_SC_UNREACH);
syn_cache_put(sc); /* calls pool_put but see spl above */
splx(s);
}
/*
* Given a LISTEN socket and an inbound SYN request, add this to the syn
* cache, and send back a segment:
* <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
* to the source.
*
* IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
* Doing so would require that we hold onto the data and deliver it
* to the application. However, if we are the target of a SYN-flood
* DoS attack, an attacker could send data which would eventually
* consume all available buffer space if it were ACKed. By not ACKing
* the data, we avoid this DoS scenario.
*/
int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
int optlen, struct tcp_opt_info *oi)
{
struct tcpcb tb, *tp;
long win;
struct syn_cache *sc;
struct syn_cache_head *scp;
struct mbuf *ipopts;
int s;
tp = sototcpcb(so);
/*
* Initialize some local state.
*/
win = sbspace(&so->so_rcv);
if (win > TCP_MAXWIN)
win = TCP_MAXWIN;
#ifdef TCP_SIGNATURE
if (optp || (tp->t_flags & TF_SIGNATURE))
#else
if (optp)
#endif
{
tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
#endif
tb.t_state = TCPS_LISTEN;
if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
return 0;
} else
tb.t_flags = 0;
switch (src->sa_family) {
case AF_INET:
/* Remember the IP options, if any. */
ipopts = ip_srcroute(m);
break;
default:
ipopts = NULL;
}
/*
* See if we already have an entry for this connection.
* If we do, resend the SYN,ACK. We do not count this
* as a retransmission (XXX though maybe we should).
*/
if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
TCP_STATINC(TCP_STAT_SC_DUPESYN);
if (ipopts) {
/*
* If we were remembering a previous source route,
* forget it and use the new one we've been given.
*/
if (sc->sc_ipopts)
(void)m_free(sc->sc_ipopts);
sc->sc_ipopts = ipopts;
}
sc->sc_timestamp = tb.ts_recent;
m_freem(m);
if (syn_cache_respond(sc) == 0) {
uint64_t *tcps = TCP_STAT_GETREF();
tcps[TCP_STAT_SNDACKS]++;
tcps[TCP_STAT_SNDTOTAL]++;
TCP_STAT_PUTREF();
}
return 1;
}
s = splsoftnet();
sc = pool_get(&syn_cache_pool, PR_NOWAIT);
splx(s);
if (sc == NULL) {
if (ipopts)
(void)m_free(ipopts);
return 0;
}
/*
* Fill in the cache, and put the necessary IP and TCP
* options into the reply.
*/
memset(sc, 0, sizeof(struct syn_cache));
callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
memcpy(&sc->sc_src, src, src->sa_len);
memcpy(&sc->sc_dst, dst, dst->sa_len);
sc->sc_flags = 0;
sc->sc_ipopts = ipopts;
sc->sc_irs = th->th_seq;
switch (src->sa_family) {
case AF_INET:
{
struct sockaddr_in *srcin = (void *)src;
struct sockaddr_in *dstin = (void *)dst;
sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
&srcin->sin_addr, dstin->sin_port,
srcin->sin_port, sizeof(dstin->sin_addr));
break;
}
#ifdef INET6
case AF_INET6:
{
struct sockaddr_in6 *srcin6 = (void *)src;
struct sockaddr_in6 *dstin6 = (void *)dst;
sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
&srcin6->sin6_addr, dstin6->sin6_port,
srcin6->sin6_port, sizeof(dstin6->sin6_addr));
break;
}
#endif
}
sc->sc_peermaxseg = oi->maxseg;
sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
sc->sc_win = win;
sc->sc_timebase = tcp_now - 1; /* see tcp_newtcpcb() */
sc->sc_timestamp = tb.ts_recent;
if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
(TF_REQ_TSTMP|TF_RCVD_TSTMP))
sc->sc_flags |= SCF_TIMESTAMP;
if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
(TF_RCVD_SCALE|TF_REQ_SCALE)) {
sc->sc_requested_s_scale = tb.requested_s_scale;
sc->sc_request_r_scale = 0;
/*
* Pick the smallest possible scaling factor that
* will still allow us to scale up to sb_max.
*
* We do this because there are broken firewalls that
* will corrupt the window scale option, leading to
* the other endpoint believing that our advertised
* window is unscaled. At scale factors larger than
* 5 the unscaled window will drop below 1500 bytes,
* leading to serious problems when traversing these
* broken firewalls.
*
* With the default sbmax of 256K, a scale factor
* of 3 will be chosen by this algorithm. Those who
* choose a larger sbmax should watch out
* for the compatibility problems mentioned above.
*
* RFC1323: The Window field in a SYN (i.e., a <SYN>
* or <SYN,ACK>) segment itself is never scaled.
*/
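/*
* Worked example (arithmetic only): with the default sb_max of
* 256 KiB (262144 bytes), 65535 << 2 = 262140 is still below
* sb_max but 65535 << 3 = 524280 is not, so the loop below
* settles on a request_r_scale of 3, matching the note above.
*/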
while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
(TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
sc->sc_request_r_scale++;
} else {
sc->sc_requested_s_scale = 15;
sc->sc_request_r_scale = 15;
}
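/*
* The value 15 assigned just above is outside the valid range
* (TCP_MAX_WINSHIFT is 14) and serves as a "window scaling not
* negotiated" sentinel; syn_cache_respond() omits the window
* scale option when sc_request_r_scale == 15.
*/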
if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
sc->sc_flags |= SCF_SACK_PERMIT;
/*
* ECN setup packet received.
*/
if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
sc->sc_flags |= SCF_ECN_PERMIT;
#ifdef TCP_SIGNATURE
if (tb.t_flags & TF_SIGNATURE)
sc->sc_flags |= SCF_SIGNATURE;
#endif
sc->sc_tp = tp;
m_freem(m);
if (syn_cache_respond(sc) == 0) {
uint64_t *tcps = TCP_STAT_GETREF();
tcps[TCP_STAT_SNDACKS]++;
tcps[TCP_STAT_SNDTOTAL]++;
TCP_STAT_PUTREF();
syn_cache_insert(sc, tp);
} else {
s = splsoftnet();
/*
* syn_cache_put() will try to schedule the timer, so
* we need to initialize it
*/
syn_cache_timer_arm(sc);
syn_cache_put(sc);
splx(s);
TCP_STATINC(TCP_STAT_SC_DROPPED);
}
return 1;
}
/*
* syn_cache_respond: (re)send SYN+ACK.
*
* Returns 0 on success.
*/
static int
syn_cache_respond(struct syn_cache *sc)
{
#ifdef INET6
struct rtentry *rt = NULL;
#endif
struct route *ro;
u_int8_t *optp;
int optlen, error;
u_int16_t tlen;
struct ip *ip = NULL;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif
struct tcpcb *tp;
struct tcphdr *th;
struct mbuf *m;
u_int hlen;
#ifdef TCP_SIGNATURE
struct secasvar *sav = NULL;
u_int8_t *sigp = NULL;
#endif
ro = &sc->sc_route;
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
hlen = sizeof(struct ip);
break;
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
break;
#endif
default:
return EAFNOSUPPORT;
}
/* Worst case scenario, since we don't know the option size yet. */
tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN;
KASSERT(max_linkhdr + tlen <= MCLBYTES);
/*
* Create the IP+TCP header from scratch.
*/
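/*
* Grab an mbuf header; if the worst-case header size does not fit
* in MHLEN, attach a cluster. Failure in either step ends up
* returning ENOBUFS below.
*/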
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m && (max_linkhdr + tlen) > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
m = NULL;
}
}
if (m == NULL)
return ENOBUFS;
MCLAIM(m, &tcp_tx_mowner);
tp = sc->sc_tp;
/* Fixup the mbuf. */
m->m_data += max_linkhdr;
m_reset_rcvif(m);
memset(mtod(m, void *), 0, tlen);
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
ip = mtod(m, struct ip *);
ip->ip_v = 4;
ip->ip_dst = sc->sc_src.sin.sin_addr;
ip->ip_src = sc->sc_dst.sin.sin_addr;
ip->ip_p = IPPROTO_TCP;
th = (struct tcphdr *)(ip + 1);
th->th_dport = sc->sc_src.sin.sin_port;
th->th_sport = sc->sc_dst.sin.sin_port;
break;
#ifdef INET6
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_vfc = IPV6_VERSION;
ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
ip6->ip6_nxt = IPPROTO_TCP;
/* ip6_plen will be updated in ip6_output() */
th = (struct tcphdr *)(ip6 + 1);
th->th_dport = sc->sc_src.sin6.sin6_port;
th->th_sport = sc->sc_dst.sin6.sin6_port;
break;
#endif
default:
panic("%s: impossible (1)", __func__);
}
th->th_seq = htonl(sc->sc_iss);
th->th_ack = htonl(sc->sc_irs + 1);
th->th_flags = TH_SYN|TH_ACK;
th->th_win = htons(sc->sc_win);
/* th_x2, th_sum, th_urp already 0 from memset */
/* Tack on the TCP options. */
optp = (u_int8_t *)(th + 1);
optlen = 0;
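/*
* MSS option: kind (TCPOPT_MAXSEG = 2), length (TCPOLEN_MAXSEG = 4),
* then our advertised MSS as a 16-bit value in network byte order.
*/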
*optp++ = TCPOPT_MAXSEG;
*optp++ = TCPOLEN_MAXSEG;
*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
*optp++ = sc->sc_ourmaxseg & 0xff;
optlen += TCPOLEN_MAXSEG;
if (sc->sc_request_r_scale != 15) {
*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
sc->sc_request_r_scale);
optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
}
if (sc->sc_flags & SCF_SACK_PERMIT) {
/* Let the peer know that we will SACK. */
*optp++ = TCPOPT_SACK_PERMITTED;
*optp++ = TCPOLEN_SACK_PERMITTED;
optlen += TCPOLEN_SACK_PERMITTED;
}
if (sc->sc_flags & SCF_TIMESTAMP) {
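/*
* Pad with NOPs until optlen % 4 == 2, so that the two 32-bit
* timestamp values following the 2-byte kind/length header end
* up 4-byte aligned.
*/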
while (optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
*optp++ = TCPOPT_TIMESTAMP;
*optp++ = TCPOLEN_TIMESTAMP;
u_int32_t *lp = (u_int32_t *)(optp);
/* Form timestamp option as shown in appendix A of RFC 1323. */
*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
*lp = htonl(sc->sc_timestamp);
optp += TCPOLEN_TIMESTAMP - 2;
optlen += TCPOLEN_TIMESTAMP;
}
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE) {
sav = tcp_signature_getsav(m);
if (sav == NULL) {
m_freem(m);
return EPERM;
}
*optp++ = TCPOPT_SIGNATURE;
*optp++ = TCPOLEN_SIGNATURE;
sigp = optp;
memset(optp, 0, TCP_SIGLEN);
optp += TCP_SIGLEN;
optlen += TCPOLEN_SIGNATURE;
}
#endif
/*
* Terminate and pad TCP options to a 4 byte boundary.
*
* According to RFC793: "The content of the header beyond the
* End-of-Option option must be header padding (i.e., zero)."
* And later: "The padding is composed of zeros."
*/
if (optlen % 4) {
optlen += TCPOLEN_EOL;
*optp++ = TCPOPT_EOL;
}
while (optlen % 4) {
optlen += TCPOLEN_PAD;
*optp++ = TCPOPT_PAD;
}
/* Compute the actual values now that we've added the options. */
tlen = hlen + sizeof(struct tcphdr) + optlen;
m->m_len = m->m_pkthdr.len = tlen;
th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
#ifdef TCP_SIGNATURE
if (sav) {
(void)tcp_signature(m, th, hlen, sav, sigp);
key_sa_recordxfer(sav, m);
KEY_SA_UNREF(&sav);
}
#endif
/*
* Send ECN SYN-ACK setup packet.
* Routes can be asymmetric, so, even if we receive a packet
* with ECE and CWR set, we must not assume no one will block
* the ECE packet we are about to send.
*/
if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
th->th_flags |= TH_ECE;
TCP_STATINC(TCP_STAT_ECN_SHS);
/*
* draft-ietf-tcpm-ecnsyn-00.txt
*
* "[...] a TCP node MAY respond to an ECN-setup
* SYN packet by setting ECT in the responding
* ECN-setup SYN/ACK packet, indicating to routers
* that the SYN/ACK packet is ECN-Capable.
* This allows a congested router along the path
* to mark the packet instead of dropping the
* packet as an indication of congestion."
*
* "[...] There can be a great benefit in setting
* an ECN-capable codepoint in SYN/ACK packets [...]
* Congestion is most likely to occur in
* the server-to-client direction. As a result,
* setting an ECN-capable codepoint in SYN/ACK
* packets can reduce the occurrence of three-second
* retransmit timeouts resulting from the drop
* of SYN/ACK packets."
*
* Page 4 and 6, January 2006.
*/
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
ip->ip_tos |= IPTOS_ECN_ECT0;
break;
#ifdef INET6
case AF_INET6:
ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
break;
#endif
}
TCP_STATINC(TCP_STAT_ECN_ECT);
}
/*
* Compute the packet's checksum.
*
* Fill in some straggling IP bits. Note the stack expects
* ip_len to be in host order, for convenience.
*/
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
ip->ip_len = htons(tlen - hlen);
th->th_sum = 0;
th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
ip->ip_len = htons(tlen);
ip->ip_ttl = ip_defttl;
/* XXX tos? */
break;
#ifdef INET6
case AF_INET6:
ip6->ip6_plen = htons(tlen - hlen);
th->th_sum = 0;
th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_plen = htons(tlen - hlen);
/* ip6_hlim will be initialized afterwards */
/* XXX flowlabel? */
break;
#endif
}
/* XXX use IPsec policy on listening socket, on SYN ACK */
tp = sc->sc_tp;
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
error = ip_output(m, sc->sc_ipopts, ro,
(ip_mtudisc ? IP_MTUDISC : 0),
NULL, tp ? tp->t_inpcb : NULL);
break;
#ifdef INET6
case AF_INET6:
ip6->ip6_hlim = in6pcb_selecthlim(NULL,
(rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
rtcache_unref(rt, ro);
error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL,
tp ? tp->t_inpcb : NULL, NULL);
break;
#endif
default:
panic("%s: impossible (2)", __func__);
}
return error;
}
/* $NetBSD: vm_machdep.c,v 1.46 2023/10/06 11:53:27 skrll Exp $ */
/*-
* Copyright (c) 1982, 1986 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department, and William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91
*/
/*-
* Copyright (c) 1995 Charles M. Hannum. All rights reserved.
* Copyright (c) 1989, 1990 William Jolitz
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department, and William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91
*/
/*
* Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vm_machdep.c,v 1.46 2023/10/06 11:53:27 skrll Exp $");
#include "opt_mtrr.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/core.h>
#include <sys/exec.h>
#include <sys/ptrace.h>
#include <uvm/uvm.h>
#include <machine/cpu.h>
#include <machine/gdt.h>
#include <machine/reg.h>
#include <machine/specialreg.h>
#ifdef MTRR
#include <machine/mtrr.h>
#endif
#include <x86/fpu.h>
#include <x86/dbregs.h>
extern struct pool x86_dbregspl;
void
cpu_proc_fork(struct proc *p1, struct proc *p2)
{
p2->p_md.md_flags = p1->p_md.md_flags;
}
/*
* cpu_lwp_fork: finish a new LWP (l2) operation.
*
* First LWP (l1) is the process being forked. If it is &lwp0, then we
* are creating a kthread, where return path and argument are specified
* with `func' and `arg'.
*
* If an alternate user-level stack is requested (with non-zero values
* in both the stack and stacksize arguments), then set up the user stack
* pointer accordingly.
*/
void
cpu_lwp_fork(struct lwp *l1, struct lwp *l2, void *stack, size_t stacksize,
void (*func)(void *), void *arg)
{
struct pcb *pcb1, *pcb2;
struct trapframe *tf;
struct switchframe *sf;
vaddr_t uv;
KASSERT(l1 == curlwp || l1 == &lwp0);
pcb1 = lwp_getpcb(l1);
pcb2 = lwp_getpcb(l2);
/* Copy the PCB from parent, except the FPU state. */
memcpy(pcb2, pcb1, offsetof(struct pcb, pcb_savefpu));
/* Fork the FPU state. */
fpu_lwp_fork(l1, l2);
/* Never inherit CPU Debug Registers */
pcb2->pcb_dbregs = NULL;
pcb2->pcb_flags &= ~PCB_DBREGS;
#if defined(XENPV)
pcb2->pcb_iopl = IOPL_KPL;
#endif
/*
* Set the kernel stack address (from the address to uarea) and
* trapframe address for child.
*
* Rig kernel stack so that it would start out in lwp_trampoline()
* and call child_return() with l2 as an argument. This causes the
* newly-created child process to go directly to user level with a
* parent return value of 0 from fork(), while the parent process
* returns normally.
*/
uv = uvm_lwp_getuarea(l2);
KASSERT(uv % PAGE_SIZE == 0);
#ifdef __x86_64__
#ifdef SVS
pcb2->pcb_rsp0 = (uv + USPACE - PAGE_SIZE +
sizeof(struct trapframe));
KASSERT((pcb2->pcb_rsp0 & 0xF) == 0);
#else
pcb2->pcb_rsp0 = (uv + USPACE - 16);
#endif
tf = (struct trapframe *)pcb2->pcb_rsp0 - 1;
#else
pcb2->pcb_esp0 = (uv + USPACE - 16);
tf = (struct trapframe *)pcb2->pcb_esp0 - 1;
pcb2->pcb_iomap = NULL;
#endif
l2->l_md.md_regs = tf;
/*
* Copy the trapframe from parent, so that return to userspace
* will be to right address, with correct registers.
*/
memcpy(tf, l1->l_md.md_regs, sizeof(struct trapframe));
/* Child LWP might get aston() before returning to userspace. */
tf->tf_trapno = T_ASTFLT;
/* If specified, set a different user stack for a child. */
if (stack != NULL) {
#ifdef __x86_64__
tf->tf_rsp = (uint64_t)stack + stacksize;
#else
tf->tf_esp = (uint32_t)stack + stacksize;
#endif
}
l2->l_md.md_flags = l1->l_md.md_flags;
KASSERT(l2->l_md.md_astpending == 0);
sf = (struct switchframe *)tf - 1;
#ifdef __x86_64__
sf->sf_r12 = (uint64_t)func;
sf->sf_r13 = (uint64_t)arg;
sf->sf_rip = (uint64_t)lwp_trampoline;
pcb2->pcb_rsp = (uint64_t)sf;
pcb2->pcb_rbp = (uint64_t)l2;
#else
/*
* XXX Is there a reason sf->sf_edi isn't initialized here?
* Could this leak potentially sensitive information to new
* userspace processes?
*/
sf->sf_esi = (int)func;
sf->sf_ebx = (int)arg;
sf->sf_eip = (int)lwp_trampoline;
pcb2->pcb_esp = (int)sf;
pcb2->pcb_ebp = (int)l2;
#endif
}
/*
* cpu_lwp_free is called from exit() to let machine-dependent
* code free machine-dependent resources. Note that this routine
* must not block. NB: this may be called with l != curlwp in
* error paths.
*/
void
cpu_lwp_free(struct lwp *l, int proc)
{
if (l != curlwp)
return;
/* Abandon the FPU state. */
fpu_lwp_abandon(l);
/* Abandon the dbregs state. */
x86_dbregs_abandon(l);
#ifdef MTRR
if (proc && (l->l_proc->p_md.md_flags & MDP_USEDMTRR))
mtrr_clean(l->l_proc);
#endif
}
/*
* cpu_lwp_free2 is called when an LWP is being reaped.
* This routine may block.
*/
void
cpu_lwp_free2(struct lwp *l)
{
struct pcb *pcb;
pcb = lwp_getpcb(l);
KASSERT((pcb->pcb_flags & PCB_DBREGS) == 0);
if (pcb->pcb_dbregs) {
pool_put(&x86_dbregspl, pcb->pcb_dbregs);
pcb->pcb_dbregs = NULL;
}
}
/*
* Convert kernel VA to physical address
*/
paddr_t
kvtop(void *addr)
{
paddr_t pa;
bool ret __diagused;
ret = pmap_extract(pmap_kernel(), (vaddr_t)addr, &pa);
KASSERT(ret == true);
return pa;
}
/*
* Map a user I/O request into kernel virtual address space.
* Note: the pages are already locked by uvm_vslock(), so we
* do not need to pass an access_type to pmap_enter().
*/
int
vmapbuf(struct buf *bp, vsize_t len)
{
vaddr_t faddr, taddr, off;
paddr_t fpa;
KASSERT((bp->b_flags & B_PHYS) != 0);
bp->b_saveaddr = bp->b_data;
faddr = trunc_page((vaddr_t)bp->b_data);
off = (vaddr_t)bp->b_data - faddr;
len = round_page(off + len);
taddr = uvm_km_alloc(phys_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA);
bp->b_data = (void *)(taddr + off);
/*
* The region is locked, so we expect that pmap_extract() will return
* true.
* XXX: unwise to expect this in a multithreaded environment.
* Anything can happen to a pmap between the time we lock a
* region, release the pmap lock, and then relock it for
* the pmap_extract().
*
* No need to flush the TLB since we expect nothing to be mapped
* where we just allocated (TLB will be flushed when our
* mapping is removed).
*/
while (len) {
(void) pmap_extract(vm_map_pmap(&bp->b_proc->p_vmspace->vm_map),
faddr, &fpa);
pmap_kenter_pa(taddr, fpa, VM_PROT_READ|VM_PROT_WRITE, 0);
faddr += PAGE_SIZE;
taddr += PAGE_SIZE;
len -= PAGE_SIZE;
}
pmap_update(pmap_kernel());
return 0;
}
/*
* Unmap a previously-mapped user I/O request.
*/
void
vunmapbuf(struct buf *bp, vsize_t len)
{
vaddr_t addr, off;
KASSERT((bp->b_flags & B_PHYS) != 0);
addr = trunc_page((vaddr_t)bp->b_data);
off = (vaddr_t)bp->b_data - addr;
len = round_page(off + len);
pmap_kremove(addr, len);
pmap_update(pmap_kernel());
uvm_km_free(phys_map, addr, len, UVM_KMF_VAONLY);
bp->b_data = bp->b_saveaddr;
bp->b_saveaddr = 0;
}
#ifdef __HAVE_CPU_UAREA_ROUTINES
/*
* Layout of the uarea:
* Page[0] = PCB
* Page[1] = RedZone
* Page[2] = Stack
* Page[...] = Stack
* Page[UPAGES-1] = Stack
* Page[UPAGES] = RedZone
* There is a redzone at the beginning of the stack, and another one at the
* end. The former is to protect against deep recursions that could corrupt
* the PCB, the latter to protect against severe stack overflows.
*/
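/*
* cpu_uarea_alloc() below implements this by allocating
* USPACE + PAGE_SIZE of wired kernel VA and then unmapping and
* freeing the physical pages behind Page[1] and Page[UPAGES], so
* that any access to either red zone faults immediately.
*/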
void *
cpu_uarea_alloc(bool system)
{
vaddr_t base, va;
paddr_t pa;
base = uvm_km_alloc(kernel_map, USPACE + PAGE_SIZE, 0,
UVM_KMF_WIRED|UVM_KMF_WAITVA);
/* Page[1] = RedZone */
va = base + PAGE_SIZE;
if (!pmap_extract(pmap_kernel(), va, &pa)) {
panic("%s: impossible, Page[1] unmapped", __func__);
}
pmap_kremove(va, PAGE_SIZE);
uvm_pagefree(PHYS_TO_VM_PAGE(pa));
/* Page[UPAGES] = RedZone */
va = base + USPACE;
if (!pmap_extract(pmap_kernel(), va, &pa)) {
panic("%s: impossible, Page[UPAGES] unmapped", __func__);
}
pmap_kremove(va, PAGE_SIZE);
uvm_pagefree(PHYS_TO_VM_PAGE(pa));
pmap_update(pmap_kernel());
return (void *)base;
}
bool
cpu_uarea_free(void *addr)
{
vaddr_t base = (vaddr_t)addr;
KASSERT(!pmap_extract(pmap_kernel(), base + PAGE_SIZE, NULL));
KASSERT(!pmap_extract(pmap_kernel(), base + USPACE, NULL));
uvm_km_free(kernel_map, base, USPACE + PAGE_SIZE, UVM_KMF_WIRED);
return true;
}
#endif /* __HAVE_CPU_UAREA_ROUTINES */
/* $NetBSD: secmodel.c,v 1.2 2014/11/04 16:01:58 maxv Exp $ */
/*-
* Copyright (c) 2011 Elad Efrat <elad@NetBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/atomic.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <secmodel/secmodel.h>
#include <prop/proplib.h>
/* List of secmodels, parameters, and lock. */
static LIST_HEAD(, secmodel_descr) secmodels =
LIST_HEAD_INITIALIZER(secmodels);
static unsigned int secmodel_copy_cred_on_fork = false;
static krwlock_t secmodels_lock;
static int nsecmodels = 0; /* number of registered secmodels */
static int secmodel_plug(secmodel_t);
static int secmodel_unplug(secmodel_t);
int
secmodel_nsecmodels(void)
{
return nsecmodels;
}
void
secmodel_init(void)
{
rw_init(&secmodels_lock);
secmodel_copy_cred_on_fork = false;
}
/*
* Register a new secmodel.
*/
int
secmodel_register(secmodel_t *secmodel, const char *id, const char *name,
prop_dictionary_t behavior,
secmodel_eval_t eval, secmodel_setinfo_t setinfo)
{
int err;
secmodel_t sm;
sm = kmem_alloc(sizeof(*sm), KM_SLEEP);
sm->sm_id = id;
sm->sm_name = name;
sm->sm_behavior = behavior;
sm->sm_eval = eval;
sm->sm_setinfo = setinfo;
err = secmodel_plug(sm);
if (err == 0) {
atomic_inc_uint(&nsecmodels);
} else {
kmem_free(sm, sizeof(*sm));
sm = NULL;
}
*secmodel = sm;
return err;
}
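/*
* Hypothetical caller sketch (identifiers are illustrative, not taken
* from this file): a security model registers itself once and keeps
* the returned handle for later deregistration.
*
*	static secmodel_t example_sm;
*
*	error = secmodel_register(&example_sm, "org.example.secmodel",
*	    "Example model", NULL, NULL, NULL);
*	...
*	error = secmodel_deregister(example_sm);
*/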
/*
* Deregister a secmodel.
*/
int
secmodel_deregister(secmodel_t sm)
{
int error;
error = secmodel_unplug(sm);
if (error == 0) {
atomic_dec_uint(&nsecmodels);
kmem_free(sm, sizeof(*sm));
}
return error;
}
/*
* Lookup a secmodel by its id.
*
* Requires "secmodels_lock" handling by the caller.
*/
static secmodel_t
secmodel_lookup(const char *id)
{
secmodel_t tsm;
KASSERT(rw_lock_held(&secmodels_lock));
LIST_FOREACH(tsm, &secmodels, sm_list) {
if (strcasecmp(tsm->sm_id, id) == 0) {
return tsm;
}
}
return NULL;
}
/*
* Adjust system-global secmodel behavior following the addition
* or removal of a secmodel.
*
* Requires "secmodels_lock" to be held by the caller.
*/
static void
secmodel_adjust_behavior(secmodel_t sm, bool added)
{
bool r, b;
KASSERT(rw_write_held(&secmodels_lock));
#define ADJUST_COUNTER(which, added) \
do { \
if (added) { \
(which)++; \
} else { \
if ((which) > 0) \
(which)--; \
} \
} while (/*CONSTCOND*/0)
/* Copy credentials on fork? */
r = prop_dictionary_get_bool(sm->sm_behavior, "copy-cred-on-fork", &b);
if (r) {
ADJUST_COUNTER(secmodel_copy_cred_on_fork, added);
}
#undef ADJUST_COUNTER
}
static int
secmodel_plug(secmodel_t sm)
{
secmodel_t tsm;
int error = 0;
if (sm == NULL)
return EFAULT;
/* Check if the secmodel is already present. */
rw_enter(&secmodels_lock, RW_WRITER);
tsm = secmodel_lookup(sm->sm_id);
if (tsm != NULL) {
error = EEXIST;
goto out;
}
/* Add the secmodel. */
LIST_INSERT_HEAD(&secmodels, sm, sm_list);
/* Adjust behavior. */
secmodel_adjust_behavior(sm, true);
out:
/* Unlock the secmodels list. */
rw_exit(&secmodels_lock);
return error;
}
static int
secmodel_unplug(secmodel_t sm)
{
secmodel_t tsm;
int error = 0;
if (sm == NULL)
return EFAULT;
/* Make sure the secmodel is present. */
rw_enter(&secmodels_lock, RW_WRITER);
tsm = secmodel_lookup(sm->sm_id);
if (tsm == NULL) {
error = ENOENT;
goto out;
}
/* Remove the secmodel. */
LIST_REMOVE(tsm, sm_list);
/* Adjust behavior. */
secmodel_adjust_behavior(tsm, false);
out:
/* Unlock the secmodels list. */
rw_exit(&secmodels_lock);
return error;
}
/* XXX TODO */
int
secmodel_setinfo(const char *id, void *v, int *err)
{
return EOPNOTSUPP;
}
int
secmodel_eval(const char *id, const char *what, void *arg, void *ret)
{
secmodel_t sm;
int error = 0;
rw_enter(&secmodels_lock, RW_READER);
sm = secmodel_lookup(id);
if (sm == NULL) {
error = EINVAL;
goto out;
}
if (sm->sm_eval == NULL) {
error = ENOENT;
goto out;
}
if (ret == NULL) {
error = EFAULT;
goto out;
}
error = sm->sm_eval(what, arg, ret);
/* pass error from a secmodel(9) callback as a negative value */
error = -error;
out:
rw_exit(&secmodels_lock);
return error;
}
/* $NetBSD: in6_cksum.c,v 1.28 2011/04/25 22:05:05 yamt Exp $ */
/*-
* Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6_cksum.c,v 1.28 2011/04/25 22:05:05 yamt Exp $");
#include <sys/param.h>
#include <sys/mbuf.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
/*
* Checksum of the IPv6 pseudo header.
*
* off is supposed to be the skipped IPv6 header, len is the payload size.
*/
int
in6_cksum(struct mbuf *m, u_int8_t nxt, uint32_t off, uint32_t len)
{
union {
uint16_t words[16];
struct {
struct in6_addr ip6_src;
struct in6_addr ip6_dst;
} addrs;
} u;
const struct in6_addr *in6_src;
const struct in6_addr *in6_dst;
const struct ip6_hdr *ip6;
uint32_t sum;
const uint16_t *w;
const char *cp;
if (nxt == 0)
return cpu_in_cksum(m, len, off, 0);
if (__predict_false(off < sizeof(struct ip6_hdr)))
panic("in6_cksum: offset too short for IPv6 header");
if (__predict_false(m->m_len < sizeof(struct ip6_hdr)))
panic("in6_cksum: mbuf too short for IPv6 header");
/*
* Compute the equivalent of:
* struct ip6_hdr_pseudo ip6;
*
* bzero(&ip6, sizeof(ip6));
* ip6.ip6ph_nxt = nxt;
* ip6.ip6ph_len = htonl(len);
* ip6.ip6ph_src = mtod(m, struct ip6_hdr *)->ip6_src;
* in6_clearscope(&ip6.ip6ph_src);
* ip6.ip6ph_dst = mtod(m, struct ip6_hdr *)->ip6_dst;
* in6_clearscope(&ip6.ip6ph_dst);
* sum = one_add(&ip6);
*/
#if BYTE_ORDER == LITTLE_ENDIAN
sum = ((len & 0xffff) + ((len >> 16) & 0xffff) + nxt) << 8;
#else
sum = (len & 0xffff) + ((len >> 16) & 0xffff) + nxt;
#endif
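/*
* On a little-endian host the 16-bit words are accumulated in
* byte-swapped form, and shifting the partial sum left by 8 is, in
* effect (after the final end-around-carry fold), a byte swap, so
* the length/next-header contribution above ends up in the same
* byte order as the address words summed below.
*/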
cp = mtod(m, const char *);
w = (const uint16_t *)(cp + offsetof(struct ip6_hdr, ip6_src));
ip6 = (const void *)cp;
if (__predict_true((uintptr_t)w % 2 == 0)) {
in6_src = &ip6->ip6_src;
in6_dst = &ip6->ip6_dst;
} else {
memcpy(&u, &ip6->ip6_src, 32);
w = u.words;
in6_src = &u.addrs.ip6_src;
in6_dst = &u.addrs.ip6_dst;
}
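/*
* The second 16-bit word of a scope-embeddable (e.g. link-local)
* address carries the embedded zone ID, which the pseudo header
* must treat as zero (cf. the in6_clearscope() calls in the sketch
* above), so w[1] is skipped for such addresses.
*/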
sum += w[0];
if (!IN6_IS_SCOPE_EMBEDDABLE(in6_src))
sum += w[1];
sum += w[2];
sum += w[3];
sum += w[4];
sum += w[5];
sum += w[6];
sum += w[7];
w += 8;
sum += w[0];
if (!IN6_IS_SCOPE_EMBEDDABLE(in6_dst))
sum += w[1];
sum += w[2];
sum += w[3];
sum += w[4];
sum += w[5];
sum += w[6];
sum += w[7];
return cpu_in_cksum(m, len, off, sum);
}
/* $NetBSD: sys_sig.c,v 1.57 2023/10/04 20:42:38 ad Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sig.c 8.14 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_sig.c,v 1.57 2023/10/04 20:42:38 ad Exp $");
#include "opt_dtrace.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/wait.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/sdt.h>
#include <sys/compat_stub.h>
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE2(proc, kernel, , signal__clear,
"int", /* signal */
"ksiginfo_t *"); /* signal-info */
int
sys___sigaction_sigtramp(struct lwp *l,
const struct sys___sigaction_sigtramp_args *uap, register_t *retval)
{
/* {
syscallarg(int) signum;
syscallarg(const struct sigaction *) nsa;
syscallarg(struct sigaction *) osa;
syscallarg(void *) tramp;
syscallarg(int) vers;
} */
struct sigaction nsa, osa;
int error;
if (SCARG(uap, nsa)) {
error = copyin(SCARG(uap, nsa), &nsa, sizeof(nsa));
if (error)
return (error);
}
error = sigaction1(l, SCARG(uap, signum),
SCARG(uap, nsa) ? &nsa : 0, SCARG(uap, osa) ? &osa : 0,
SCARG(uap, tramp), SCARG(uap, vers));
if (error)
return (error);
if (SCARG(uap, osa)) {
error = copyout(&osa, SCARG(uap, osa), sizeof(osa));
if (error)
return (error);
}
return 0;
}
/*
* Manipulate signal mask. Note that we receive new mask, not pointer, and
* return old mask as return value; the library stub does the rest.
*/
int
sys___sigprocmask14(struct lwp *l, const struct sys___sigprocmask14_args *uap,
register_t *retval)
{
/* {
syscallarg(int) how;
syscallarg(const sigset_t *) set;
syscallarg(sigset_t *) oset;
} */
struct proc *p = l->l_proc;
sigset_t nss, oss;
int error;
if (SCARG(uap, set)) {
error = copyin(SCARG(uap, set), &nss, sizeof(nss));
if (error)
return error;
}
mutex_enter(p->p_lock);
error = sigprocmask1(l, SCARG(uap, how),
SCARG(uap, set) ? &nss : 0, SCARG(uap, oset) ? &oss : 0);
mutex_exit(p->p_lock);
if (error)
return error;
if (SCARG(uap, oset)) {
error = copyout(&oss, SCARG(uap, oset), sizeof(oss));
if (error)
return error;
}
return 0;
}
int
sys___sigpending14(struct lwp *l, const struct sys___sigpending14_args *uap,
register_t *retval)
{
/* {
syscallarg(sigset_t *) set;
} */
sigset_t ss;
sigpending1(l, &ss);
return copyout(&ss, SCARG(uap, set), sizeof(ss));
}
/*
* Suspend process until signal, providing mask to be set in the meantime.
* Note nonstandard calling convention: libc stub passes mask, not pointer,
* to save a copyin.
*/
int
sys___sigsuspend14(struct lwp *l, const struct sys___sigsuspend14_args *uap,
register_t *retval)
{
/* {
syscallarg(const sigset_t *) set;
} */
sigset_t ss;
int error;
if (SCARG(uap, set)) {
error = copyin(SCARG(uap, set), &ss, sizeof(ss));
if (error)
return error;
}
return sigsuspend1(l, SCARG(uap, set) ? &ss : 0);
}
int
sys___sigaltstack14(struct lwp *l, const struct sys___sigaltstack14_args *uap,
register_t *retval)
{
/* {
syscallarg(const struct sigaltstack *) nss;
syscallarg(struct sigaltstack *) oss;
} */
stack_t nss, oss;
int error;
if (SCARG(uap, nss)) {
error = copyin(SCARG(uap, nss), &nss, sizeof(nss));
if (error)
return error;
}
error = sigaltstack1(l,
SCARG(uap, nss) ? &nss : 0, SCARG(uap, oss) ? &oss : 0);
if (error)
return error;
if (SCARG(uap, oss)) {
error = copyout(&oss, SCARG(uap, oss), sizeof(oss));
if (error)
return error;
}
return 0;
}
int
kill1(struct lwp *l, pid_t pid, ksiginfo_t *ksi, register_t *retval)
{
int error;
struct proc *p;
if ((u_int)ksi->ksi_signo >= NSIG)
return EINVAL;
if (pid != l->l_proc->p_pid) {
if (ksi->ksi_pid != l->l_proc->p_pid)
return EPERM;
if (ksi->ksi_uid != kauth_cred_geteuid(l->l_cred))
return EPERM;
switch (ksi->ksi_code) {
case SI_USER:
case SI_QUEUE:
break;
default:
return EPERM;
}
}
if (pid > 0) {
/* kill single process */
mutex_enter(&proc_lock);
p = proc_find_raw(pid);
if (p == NULL || (p->p_stat != SACTIVE && p->p_stat != SSTOP)) {
mutex_exit(&proc_lock);
/* IEEE Std 1003.1-2001: return success for zombies */
return p ? 0 : ESRCH;
}
mutex_enter(p->p_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_SIGNAL, p, KAUTH_ARG(ksi->ksi_signo),
NULL, NULL);
if (!error && ksi->ksi_signo) {
error = kpsignal2(p, ksi);
}
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
return error;
}
switch (pid) {
case -1: /* broadcast signal */
return killpg1(l, ksi, 0, 1);
case 0: /* signal own process group */
return killpg1(l, ksi, 0, 0);
default: /* negative explicit process group */
return killpg1(l, ksi, -pid, 0);
}
/* NOTREACHED */
}
int
sys_sigqueueinfo(struct lwp *l, const struct sys_sigqueueinfo_args *uap,
register_t *retval)
{
/* {
syscallarg(pid_t) pid;
syscallarg(const siginfo_t *) info;
} */
ksiginfo_t ksi;
int error;
KSI_INIT(&ksi);
if ((error = copyin(&SCARG(uap, info)->_info, &ksi.ksi_info,
sizeof(ksi.ksi_info))) != 0)
return error;
return kill1(l, SCARG(uap, pid), &ksi, retval);
}
int
sys_kill(struct lwp *l, const struct sys_kill_args *uap, register_t *retval)
{
/* {
syscallarg(pid_t) pid;
syscallarg(int) signum;
} */
ksiginfo_t ksi;
KSI_INIT(&ksi);
ksi.ksi_signo = SCARG(uap, signum);
ksi.ksi_code = SI_USER;
ksi.ksi_pid = l->l_proc->p_pid;
ksi.ksi_uid = kauth_cred_geteuid(l->l_cred);
return kill1(l, SCARG(uap, pid), &ksi, retval);
}
int
sys_getcontext(struct lwp *l, const struct sys_getcontext_args *uap,
register_t *retval)
{
/* {
syscallarg(struct __ucontext *) ucp;
} */
struct proc *p = l->l_proc;
ucontext_t uc;
memset(&uc, 0, sizeof(uc));
mutex_enter(p->p_lock);
getucontext(l, &uc);
mutex_exit(p->p_lock);
return copyout(&uc, SCARG(uap, ucp), sizeof (*SCARG(uap, ucp)));
}
int
sys_setcontext(struct lwp *l, const struct sys_setcontext_args *uap,
register_t *retval)
{
/* {
syscallarg(const ucontext_t *) ucp;
} */
struct proc *p = l->l_proc;
ucontext_t uc;
int error;
error = copyin(SCARG(uap, ucp), &uc, sizeof (uc));
if (error)
return error;
if ((uc.uc_flags & _UC_CPU) == 0)
return EINVAL;
mutex_enter(p->p_lock);
error = setucontext(l, &uc);
mutex_exit(p->p_lock);
if (error)
return error;
return EJUSTRETURN;
}
/*
* sigtimedwait(2) system call, used also for implementation
* of sigwaitinfo() and sigwait().
*
* This only handles a single LWP in signal wait. libpthread provides
* its own sigtimedwait() wrapper to do the right thing for individual
* threads.
*/
int
sys_____sigtimedwait50(struct lwp *l,
const struct sys_____sigtimedwait50_args *uap, register_t *retval)
{
return sigtimedwait1(l, uap, retval, copyin, copyout, copyin, copyout);
}
int
sigaction1(struct lwp *l, int signum, const struct sigaction *nsa,
struct sigaction *osa, const void *tramp, int vers)
{
struct proc *p;
struct sigacts *ps;
sigset_t tset;
int prop, error;
ksiginfoq_t kq;
static bool v0v1valid;
if (signum <= 0 || signum >= NSIG)
return EINVAL;
p = l->l_proc;
error = 0;
ksiginfo_queue_init(&kq);
/*
* Trampoline ABI version __SIGTRAMP_SIGCODE_VERSION (0) is reserved
* for the legacy kernel provided on-stack trampoline. Conversely,
* if we are using a non-0 ABI version, we must have a trampoline.
* Only validate the vers if a new sigaction was supplied and there
* was an actual handler specified (not SIG_IGN or SIG_DFL), which
* don't require a trampoline. Emulations use legacy kernel
* trampolines with version 0, so check for that case as well.
*
* If version < __SIGTRAMP_SIGINFO_VERSION_MIN (usually 2), we try
* to autoload the compat module. Note that we interlock with the
* unload check in compat_modcmd() using kernconfig_lock. If the
* autoload fails, we don't try it again for this process.
*/
if (nsa != NULL && nsa->sa_handler != SIG_IGN
&& nsa->sa_handler != SIG_DFL) {
if (__predict_false(vers < __SIGTRAMP_SIGINFO_VERSION_MIN)) {
if (vers == __SIGTRAMP_SIGCODE_VERSION &&
p->p_sigctx.ps_sigcode != NULL) {
/*
* if sigcode is used for this emulation,
* version 0 is allowed.
*/
}
#ifdef __HAVE_STRUCT_SIGCONTEXT
else if (p->p_flag & PK_32) {
/*
* The 32-bit compat module will have
* pre-validated this for us.
*/
v0v1valid = true;
} else if ((p->p_lflag & PL_SIGCOMPAT) == 0) {
kernconfig_lock();
(void)module_autoload("compat_16",
MODULE_CLASS_ANY);
if (sendsig_sigcontext_16_hook.hooked) {
/*
* We need to remember if the
* sigcontext method may be useable,
* because libc may use it even
* if siginfo is available.
*/
v0v1valid = true;
}
mutex_enter(&proc_lock);
/*
* Prevent unload of compat module while
* this process remains.
*/
p->p_lflag |= PL_SIGCOMPAT;
mutex_exit(&proc_lock);
kernconfig_unlock();
}
#endif /* __HAVE_STRUCT_SIGCONTEXT */
}
switch (vers) {
case __SIGTRAMP_SIGCODE_VERSION:
/* kernel supplied trampoline. */
if (tramp != NULL ||
(p->p_sigctx.ps_sigcode == NULL && !v0v1valid)) {
return EINVAL;
}
break;
#ifdef __HAVE_STRUCT_SIGCONTEXT
case __SIGTRAMP_SIGCONTEXT_VERSION_MIN ...
__SIGTRAMP_SIGCONTEXT_VERSION_MAX:
/* sigcontext, user supplied trampoline. */
if (tramp == NULL || !v0v1valid) {
return EINVAL;
}
break;
#endif /* __HAVE_STRUCT_SIGCONTEXT */
case __SIGTRAMP_SIGINFO_VERSION_MIN ...
__SIGTRAMP_SIGINFO_VERSION_MAX:
/* siginfo, user supplied trampoline. */
if (tramp == NULL) {
return EINVAL;
}
break;
default:
/* Invalid trampoline version. */
return EINVAL;
}
}
mutex_enter(p->p_lock);
ps = p->p_sigacts;
if (osa)
sigaction_copy(osa, &SIGACTION_PS(ps, signum));
if (!nsa)
goto out;
prop = sigprop[signum];
if ((nsa->sa_flags & ~SA_ALLBITS) || (prop & SA_CANTMASK)) {
error = EINVAL;
goto out;
}
sigaction_copy(&SIGACTION_PS(ps, signum), nsa);
ps->sa_sigdesc[signum].sd_tramp = tramp;
ps->sa_sigdesc[signum].sd_vers = vers;
sigminusset(&sigcantmask, &SIGACTION_PS(ps, signum).sa_mask);
if ((prop & SA_NORESET) != 0)
SIGACTION_PS(ps, signum).sa_flags &= ~SA_RESETHAND;
if (signum == SIGCHLD) {
if (nsa->sa_flags & SA_NOCLDSTOP)
p->p_sflag |= PS_NOCLDSTOP;
else
p->p_sflag &= ~PS_NOCLDSTOP;
if (nsa->sa_flags & SA_NOCLDWAIT) {
/*
* Paranoia: since SA_NOCLDWAIT is implemented by
* reparenting the dying child to PID 1 (and trust
* it to reap the zombie), PID 1 itself is forbidden
* to set SA_NOCLDWAIT.
*/
if (p->p_pid == 1)
p->p_flag &= ~PK_NOCLDWAIT;
else
p->p_flag |= PK_NOCLDWAIT;
} else
p->p_flag &= ~PK_NOCLDWAIT;
if (nsa->sa_handler == SIG_IGN) {
/*
* Paranoia: same as above.
*/
if (p->p_pid == 1)
p->p_flag &= ~PK_CLDSIGIGN;
else
p->p_flag |= PK_CLDSIGIGN;
} else
p->p_flag &= ~PK_CLDSIGIGN;
}
if ((nsa->sa_flags & SA_NODEFER) == 0)
sigaddset(&SIGACTION_PS(ps, signum).sa_mask, signum);
else
sigdelset(&SIGACTION_PS(ps, signum).sa_mask, signum);
/*
* Set bit in p_sigctx.ps_sigignore for signals that are set to
* SIG_IGN, and for signals set to SIG_DFL where the default is to
* ignore. However, don't put SIGCONT in p_sigctx.ps_sigignore, as
* we have to restart the process.
*/
if (nsa->sa_handler == SIG_IGN ||
(nsa->sa_handler == SIG_DFL && (prop & SA_IGNORE) != 0)) {
/* Never to be seen again. */
sigemptyset(&tset);
sigaddset(&tset, signum);
sigclearall(p, &tset, &kq);
if (signum != SIGCONT) {
/* Easier in psignal */
sigaddset(&p->p_sigctx.ps_sigignore, signum);
}
sigdelset(&p->p_sigctx.ps_sigcatch, signum);
} else {
sigdelset(&p->p_sigctx.ps_sigignore, signum);
if (nsa->sa_handler == SIG_DFL)
sigdelset(&p->p_sigctx.ps_sigcatch, signum);
else
sigaddset(&p->p_sigctx.ps_sigcatch, signum);
}
/*
* Previously held signals may now have become visible. Ensure that
* we check for them before returning to userspace.
*/
if (sigispending(l, 0)) {
lwp_lock(l);
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
lwp_unlock(l);
}
out:
mutex_exit(p->p_lock);
ksiginfo_queue_drain(&kq);
return error;
}
int
sigprocmask1(struct lwp *l, int how, const sigset_t *nss, sigset_t *oss)
{
sigset_t *mask = &l->l_sigmask;
bool more;
KASSERT(mutex_owned(l->l_proc->p_lock));
if (oss) {
*oss = *mask;
}
if (nss == NULL) {
return 0;
}
switch (how) {
case SIG_BLOCK:
sigplusset(nss, mask);
more = false;
break;
case SIG_UNBLOCK:
sigminusset(nss, mask);
more = true;
break;
case SIG_SETMASK:
*mask = *nss;
more = true;
break;
default:
return EINVAL;
}
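/*
* Only SIG_UNBLOCK and SIG_SETMASK can make previously blocked
* signals deliverable, hence `more' is only set for those cases.
*/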
sigminusset(&sigcantmask, mask);
if (more && sigispending(l, 0)) {
/*
* Check for pending signals on return to user.
*/
lwp_lock(l);
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
lwp_unlock(l);
}
return 0;
}
void
sigpending1(struct lwp *l, sigset_t *ss)
{
struct proc *p = l->l_proc;
mutex_enter(p->p_lock);
*ss = l->l_sigpend.sp_set;
sigplusset(&p->p_sigpend.sp_set, ss);
mutex_exit(p->p_lock);
}
void
sigsuspendsetup(struct lwp *l, const sigset_t *ss)
{
struct proc *p = l->l_proc;
/*
* When returning from sigsuspend/pselect/pollts, we want
* the old mask to be restored after the
* signal handler has finished. Thus, we
* save it here and mark the sigctx structure
* to indicate this.
*/
mutex_enter(p->p_lock);
l->l_sigrestore = 1;
l->l_sigoldmask = l->l_sigmask;
l->l_sigmask = *ss;
sigminusset(&sigcantmask, &l->l_sigmask);
/* Check for pending signals when sleeping. */
if (sigispending(l, 0)) {
lwp_lock(l);
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
lwp_unlock(l);
}
mutex_exit(p->p_lock);
}
void
sigsuspendteardown(struct lwp *l)
{
struct proc *p = l->l_proc;
mutex_enter(p->p_lock);
/* Check for pending signals when sleeping. */
if (l->l_sigrestore) {
if (sigispending(l, 0)) {
lwp_lock(l);
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
lwp_unlock(l);
} else {
l->l_sigrestore = 0;
l->l_sigmask = l->l_sigoldmask;
}
}
mutex_exit(p->p_lock);
}
int
sigsuspend1(struct lwp *l, const sigset_t *ss)
{
if (ss)
sigsuspendsetup(l, ss);
while (kpause("pause", true, 0, NULL) == 0)
;
/* always return EINTR rather than ERESTART... */
return EINTR;
}
int
sigaltstack1(struct lwp *l, const stack_t *nss, stack_t *oss)
{
struct proc *p = l->l_proc;
int error = 0;
mutex_enter(p->p_lock);
if (oss)
*oss = l->l_sigstk;
if (nss) {
if (nss->ss_flags & ~SS_ALLBITS)
error = EINVAL;
else if (nss->ss_flags & SS_DISABLE) {
if (l->l_sigstk.ss_flags & SS_ONSTACK)
error = EINVAL;
} else if (nss->ss_size < MINSIGSTKSZ)
error = ENOMEM;
if (!error)
l->l_sigstk = *nss;
}
mutex_exit(p->p_lock);
return error;
}
int
sigtimedwait1(struct lwp *l, const struct sys_____sigtimedwait50_args *uap,
register_t *retval, copyin_t fetchss, copyout_t storeinf, copyin_t fetchts,
copyout_t storets)
{
/* {
syscallarg(const sigset_t *) set;
syscallarg(siginfo_t *) info;
syscallarg(struct timespec *) timeout;
} */
struct proc *p = l->l_proc;
int error, signum, timo;
struct timespec ts, tsstart, tsnow;
ksiginfo_t ksi;
/*
* Calculate timeout, if it was specified.
*
* NULL pointer means an infinite timeout.
* {.tv_sec = 0, .tv_nsec = 0} means do not block.
*/
if (SCARG(uap, timeout)) {
error = (*fetchts)(SCARG(uap, timeout), &ts, sizeof(ts));
if (error)
return error;
if ((error = itimespecfix(&ts)) != 0)
return error;
timo = tstohz(&ts);
if (timo == 0) {
if (ts.tv_sec == 0 && ts.tv_nsec == 0)
timo = -1; /* do not block */
else
timo = 1; /* the shortest possible timeout */
}
/*
* Remember current uptime, it would be used in
* ECANCELED/ERESTART case.
*/
getnanouptime(&tsstart);
} else {
memset(&tsstart, 0, sizeof(tsstart)); /* XXXgcc */
timo = 0; /* infinite timeout */
}
error = (*fetchss)(SCARG(uap, set), &l->l_sigwaitset,
sizeof(l->l_sigwaitset));
if (error)
return error;
/*
* Silently ignore SA_CANTMASK signals. psignal1() would ignore
* SA_CANTMASK signals in waitset, we do this only for the below
* siglist check.
*/
sigminusset(&sigcantmask, &l->l_sigwaitset);
memset(&ksi.ksi_info, 0, sizeof(ksi.ksi_info));
mutex_enter(p->p_lock);
/* Check for pending signals in the process, if no - then in LWP. */
if ((signum = sigget(&p->p_sigpend, &ksi, 0, &l->l_sigwaitset)) == 0)
signum = sigget(&l->l_sigpend, &ksi, 0, &l->l_sigwaitset);
if (signum != 0) {
/* If found a pending signal, just copy it out to the user. */
mutex_exit(p->p_lock);
goto out;
}
if (timo < 0) {
/* If not allowed to block, return an error */
mutex_exit(p->p_lock);
return EAGAIN;
}
/*
* Set up the sigwait list and wait for signal to arrive.
* We can either be woken up or time out.
*/
l->l_sigwaited = &ksi;
LIST_INSERT_HEAD(&p->p_sigwaiters, l, l_sigwaiter);
error = cv_timedwait_sig(&l->l_sigcv, p->p_lock, timo);
/*
* Need to find out if we woke as a result of _lwp_wakeup() or a
* signal outside our wait set.
*/
if (l->l_sigwaited != NULL) {
if (error == EINTR) {
/* Wakeup via _lwp_wakeup(). */
error = ECANCELED;
} else if (!error) {
/* Spurious wakeup - arrange for syscall restart. */
error = ERESTART;
}
l->l_sigwaited = NULL;
LIST_REMOVE(l, l_sigwaiter);
}
mutex_exit(p->p_lock);
/*
* If the sleep was interrupted (either by signal or wakeup), update
* the timeout and copyout new value back. It would be used when
* the syscall would be restarted or called again.
*/
if (timo && (error == ERESTART || error == ECANCELED)) {
getnanouptime(&tsnow);
/* Compute how much time has passed since start. */
timespecsub(&tsnow, &tsstart, &tsnow);
/* Subtract passed time from timeout. */
timespecsub(&ts, &tsnow, &ts);
if (ts.tv_sec < 0)
error = EAGAIN;
else {
/* Copy updated timeout to userland. */
error = (*storets)(&ts, SCARG(uap, timeout),
sizeof(ts));
}
}
out:
/*
* If a signal from the wait set arrived, copy it to userland.
* Copy only the used part of siginfo, the padding part is
* left unchanged (userland is not supposed to touch it anyway).
*/
if (error == 0 && SCARG(uap, info)) {
error = (*storeinf)(&ksi.ksi_info, SCARG(uap, info),
sizeof(ksi.ksi_info));
}
if (error == 0) {
*retval = ksi.ksi_info._signo;
SDT_PROBE(proc, kernel, , signal__clear, *retval,
&ksi, 0, 0, 0);
}
return error;
}
/* $NetBSD: kern_kthread.c,v 1.49 2023/09/23 14:40:42 ad Exp $ */
/*-
* Copyright (c) 1998, 1999, 2007, 2009, 2019, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_kthread.c,v 1.49 2023/09/23 14:40:42 ad Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/sched.h>
#include <sys/kmem.h>
#include <sys/msan.h>
#include <uvm/uvm_extern.h>
static kmutex_t kthread_lock;
static kcondvar_t kthread_cv;
void
kthread_sysinit(void)
{
mutex_init(&kthread_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&kthread_cv, "kthrwait");
}
/*
* kthread_create: create a kernel thread, that is, system-only LWP.
*/
int
kthread_create(pri_t pri, int flag, struct cpu_info *ci,
void (*func)(void *), void *arg, lwp_t **lp, const char *fmt, ...)
{
lwp_t *l;
vaddr_t uaddr;
int error, lc;
va_list ap;
KASSERT((flag & KTHREAD_INTR) == 0 || (flag & KTHREAD_MPSAFE) != 0);
uaddr = uvm_uarea_system_alloc(
(flag & (KTHREAD_INTR|KTHREAD_IDLE)) == KTHREAD_IDLE ? ci : NULL);
if (uaddr == 0) {
return ENOMEM;
}
kmsan_orig((void *)uaddr, USPACE, KMSAN_TYPE_POOL, __RET_ADDR);
if ((flag & KTHREAD_TS) != 0) {
lc = SCHED_OTHER;
} else {
lc = SCHED_RR;
}
error = lwp_create(&lwp0, &proc0, uaddr, LWP_DETACHED, NULL,
0, func, arg, &l, lc, &lwp0.l_sigmask, &lwp0.l_sigstk);
if (error) {
uvm_uarea_system_free(uaddr);
return error;
}
if (fmt != NULL) {
l->l_name = kmem_alloc(MAXCOMLEN, KM_SLEEP);
va_start(ap, fmt);
vsnprintf(l->l_name, MAXCOMLEN, fmt, ap);
va_end(ap);
}
/*
* Set parameters.
*/
if (pri == PRI_NONE) {
if ((flag & KTHREAD_TS) != 0) {
/* Maximum user priority level. */
pri = MAXPRI_USER;
} else {
/* Minimum kernel priority level. */
pri = PRI_KTHREAD;
}
}
mutex_enter(proc0.p_lock);
lwp_lock(l);
lwp_changepri(l, pri);
if (ci != NULL) {
if (ci != l->l_cpu) {
lwp_unlock_to(l, ci->ci_schedstate.spc_lwplock);
lwp_lock(l);
l->l_cpu = ci;
}
l->l_pflag |= LP_BOUND;
}
if ((flag & KTHREAD_MUSTJOIN) != 0) {
KASSERT(lp != NULL);
l->l_pflag |= LP_MUSTJOIN;
}
if ((flag & KTHREAD_INTR) != 0) {
l->l_pflag |= LP_INTR;
}
if ((flag & KTHREAD_MPSAFE) == 0) {
l->l_pflag &= ~LP_MPSAFE;
}
/*
* Set the new LWP running, unless the caller has requested
* otherwise.
*/
KASSERT(l->l_stat == LSIDL);
if ((flag & KTHREAD_IDLE) == 0) {
setrunnable(l);
/* LWP now unlocked */
} else {
lwp_unlock(l);
}
mutex_exit(proc0.p_lock);
/* All done! */
if (lp != NULL) {
*lp = l;
}
return 0;
}
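/*
* Hypothetical usage sketch (identifiers are illustrative, not taken
* from this file):
*
*	error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
*	    example_worker, softc, &softc->sc_lwp, "examplewk");
*
* The worker eventually terminates itself with kthread_exit(0); a
* creator that passed KTHREAD_MUSTJOIN would then call
* kthread_join() on the returned lwp.
*/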
/*
* Cause a kernel thread to exit. Assumes the exiting thread is the
* current context.
*/
void
kthread_exit(int ecode)
{
const char *name;
lwp_t *l = curlwp;
/* If the kernel lock is held, we need to drop it now. */
if ((l->l_pflag & LP_MPSAFE) == 0) {
KERNEL_UNLOCK_LAST(l);
}
/* We can't do much with the exit code, so just report it. */
if (ecode != 0) {
if ((name = l->l_name) == NULL)
name = "unnamed";
printf("WARNING: kthread `%s' (%d) exits with status %d\n",
name, l->l_lid, ecode);
}
/* Barrier for joining. */
if (l->l_pflag & LP_MUSTJOIN) {
bool *exitedp;
mutex_enter(&kthread_lock);
while ((exitedp = l->l_private) == NULL) {
cv_wait(&kthread_cv, &kthread_lock);
}
KASSERT(!*exitedp);
*exitedp = true;
cv_broadcast(&kthread_cv);
mutex_exit(&kthread_lock);
}
/* And exit.. */
lwp_exit(l);
panic("kthread_exit");
}
/*
* Wait for a kthread to exit, like pthread_join().
*/
int
kthread_join(lwp_t *l)
{
bool exited = false;
KASSERT((l->l_flag & LW_SYSTEM) != 0);
KASSERT((l->l_pflag & LP_MUSTJOIN) != 0);
/*
* - Ask the kthread to write to `exited'.
* - After this, touching l is forbidden -- it may be freed.
* - Wait until the kthread has written to `exited'.
*/
mutex_enter(&kthread_lock);
KASSERT(l->l_private == NULL);
l->l_private = &exited;
cv_broadcast(&kthread_cv);
while (!exited) {
cv_wait(&kthread_cv, &kthread_lock);
}
mutex_exit(&kthread_lock);
return 0;
}
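/*
* Example (sketch, not part of this file): a caller that must wait for
* its worker passes KTHREAD_MUSTJOIN, keeps the returned lwp pointer,
* and joins it after the worker has called kthread_exit().
* "example_worker" and "sc" are hypothetical.
*
*	error = kthread_create(PRI_NONE, KTHREAD_MPSAFE | KTHREAD_MUSTJOIN,
*	    NULL, example_worker, sc, &sc->sc_lwp, "examplewrk");
*	...
*	kthread_join(sc->sc_lwp);
*/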
/*
* kthread_fpu_enter()
*
* Allow the current lwp, which must be a kthread, to use the FPU.
* Return a cookie that must be passed to kthread_fpu_exit when
* done. Must be used only in thread context. Recursive -- you
* can call kthread_fpu_enter several times in a row as long as
* you pass the cookies in reverse order to kthread_fpu_exit.
*/
int
kthread_fpu_enter(void)
{
struct lwp *l = curlwp;
int s;
KASSERTMSG(!cpu_intr_p(),
"%s is not allowed in interrupt context", __func__);
KASSERTMSG(!cpu_softintr_p(),
"%s is not allowed in interrupt context", __func__);
/*
* Remember whether this thread already had FPU access, and
* mark this thread as having FPU access.
*/
lwp_lock(l);
KASSERTMSG(l->l_flag & LW_SYSTEM,
"%s is allowed only in kthreads", __func__);
s = l->l_flag & LW_SYSTEM_FPU;
l->l_flag |= LW_SYSTEM_FPU;
lwp_unlock(l);
/* Take MD steps to enable the FPU if necessary. */
if (s == 0)
kthread_fpu_enter_md();
return s;
}
/*
* kthread_fpu_exit(s)
*
* Restore the current lwp's FPU access to what it was before the
* matching call to kthread_fpu_enter() that returned s. Must be
* used only in thread context.
*/
void
kthread_fpu_exit(int s)
{
struct lwp *l = curlwp;
KASSERT(s == (s & LW_SYSTEM_FPU));
KASSERTMSG(!cpu_intr_p(),
"%s is not allowed in interrupt context", __func__);
KASSERTMSG(!cpu_softintr_p(),
"%s is not allowed in interrupt context", __func__);
lwp_lock(l);
KASSERTMSG(l->l_flag & LW_SYSTEM,
"%s is allowed only in kthreads", __func__);
KASSERT(l->l_flag & LW_SYSTEM_FPU);
l->l_flag ^= s ^ LW_SYSTEM_FPU;
lwp_unlock(l);
/* Take MD steps to zero and disable the FPU if necessary. */
if (s == 0)
kthread_fpu_exit_md();
}
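/*
* Example (sketch): nested use is legal as long as the cookies are
* passed back in reverse order, so the MD enable/disable steps happen
* only at the outermost pair.
*
*	int s1 = kthread_fpu_enter();
*	int s2 = kthread_fpu_enter();
*	... FPU-using code ...
*	kthread_fpu_exit(s2);
*	kthread_fpu_exit(s1);
*/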
/* $NetBSD: spl.h,v 1.10 2021/11/02 11:26:05 ryo Exp $ */
/*-
* Copyright (c)2005 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* this header is intended to be included by MD header.
*
* an assumption: makeiplcookie() is reasonably fast.
* if it isn't the case for your port, it's better to have MD optimized
* splxxx() functions, rather than using this header.
*/
#if !defined(_KERNEL) && !defined(_KMEMUSER)
#error not supposed to be exposed to userland.
#endif /* !defined(_KERNEL) && !defined(_KMEMUSER) */
#define _SPL_DECL(x, X) \
static __inline __always_inline int \
spl##x(void) \
{ return splraiseipl(makeiplcookie(IPL_##X)); }
#if defined(IPL_SOFTCLOCK)
_SPL_DECL(softclock, SOFTCLOCK)
#endif /* defined(IPL_SOFTCLOCK) */
#if defined(IPL_SOFTNET)
_SPL_DECL(softnet, SOFTNET)
#endif /* defined(IPL_SOFTNET) */
#if defined(IPL_SOFTSERIAL)
_SPL_DECL(softserial, SOFTSERIAL)
#endif /* defined(IPL_SOFTSERIAL) */
_SPL_DECL(vm, VM)
_SPL_DECL(sched, SCHED)
_SPL_DECL(high, HIGH)
#undef _SPL_DECL
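/*
* Illustration (not part of this header): on a port that defines
* IPL_SOFTNET, the _SPL_DECL(softnet, SOFTNET) line above expands to
* roughly the following.
*
*	static __inline __always_inline int
*	splsoftnet(void)
*	{ return splraiseipl(makeiplcookie(IPL_SOFTNET)); }
*/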
/* $NetBSD: kern_mutex.c,v 1.112 2023/10/15 10:28:23 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2019, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel mutex implementation, modeled after those found in Solaris,
* a description of which can be found in:
*
* Solaris Internals: Core Kernel Architecture, Jim Mauro and
* Richard McDougall.
*/
#define __MUTEX_PRIVATE
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_mutex.c,v 1.112 2023/10/15 10:28:23 riastradh Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <dev/lockstat.h>
#include <machine/lock.h>
/*
* When not running a debug kernel, spin mutexes are not much
* more than an splraiseipl() and splx() pair.
*/
#if defined(DIAGNOSTIC) || defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
#define FULL
#endif
/*
* Debugging support.
*/
#define MUTEX_WANTLOCK(mtx) \
LOCKDEBUG_WANTLOCK(MUTEX_DEBUG_P(mtx), (mtx), \
(uintptr_t)__builtin_return_address(0), 0)
#define MUTEX_TESTLOCK(mtx) \
LOCKDEBUG_WANTLOCK(MUTEX_DEBUG_P(mtx), (mtx), \
(uintptr_t)__builtin_return_address(0), -1)
#define MUTEX_LOCKED(mtx) \
LOCKDEBUG_LOCKED(MUTEX_DEBUG_P(mtx), (mtx), NULL, \
(uintptr_t)__builtin_return_address(0), 0)
#define MUTEX_UNLOCKED(mtx) \
LOCKDEBUG_UNLOCKED(MUTEX_DEBUG_P(mtx), (mtx), \
(uintptr_t)__builtin_return_address(0), 0)
#define MUTEX_ABORT(mtx, msg) \
mutex_abort(__func__, __LINE__, mtx, msg)
#if defined(LOCKDEBUG)
#define MUTEX_DASSERT(mtx, cond) \
do { \
if (__predict_false(!(cond))) \
MUTEX_ABORT(mtx, "assertion failed: " #cond); \
} while (/* CONSTCOND */ 0)
#else /* LOCKDEBUG */
#define MUTEX_DASSERT(mtx, cond) /* nothing */
#endif /* LOCKDEBUG */
#if defined(DIAGNOSTIC)
#define MUTEX_ASSERT(mtx, cond) \
do { \
if (__predict_false(!(cond))) \
MUTEX_ABORT(mtx, "assertion failed: " #cond); \
} while (/* CONSTCOND */ 0)
#else /* DIAGNOSTIC */
#define MUTEX_ASSERT(mtx, cond) /* nothing */
#endif /* DIAGNOSTIC */
/*
* Some architectures can't use __cpu_simple_lock as is so allow a way
* for them to use an alternate definition.
*/
#ifndef MUTEX_SPINBIT_LOCK_INIT
#define MUTEX_SPINBIT_LOCK_INIT(mtx) __cpu_simple_lock_init(&(mtx)->mtx_lock)
#endif
#ifndef MUTEX_SPINBIT_LOCKED_P
#define MUTEX_SPINBIT_LOCKED_P(mtx) __SIMPLELOCK_LOCKED_P(&(mtx)->mtx_lock)
#endif
#ifndef MUTEX_SPINBIT_LOCK_TRY
#define MUTEX_SPINBIT_LOCK_TRY(mtx) __cpu_simple_lock_try(&(mtx)->mtx_lock)
#endif
#ifndef MUTEX_SPINBIT_LOCK_UNLOCK
#define MUTEX_SPINBIT_LOCK_UNLOCK(mtx) __cpu_simple_unlock(&(mtx)->mtx_lock)
#endif
#ifndef MUTEX_INITIALIZE_SPIN_IPL
#define MUTEX_INITIALIZE_SPIN_IPL(mtx, ipl) \
((mtx)->mtx_ipl = makeiplcookie((ipl)))
#endif
/*
* Spin mutex SPL save / restore.
*/
#define MUTEX_SPIN_SPLRAISE(mtx) \
do { \
const int s = splraiseipl(MUTEX_SPIN_IPL(mtx)); \
struct cpu_info * const x__ci = curcpu(); \
const int x__cnt = x__ci->ci_mtx_count--; \
__insn_barrier(); \
if (x__cnt == 0) \
x__ci->ci_mtx_oldspl = s; \
} while (/* CONSTCOND */ 0)
#define MUTEX_SPIN_SPLRESTORE(mtx) \
do { \
struct cpu_info * const x__ci = curcpu(); \
const int s = x__ci->ci_mtx_oldspl; \
__insn_barrier(); \
if (++(x__ci->ci_mtx_count) == 0) \
splx(s); \
} while (/* CONSTCOND */ 0)
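/*
* Illustration (sketch) of how the per-CPU ci_mtx_count nesting counter
* behaves: it counts down from zero as spin mutexes are acquired, so the
* SPL is saved only by the outermost acquisition and restored only by the
* matching outermost release.
*
*	mutex_spin_enter(&a);	ci_mtx_count 0 -> -1, old SPL saved
*	mutex_spin_enter(&b);	ci_mtx_count -1 -> -2
*	mutex_spin_exit(&b);	ci_mtx_count -2 -> -1
*	mutex_spin_exit(&a);	ci_mtx_count -1 -> 0, splx(old SPL)
*/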
/*
* Memory barriers.
*/
#ifdef __HAVE_ATOMIC_AS_MEMBAR
#define MUTEX_MEMBAR_ENTER()
#else
#define MUTEX_MEMBAR_ENTER() membar_enter()
#endif
/*
* For architectures that provide 'simple' mutexes: they provide a
* CAS function that is either MP-safe, or does not need to be MP
* safe. Adaptive mutexes on these architectures do not require an
* additional interlock.
*/
#ifdef __HAVE_SIMPLE_MUTEXES
#define MUTEX_OWNER(owner) \
(owner & MUTEX_THREAD)
#define MUTEX_HAS_WAITERS(mtx) \
(((int)(mtx)->mtx_owner & MUTEX_BIT_WAITERS) != 0)
#define MUTEX_INITIALIZE_ADAPTIVE(mtx, dodebug) \
do { \
if (!dodebug) \
(mtx)->mtx_owner |= MUTEX_BIT_NODEBUG; \
} while (/* CONSTCOND */ 0)
#define MUTEX_INITIALIZE_SPIN(mtx, dodebug, ipl) \
do { \
(mtx)->mtx_owner = MUTEX_BIT_SPIN; \
if (!dodebug) \
(mtx)->mtx_owner |= MUTEX_BIT_NODEBUG; \
MUTEX_INITIALIZE_SPIN_IPL((mtx), (ipl)); \
MUTEX_SPINBIT_LOCK_INIT((mtx)); \
} while (/* CONSTCOND */ 0)
#define MUTEX_DESTROY(mtx) \
do { \
(mtx)->mtx_owner = MUTEX_THREAD; \
} while (/* CONSTCOND */ 0)
#define MUTEX_SPIN_P(owner) \
(((owner) & MUTEX_BIT_SPIN) != 0)
#define MUTEX_ADAPTIVE_P(owner) \
(((owner) & MUTEX_BIT_SPIN) == 0)
#ifndef MUTEX_CAS
#define MUTEX_CAS(p, o, n) \
(atomic_cas_ulong((volatile unsigned long *)(p), (o), (n)) == (o))
#endif /* MUTEX_CAS */
#define MUTEX_DEBUG_P(mtx) (((mtx)->mtx_owner & MUTEX_BIT_NODEBUG) == 0)
#if defined(LOCKDEBUG)
#define MUTEX_OWNED(owner) (((owner) & ~MUTEX_BIT_NODEBUG) != 0)
#define MUTEX_INHERITDEBUG(n, o) (n) |= (o) & MUTEX_BIT_NODEBUG
#else /* defined(LOCKDEBUG) */
#define MUTEX_OWNED(owner) ((owner) != 0)
#define MUTEX_INHERITDEBUG(n, o) /* nothing */
#endif /* defined(LOCKDEBUG) */
static inline int
MUTEX_ACQUIRE(kmutex_t *mtx, uintptr_t curthread)
{
int rv;
uintptr_t oldown = 0;
uintptr_t newown = curthread;
MUTEX_INHERITDEBUG(oldown, mtx->mtx_owner);
MUTEX_INHERITDEBUG(newown, oldown);
rv = MUTEX_CAS(&mtx->mtx_owner, oldown, newown);
membar_acquire();
return rv;
}
static inline int
MUTEX_SET_WAITERS(kmutex_t *mtx, uintptr_t owner)
{
int rv;
rv = MUTEX_CAS(&mtx->mtx_owner, owner, owner | MUTEX_BIT_WAITERS);
MUTEX_MEMBAR_ENTER();
return rv;
}
static inline void
MUTEX_RELEASE(kmutex_t *mtx)
{
uintptr_t newown;
newown = 0;
MUTEX_INHERITDEBUG(newown, mtx->mtx_owner);
atomic_store_release(&mtx->mtx_owner, newown);
}
#endif /* __HAVE_SIMPLE_MUTEXES */
/*
* Patch in stubs via strong alias where they are not available.
*/
#if defined(LOCKDEBUG)
#undef __HAVE_MUTEX_STUBS
#undef __HAVE_SPIN_MUTEX_STUBS
#endif
#ifndef __HAVE_MUTEX_STUBS
__strong_alias(mutex_enter,mutex_vector_enter);
__strong_alias(mutex_exit,mutex_vector_exit);
#endif
#ifndef __HAVE_SPIN_MUTEX_STUBS
__strong_alias(mutex_spin_enter,mutex_vector_enter);
__strong_alias(mutex_spin_exit,mutex_vector_exit);
#endif
static void mutex_abort(const char *, size_t, volatile const kmutex_t *,
const char *);
static void mutex_dump(const volatile void *, lockop_printer_t);
static lwp_t *mutex_owner(wchan_t);
lockops_t mutex_spin_lockops = {
.lo_name = "Mutex",
.lo_type = LOCKOPS_SPIN,
.lo_dump = mutex_dump,
};
lockops_t mutex_adaptive_lockops = {
.lo_name = "Mutex",
.lo_type = LOCKOPS_SLEEP,
.lo_dump = mutex_dump,
};
syncobj_t mutex_syncobj = {
.sobj_name = "mutex",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = turnstile_unsleep,
.sobj_changepri = turnstile_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = mutex_owner,
};
/*
* mutex_dump:
*
* Dump the contents of a mutex structure.
*/
static void
mutex_dump(const volatile void *cookie, lockop_printer_t pr)
{
const volatile kmutex_t *mtx = cookie;
uintptr_t owner = mtx->mtx_owner;
pr("owner field : %#018lx wait/spin: %16d/%d\n",
(long)MUTEX_OWNER(owner), MUTEX_HAS_WAITERS(mtx),
MUTEX_SPIN_P(owner));
}
/*
* mutex_abort:
*
* Dump information about an error and panic the system. This
* generates a lot of machine code in the DIAGNOSTIC case, so
* we ask the compiler to not inline it.
*/
static void __noinline
mutex_abort(const char *func, size_t line, volatile const kmutex_t *mtx,
const char *msg)
{
LOCKDEBUG_ABORT(func, line, mtx, (MUTEX_SPIN_P(mtx->mtx_owner) ?
&mutex_spin_lockops : &mutex_adaptive_lockops), msg);
}
/*
* mutex_init:
*
* Initialize a mutex for use. Note that adaptive mutexes are in
* essence spin mutexes that can sleep to avoid deadlock and wasting
* CPU time. We can't easily provide a type of mutex that always
* sleeps - see comments in mutex_vector_enter() about releasing
* mutexes unlocked.
*/
void
_mutex_init(kmutex_t *mtx, kmutex_type_t type, int ipl,
uintptr_t return_address)
{
lockops_t *lockops __unused;
bool dodebug;
memset(mtx, 0, sizeof(*mtx));
if (ipl == IPL_NONE || ipl == IPL_SOFTCLOCK ||
ipl == IPL_SOFTBIO || ipl == IPL_SOFTNET ||
ipl == IPL_SOFTSERIAL) {
lockops = (type == MUTEX_NODEBUG ?
NULL : &mutex_adaptive_lockops);
dodebug = LOCKDEBUG_ALLOC(mtx, lockops, return_address);
MUTEX_INITIALIZE_ADAPTIVE(mtx, dodebug);
} else {
lockops = (type == MUTEX_NODEBUG ?
NULL : &mutex_spin_lockops);
dodebug = LOCKDEBUG_ALLOC(mtx, lockops, return_address);
MUTEX_INITIALIZE_SPIN(mtx, dodebug, ipl);
}
}
void
mutex_init(kmutex_t *mtx, kmutex_type_t type, int ipl)
{
_mutex_init(mtx, type, ipl, (uintptr_t)__builtin_return_address(0));
}
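/*
* Example (sketch, not from this file): the IPL argument chooses the
* mutex class; IPL_NONE and the soft IPLs give an adaptive mutex, higher
* IPLs give a spin mutex.  "sc_lock" and "sc_intr_lock" are hypothetical.
*
*	mutex_init(&sc_lock, MUTEX_DEFAULT, IPL_NONE);		(adaptive)
*	mutex_init(&sc_intr_lock, MUTEX_DEFAULT, IPL_VM);	(spin)
*/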
/*
* mutex_destroy:
*
* Tear down a mutex.
*/
void
mutex_destroy(kmutex_t *mtx)
{
uintptr_t owner = mtx->mtx_owner;
if (MUTEX_ADAPTIVE_P(owner)) {
MUTEX_ASSERT(mtx, !MUTEX_OWNED(owner));
MUTEX_ASSERT(mtx, !MUTEX_HAS_WAITERS(mtx));
} else {
MUTEX_ASSERT(mtx, !MUTEX_SPINBIT_LOCKED_P(mtx));
}
LOCKDEBUG_FREE(MUTEX_DEBUG_P(mtx), mtx);
MUTEX_DESTROY(mtx);
}
#ifdef MULTIPROCESSOR
/*
* mutex_oncpu:
*
* Return true if an adaptive mutex owner is running on a CPU in the
* system. If the target is waiting on the kernel big lock, then we
* must release it. This is necessary to avoid deadlock.
*/
static bool
mutex_oncpu(uintptr_t owner)
{
struct cpu_info *ci;
lwp_t *l;
KASSERT(kpreempt_disabled());
if (!MUTEX_OWNED(owner)) {
return false;
}
/*
* See lwp_dtor() why dereference of the LWP pointer is safe.
* We must have kernel preemption disabled for that.
*/
l = (lwp_t *)MUTEX_OWNER(owner);
ci = l->l_cpu;
if (ci && ci->ci_curlwp == l) {
/* Target is running; do we need to block? */
return (atomic_load_relaxed(&ci->ci_biglock_wanted) != l);
}
/* Not running. It may be safe to block now. */
return false;
}
#endif /* MULTIPROCESSOR */
/*
* mutex_vector_enter:
*
* Support routine for mutex_enter() that must handle all cases. In
* the LOCKDEBUG case, mutex_enter() is always aliased here, even if
* fast-path stubs are available. If a mutex_spin_enter() stub is
* not available, then it is also aliased directly here.
*/
void
mutex_vector_enter(kmutex_t *mtx)
{
uintptr_t owner, curthread;
turnstile_t *ts;
#ifdef MULTIPROCESSOR
u_int count;
#endif
LOCKSTAT_COUNTER(spincnt);
LOCKSTAT_COUNTER(slpcnt);
LOCKSTAT_TIMER(spintime);
LOCKSTAT_TIMER(slptime);
LOCKSTAT_FLAG(lsflag);
/*
* Handle spin mutexes.
*/
KPREEMPT_DISABLE(curlwp);
owner = mtx->mtx_owner;
if (MUTEX_SPIN_P(owner)) {
#if defined(LOCKDEBUG) && defined(MULTIPROCESSOR)
u_int spins = 0;
#endif
KPREEMPT_ENABLE(curlwp);
MUTEX_SPIN_SPLRAISE(mtx);
MUTEX_WANTLOCK(mtx);
#ifdef FULL
if (MUTEX_SPINBIT_LOCK_TRY(mtx)) {
MUTEX_LOCKED(mtx);
return;
}
#if !defined(MULTIPROCESSOR)
MUTEX_ABORT(mtx, "locking against myself");
#else /* !MULTIPROCESSOR */
LOCKSTAT_ENTER(lsflag);
LOCKSTAT_START_TIMER(lsflag, spintime);
count = SPINLOCK_BACKOFF_MIN;
/*
* Spin testing the lock word and do exponential backoff
* to reduce cache line ping-ponging between CPUs.
*/
do {
while (MUTEX_SPINBIT_LOCKED_P(mtx)) {
SPINLOCK_SPIN_HOOK;
SPINLOCK_BACKOFF(count);
#ifdef LOCKDEBUG
if (SPINLOCK_SPINOUT(spins))
MUTEX_ABORT(mtx, "spinout");
#endif /* LOCKDEBUG */
}
} while (!MUTEX_SPINBIT_LOCK_TRY(mtx));
if (count != SPINLOCK_BACKOFF_MIN) {
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKSTAT_EVENT(lsflag, mtx,
LB_SPIN_MUTEX | LB_SPIN, 1, spintime);
}
LOCKSTAT_EXIT(lsflag);
#endif /* !MULTIPROCESSOR */
#endif /* FULL */
MUTEX_LOCKED(mtx);
return;
}
curthread = (uintptr_t)curlwp;
MUTEX_DASSERT(mtx, MUTEX_ADAPTIVE_P(owner));
MUTEX_ASSERT(mtx, curthread != 0);
MUTEX_ASSERT(mtx, !cpu_intr_p());
MUTEX_WANTLOCK(mtx);
if (__predict_true(panicstr == NULL)) {
KDASSERT(pserialize_not_in_read_section());
LOCKDEBUG_BARRIER(&kernel_lock, 1);
}
LOCKSTAT_ENTER(lsflag);
/*
* Adaptive mutex; spin trying to acquire the mutex. If we
* determine that the owner is not running on a processor,
* then we stop spinning, and sleep instead.
*/
for (;;) {
if (!MUTEX_OWNED(owner)) {
/*
* Mutex owner clear could mean two things:
*
* * The mutex has been released.
* * The owner field hasn't been set yet.
*
* Try to acquire it again. If that fails,
* we'll just loop again.
*/
if (MUTEX_ACQUIRE(mtx, curthread))
break;
owner = mtx->mtx_owner;
continue;
}
if (__predict_false(MUTEX_OWNER(owner) == curthread)) {
MUTEX_ABORT(mtx, "locking against myself");
}
#ifdef MULTIPROCESSOR
/*
* Check to see if the owner is running on a processor.
* If so, then we should just spin, as the owner will
* likely release the lock very soon.
*/
if (mutex_oncpu(owner)) {
LOCKSTAT_START_TIMER(lsflag, spintime);
count = SPINLOCK_BACKOFF_MIN;
do {
KPREEMPT_ENABLE(curlwp);
SPINLOCK_BACKOFF(count);
KPREEMPT_DISABLE(curlwp);
owner = mtx->mtx_owner;
} while (mutex_oncpu(owner));
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKSTAT_COUNT(spincnt, 1);
if (!MUTEX_OWNED(owner))
continue;
}
#endif
ts = turnstile_lookup(mtx);
/*
* Once we have the turnstile chain interlock, mark the
* mutex as having waiters. If that fails, spin again:
* chances are that the mutex has been released.
*/
if (!MUTEX_SET_WAITERS(mtx, owner)) {
turnstile_exit(mtx);
owner = mtx->mtx_owner;
continue;
}
#ifdef MULTIPROCESSOR
/*
* mutex_exit() is permitted to release the mutex without
* any interlocking instructions, and the following can
* occur as a result:
*
* CPU 1: MUTEX_SET_WAITERS() CPU2: mutex_exit()
* ---------------------------- ----------------------------
* .. load mtx->mtx_owner
* .. see has-waiters bit clear
* set has-waiters bit ..
* .. store mtx->mtx_owner := 0
* return success
*
* There is another race that can occur: a third CPU could
* acquire the mutex as soon as it is released. Since
* adaptive mutexes are primarily spin mutexes, this is not
* something that we need to worry about too much. What we
* do need to ensure is that the waiters bit gets set.
*
* To allow the unlocked release, we need to make some
* assumptions here:
*
* o Release is the only non-atomic/unlocked operation
* that can be performed on the mutex. (It must still
* be atomic on the local CPU, e.g. in case interrupted
* or preempted).
*
* o At any given time on each mutex, MUTEX_SET_WAITERS()
* can only ever be in progress on one CPU in the
* system - guaranteed by the turnstile chain lock.
*
* o No other operations other than MUTEX_SET_WAITERS()
* and release can modify a mutex with a non-zero
* owner field.
*
* o If the holding LWP switches away, it posts a store
* fence before changing curlwp, ensuring that any
* overwrite of the mutex waiters flag by mutex_exit()
* completes before the modification of curlwp becomes
* visible to this CPU.
*
* o cpu_switchto() posts a store fence after setting curlwp
* and before resuming execution of an LWP.
*
* o _kernel_lock() posts a store fence before setting
* curcpu()->ci_biglock_wanted, and after clearing it.
* This ensures that any overwrite of the mutex waiters
* flag by mutex_exit() completes before the modification
* of ci_biglock_wanted becomes visible.
*
* After MUTEX_SET_WAITERS() succeeds, simultaneously
* confirming that the same LWP still holds the mutex
* since we took the turnstile lock and notifying it that
* we're waiting, we check the lock holder's status again.
* Some of the possible outcomes (not an exhaustive list;
* XXX this should be made exhaustive):
*
* 1. The on-CPU check returns true: the holding LWP is
* running again. The lock may be released soon and
* we should spin. Importantly, we can't trust the
* value of the waiters flag.
*
* 2. The on-CPU check returns false: the holding LWP is
* not running. We now have the opportunity to check
* if mutex_exit() has blatted the modifications made
* by MUTEX_SET_WAITERS().
*
* 3. The on-CPU check returns false: the holding LWP may
* or may not be running. It has context switched at
* some point during our check. Again, we have the
* chance to see if the waiters bit is still set or
* has been overwritten.
*
* 4. The on-CPU check returns false: the holding LWP is
* running on a CPU, but wants the big lock. It's OK
* to check the waiters field in this case.
*
* 5. The has-waiters check fails: the mutex has been
* released, the waiters flag cleared and another LWP
* now owns the mutex.
*
* 6. The has-waiters check fails: the mutex has been
* released.
*
* If the waiters bit is not set it's unsafe to go to sleep,
* as we might never be awoken.
*/
if (mutex_oncpu(owner)) {
turnstile_exit(mtx);
owner = mtx->mtx_owner;
continue;
}
membar_consumer();
if (!MUTEX_HAS_WAITERS(mtx)) {
turnstile_exit(mtx);
owner = mtx->mtx_owner;
continue;
}
#endif /* MULTIPROCESSOR */
LOCKSTAT_START_TIMER(lsflag, slptime);
turnstile_block(ts, TS_WRITER_Q, mtx, &mutex_syncobj);
LOCKSTAT_STOP_TIMER(lsflag, slptime);
LOCKSTAT_COUNT(slpcnt, 1);
owner = mtx->mtx_owner;
}
KPREEMPT_ENABLE(curlwp);
LOCKSTAT_EVENT(lsflag, mtx, LB_ADAPTIVE_MUTEX | LB_SLEEP1,
slpcnt, slptime);
LOCKSTAT_EVENT(lsflag, mtx, LB_ADAPTIVE_MUTEX | LB_SPIN,
spincnt, spintime);
LOCKSTAT_EXIT(lsflag);
MUTEX_DASSERT(mtx, MUTEX_OWNER(mtx->mtx_owner) == curthread);
MUTEX_LOCKED(mtx);
}
/*
* mutex_vector_exit:
*
* Support routine for mutex_exit() that handles all cases.
*/
void
mutex_vector_exit(kmutex_t *mtx)
{
turnstile_t *ts;
uintptr_t curthread;
if (MUTEX_SPIN_P(mtx->mtx_owner)) {
#ifdef FULL
if (__predict_false(!MUTEX_SPINBIT_LOCKED_P(mtx))) {
MUTEX_ABORT(mtx, "exiting unheld spin mutex");
}
MUTEX_UNLOCKED(mtx);
MUTEX_SPINBIT_LOCK_UNLOCK(mtx);
#endif
MUTEX_SPIN_SPLRESTORE(mtx);
return;
}
#ifndef __HAVE_MUTEX_STUBS
/*
* On some architectures without mutex stubs, we can enter here to
* release mutexes before interrupts and whatnot are up and running.
* We need this hack to keep them sweet.
*/
if (__predict_false(cold)) {
MUTEX_UNLOCKED(mtx);
MUTEX_RELEASE(mtx);
return;
}
#endif
curthread = (uintptr_t)curlwp;
MUTEX_DASSERT(mtx, curthread != 0);
MUTEX_ASSERT(mtx, MUTEX_OWNER(mtx->mtx_owner) == curthread);
MUTEX_UNLOCKED(mtx);
#if !defined(LOCKDEBUG)
__USE(curthread);
#endif
#ifdef LOCKDEBUG
/*
* Avoid having to take the turnstile chain lock every time
* around. Raise the priority level to splhigh() in order
* to disable preemption and so make the following atomic.
* This also blocks out soft interrupts that could set the
* waiters bit.
*/
{
int s = splhigh();
if (!MUTEX_HAS_WAITERS(mtx)) {
MUTEX_RELEASE(mtx);
splx(s);
return;
}
splx(s);
}
#endif
/*
* Get this lock's turnstile. This gets the interlock on
* the sleep queue. Once we have that, we can clear the
* lock. If there was no turnstile for the lock, there
* were no waiters remaining.
*/
ts = turnstile_lookup(mtx);
if (ts == NULL) {
MUTEX_RELEASE(mtx);
turnstile_exit(mtx);
} else {
MUTEX_RELEASE(mtx);
turnstile_wakeup(ts, TS_WRITER_Q,
TS_WAITERS(ts, TS_WRITER_Q), NULL);
}
}
#ifndef __HAVE_SIMPLE_MUTEXES
/*
* mutex_wakeup:
*
* Support routine for mutex_exit() that wakes up all waiters.
* We assume that the mutex has been released, but it need not
* be.
*/
void
mutex_wakeup(kmutex_t *mtx)
{
turnstile_t *ts;
ts = turnstile_lookup(mtx);
if (ts == NULL) {
turnstile_exit(mtx);
return;
}
MUTEX_CLEAR_WAITERS(mtx);
turnstile_wakeup(ts, TS_WRITER_Q, TS_WAITERS(ts, TS_WRITER_Q), NULL);
}
#endif /* !__HAVE_SIMPLE_MUTEXES */
/*
* mutex_owned:
*
* Return true if the current LWP (adaptive) or CPU (spin)
* holds the mutex.
*/
int
mutex_owned(const kmutex_t *mtx)
{
if (mtx == NULL)
return 0;
if (MUTEX_ADAPTIVE_P(mtx->mtx_owner))
return MUTEX_OWNER(mtx->mtx_owner) == (uintptr_t)curlwp;
#ifdef FULL
return MUTEX_SPINBIT_LOCKED_P(mtx);
#else
return 1;
#endif
}
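/*
* Example (sketch): mutex_owned() is mainly useful for assertions in
* code that requires a particular lock to be held, e.g.
*
*	KASSERT(mutex_owned(&sc->sc_lock));
*/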
/*
* mutex_owner:
*
* Return the current owner of an adaptive mutex. Used for
* priority inheritance.
*/
static lwp_t *
mutex_owner(wchan_t wchan)
{
volatile const kmutex_t *mtx = wchan;
MUTEX_ASSERT(mtx, MUTEX_ADAPTIVE_P(mtx->mtx_owner));
return (struct lwp *)MUTEX_OWNER(mtx->mtx_owner);
}
/*
* mutex_ownable:
*
* When compiled with DEBUG and LOCKDEBUG defined, ensure that
* the mutex is available. We cannot use !mutex_owned() since
* that won't work correctly for spin mutexes.
*/
int
mutex_ownable(const kmutex_t *mtx)
{
#ifdef LOCKDEBUG
MUTEX_TESTLOCK(mtx);
#endif
return 1;
}
/*
* mutex_tryenter:
*
* Try to acquire the mutex; return non-zero if we did.
*/
int
mutex_tryenter(kmutex_t *mtx)
{
uintptr_t curthread;
/*
* Handle spin mutexes.
*/
if (MUTEX_SPIN_P(mtx->mtx_owner)) {
MUTEX_SPIN_SPLRAISE(mtx);
#ifdef FULL
if (MUTEX_SPINBIT_LOCK_TRY(mtx)) {
MUTEX_WANTLOCK(mtx);
MUTEX_LOCKED(mtx);
return 1;
}
MUTEX_SPIN_SPLRESTORE(mtx);
#else
MUTEX_WANTLOCK(mtx);
MUTEX_LOCKED(mtx);
return 1;
#endif
} else {
curthread = (uintptr_t)curlwp;
MUTEX_ASSERT(mtx, curthread != 0);
if (MUTEX_ACQUIRE(mtx, curthread)) {
MUTEX_WANTLOCK(mtx);
MUTEX_LOCKED(mtx);
MUTEX_DASSERT(mtx,
MUTEX_OWNER(mtx->mtx_owner) == curthread);
return 1;
}
}
return 0;
}
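/*
* Example (sketch): callers of mutex_tryenter() typically fall back to
* another strategy instead of blocking when the lock is busy.
* "sc" is hypothetical.
*
*	if (!mutex_tryenter(&sc->sc_lock))
*		return EBUSY;
*	... critical section ...
*	mutex_exit(&sc->sc_lock);
*/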
#if defined(__HAVE_SPIN_MUTEX_STUBS) || defined(FULL)
/*
* mutex_spin_retry:
*
* Support routine for mutex_spin_enter(). Assumes that the caller
* has already raised the SPL, and adjusted counters.
*/
void
mutex_spin_retry(kmutex_t *mtx)
{
#ifdef MULTIPROCESSOR
u_int count;
LOCKSTAT_TIMER(spintime);
LOCKSTAT_FLAG(lsflag);
#ifdef LOCKDEBUG
u_int spins = 0;
#endif /* LOCKDEBUG */
MUTEX_WANTLOCK(mtx);
LOCKSTAT_ENTER(lsflag);
LOCKSTAT_START_TIMER(lsflag, spintime);
count = SPINLOCK_BACKOFF_MIN;
/*
* Spin testing the lock word and do exponential backoff
* to reduce cache line ping-ponging between CPUs.
*/
do {
while (MUTEX_SPINBIT_LOCKED_P(mtx)) {
SPINLOCK_BACKOFF(count);
#ifdef LOCKDEBUG
if (SPINLOCK_SPINOUT(spins))
MUTEX_ABORT(mtx, "spinout");
#endif /* LOCKDEBUG */
}
} while (!MUTEX_SPINBIT_LOCK_TRY(mtx));
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKSTAT_EVENT(lsflag, mtx, LB_SPIN_MUTEX | LB_SPIN, 1, spintime);
LOCKSTAT_EXIT(lsflag);
MUTEX_LOCKED(mtx);
#else /* MULTIPROCESSOR */
MUTEX_ABORT(mtx, "locking against myself");
#endif /* MULTIPROCESSOR */
}
#endif /* defined(__HAVE_SPIN_MUTEX_STUBS) || defined(FULL) */
/* $NetBSD: vfs_lookup.c,v 1.234 2023/05/01 05:12:44 mlelstv Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_lookup.c 8.10 (Berkeley) 5/27/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.234 2023/05/01 05:12:44 mlelstv Exp $");
#ifdef _KERNEL_OPT
#include "opt_magiclinks.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/syslimits.h>
#include <sys/time.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/vnode_impl.h>
#include <sys/fstrans.h>
#include <sys/mount.h>
#include <sys/errno.h>
#include <sys/filedesc.h>
#include <sys/hash.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/ktrace.h>
#include <sys/dirent.h>
#ifndef MAGICLINKS
#define MAGICLINKS 0
#endif
int vfs_magiclinks = MAGICLINKS;
__CTASSERT(MAXNAMLEN == NAME_MAX);
/*
* Substitute replacement text for 'magic' strings in symlinks.
* Returns 0 if successful, and returns non-zero if an error
* occurs. (Currently, the only possible error is running out
* of temporary pathname space.)
*
* Looks for "@<string>" and "@<string>/", where <string> is a
* recognized 'magic' string. Replaces the "@<string>" with the
* appropriate replacement text. (Note that in some cases the
* replacement text may have zero length.)
*
* This would have been table driven, but the variance in
* replacement strings (and replacement string lengths) made
* that impractical.
*/
#define VNL(x) \
(sizeof(x) - 1)
#define VO '{'
#define VC '}'
#define MATCH(str) \
((termchar == '/' && i + VNL(str) == *len) || \
(i + VNL(str) < *len && \
cp[i + VNL(str)] == termchar)) && \
!strncmp((str), &cp[i], VNL(str))
#define SUBSTITUTE(m, s, sl) \
if ((newlen + (sl)) >= MAXPATHLEN) \
return 1; \
i += VNL(m); \
if (termchar != '/') \
i++; \
(void)memcpy(&tmp[newlen], (s), (sl)); \
newlen += (sl); \
change = 1; \
termchar = '/';
static int
symlink_magic(struct proc *p, char *cp, size_t *len)
{
char *tmp;
size_t change, i, newlen, slen;
char termchar = '/';
char idtmp[11]; /* enough for 32 bit *unsigned* integer */
tmp = PNBUF_GET();
for (change = i = newlen = 0; i < *len; ) {
if (cp[i] != '@') {
tmp[newlen++] = cp[i++];
continue;
}
i++;
/* Check for @{var} syntax. */
if (cp[i] == VO) {
termchar = VC;
i++;
}
/*
* The following checks should be ordered according
* to frequency of use.
*/
if (MATCH("machine_arch")) {
slen = strlen(PROC_MACHINE_ARCH(p));
SUBSTITUTE("machine_arch", PROC_MACHINE_ARCH(p), slen); } else if (MATCH("machine")) {
slen = VNL(MACHINE);
SUBSTITUTE("machine", MACHINE, slen); } else if (MATCH("hostname")) { SUBSTITUTE("hostname", hostname, hostnamelen); } else if (MATCH("osrelease")) {
slen = strlen(osrelease);
SUBSTITUTE("osrelease", osrelease, slen); } else if (MATCH("emul")) {
slen = strlen(p->p_emul->e_name);
SUBSTITUTE("emul", p->p_emul->e_name, slen); } else if (MATCH("kernel_ident")) {
slen = strlen(kernel_ident);
SUBSTITUTE("kernel_ident", kernel_ident, slen); } else if (MATCH("domainname")) { SUBSTITUTE("domainname", domainname, domainnamelen); } else if (MATCH("ostype")) {
slen = strlen(ostype);
SUBSTITUTE("ostype", ostype, slen); } else if (MATCH("uid")) {
slen = snprintf(idtmp, sizeof(idtmp), "%u",
kauth_cred_geteuid(kauth_cred_get()));
SUBSTITUTE("uid", idtmp, slen); } else if (MATCH("ruid")) {
slen = snprintf(idtmp, sizeof(idtmp), "%u",
kauth_cred_getuid(kauth_cred_get()));
SUBSTITUTE("ruid", idtmp, slen); } else if (MATCH("gid")) {
slen = snprintf(idtmp, sizeof(idtmp), "%u",
kauth_cred_getegid(kauth_cred_get()));
SUBSTITUTE("gid", idtmp, slen); } else if (MATCH("rgid")) {
slen = snprintf(idtmp, sizeof(idtmp), "%u",
kauth_cred_getgid(kauth_cred_get()));
SUBSTITUTE("rgid", idtmp, slen);
} else {
tmp[newlen++] = '@';
if (termchar == VC)
tmp[newlen++] = VO;
}
}
if (change) {
(void)memcpy(cp, tmp, newlen);
*len = newlen;
}
PNBUF_PUT(tmp);
return 0;
}
#undef VNL
#undef VO
#undef VC
#undef MATCH
#undef SUBSTITUTE
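/*
* Example (illustrative, assuming vfs_magiclinks is enabled): a symlink
* whose target is "/usr/pkg/@machine_arch/bin" would be rewritten during
* lookup to something like "/usr/pkg/x86_64/bin" on an x86_64 machine;
* the "@{machine_arch}" brace syntax handled above is used when the
* variable is not immediately followed by a '/'.
*/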
////////////////////////////////////////////////////////////
/*
* Determine the namei hash (for the namecache) for name.
* If *ep != NULL, hash from name to ep-1.
* If *ep == NULL, hash from name until the first NUL or '/', and
* return the location of this termination character in *ep.
*
* This function returns an equivalent hash to the MI hash32_strn().
* The latter isn't used because in the *ep == NULL case, determining
* the length of the string to the first NUL or `/' and then calling
* hash32_strn() involves unnecessary double-handling of the data.
*/
uint32_t
namei_hash(const char *name, const char **ep)
{
uint32_t hash;
hash = HASH32_STR_INIT;
if (*ep != NULL) {
for (; name < *ep; name++)
hash = hash * 33 + *(const uint8_t *)name;
} else {
for (; *name != '\0' && *name != '/'; name++)
hash = hash * 33 + *(const uint8_t *)name;
*ep = name;
}
return (hash + (hash >> 5));
}
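/*
* Example (sketch): hashing just the first component of a path, with the
* terminator location returned through ep.
*
*	const char *ep = NULL;
*	uint32_t h = namei_hash("usr/bin/ls", &ep);
*
* After the call, ep points at the first '/', and h covers only "usr".
*/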
////////////////////////////////////////////////////////////
/*
* Sealed abstraction for pathnames.
*
* System-call-layer level code that is going to call namei should
* first create a pathbuf and adjust all the bells and whistles on it
* as needed by context.
*/
struct pathbuf {
char *pb_path;
char *pb_pathcopy;
unsigned pb_pathcopyuses;
};
static struct pathbuf *
pathbuf_create_raw(void)
{
struct pathbuf *pb;
pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
pb->pb_path = PNBUF_GET();
if (pb->pb_path == NULL) {
kmem_free(pb, sizeof(*pb));
return NULL;
}
pb->pb_pathcopy = NULL;
pb->pb_pathcopyuses = 0;
return pb;
}
void
pathbuf_destroy(struct pathbuf *pb)
{
KASSERT(pb->pb_pathcopyuses == 0);
KASSERT(pb->pb_pathcopy == NULL);
PNBUF_PUT(pb->pb_path);
kmem_free(pb, sizeof(*pb));
}
struct pathbuf *
pathbuf_assimilate(char *pnbuf)
{
struct pathbuf *pb;
pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
pb->pb_path = pnbuf;
pb->pb_pathcopy = NULL;
pb->pb_pathcopyuses = 0;
return pb;
}
struct pathbuf *
pathbuf_create(const char *path)
{
struct pathbuf *pb;
int error;
pb = pathbuf_create_raw();
if (pb == NULL) {
return NULL;
}
error = copystr(path, pb->pb_path, PATH_MAX, NULL);
if (error != 0) {
KASSERT(!"kernel path too long in pathbuf_create");
/* make sure it's null-terminated, just in case */
pb->pb_path[PATH_MAX-1] = '\0';
}
return pb;
}
int
pathbuf_copyin(const char *userpath, struct pathbuf **ret)
{
struct pathbuf *pb;
int error;
pb = pathbuf_create_raw();
if (pb == NULL) {
return ENOMEM;
}
error = copyinstr(userpath, pb->pb_path, PATH_MAX, NULL);
if (error) {
pathbuf_destroy(pb);
return error;
}
*ret = pb;
return 0;
}
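/*
* Example (sketch, hypothetical syscall-layer caller): typical use of the
* pathbuf interface is to copy the path in, hand the pathbuf to namei,
* and destroy it afterwards.  "uap" is hypothetical; error handling and
* releasing nd.ni_vp are omitted.
*
*	struct pathbuf *pb;
*	struct nameidata nd;
*
*	error = pathbuf_copyin(SCARG(uap, path), &pb);
*	if (error)
*		return error;
*	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pb);
*	error = namei(&nd);
*	...
*	pathbuf_destroy(pb);
*/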
/*
* XXX should not exist:
* 1. whether a pointer is kernel or user should be statically checkable.
* 2. copyin should be handled by the upper part of the syscall layer,
* not in here.
*/
int
pathbuf_maybe_copyin(const char *path, enum uio_seg seg, struct pathbuf **ret)
{
if (seg == UIO_USERSPACE) {
return pathbuf_copyin(path, ret);
} else {
*ret = pathbuf_create(path);
if (*ret == NULL) {
return ENOMEM;
}
return 0;
}
}
/*
* Get a copy of the path buffer as it currently exists. If this is
* called after namei starts the results may be arbitrary.
*/
void
pathbuf_copystring(const struct pathbuf *pb, char *buf, size_t maxlen)
{
strlcpy(buf, pb->pb_path, maxlen);
}
/*
* These two functions allow access to a saved copy of the original
* path string. The first copy should be gotten before namei is
* called. Each copy that is gotten should be put back.
*/
const char *
pathbuf_stringcopy_get(struct pathbuf *pb)
{
if (pb->pb_pathcopyuses == 0) {
pb->pb_pathcopy = PNBUF_GET();
strcpy(pb->pb_pathcopy, pb->pb_path);
}
pb->pb_pathcopyuses++;
return pb->pb_pathcopy;
}
void
pathbuf_stringcopy_put(struct pathbuf *pb, const char *str)
{
KASSERT(str == pb->pb_pathcopy);
KASSERT(pb->pb_pathcopyuses > 0);
pb->pb_pathcopyuses--;
if (pb->pb_pathcopyuses == 0) {
PNBUF_PUT(pb->pb_pathcopy);
pb->pb_pathcopy = NULL;
}
}
////////////////////////////////////////////////////////////
/*
* namei: convert a pathname into a pointer to a (maybe-locked) vnode,
* and maybe also its parent directory vnode, and assorted other guff.
* See namei(9) for the interface documentation.
*
*
* The FOLLOW flag is set when symbolic links are to be followed
* when they occur at the end of the name translation process.
* Symbolic links are always followed for all other pathname
* components other than the last.
*
* The segflg defines whether the name is to be copied from user
* space or kernel space.
*
* Overall outline of namei:
*
* copy in name
* get starting directory
* while (!done && !error) {
* call lookup to search path.
* if symbolic link, massage name in buffer and continue
* }
*/
/*
* Search a pathname.
* This is a very central and rather complicated routine.
*
* The pathname is pointed to by ni_ptr and is of length ni_pathlen.
* The starting directory is passed in. The pathname is descended
* until done, or a symbolic link is encountered. The variable ni_more
* is clear if the path is completed; it is set to one if a symbolic
* link needing interpretation is encountered.
*
* The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
* whether the name is to be looked up, created, renamed, or deleted.
* When CREATE, RENAME, or DELETE is specified, information usable in
* creating, renaming, or deleting a directory entry may be calculated.
* If flag has LOCKPARENT or'ed into it, the parent directory is returned
* locked. Otherwise the parent directory is not returned. If the target
* of the pathname exists and LOCKLEAF is or'ed into the flag the target
* is returned locked, otherwise it is returned unlocked. When creating
* or renaming and LOCKPARENT is specified, the target may not be ".".
* When deleting and LOCKPARENT is specified, the target may be ".".
*
* Overall outline of lookup:
*
* dirloop:
* identify next component of name at ndp->ni_ptr
* handle degenerate case where name is null string
* if .. and crossing mount points and on mounted filesys, find parent
* call VOP_LOOKUP routine for next component name
* directory vnode returned in ni_dvp, locked.
* component vnode returned in ni_vp (if it exists), locked.
* if result vnode is mounted on and crossing mount points,
* find mounted on vnode
* if more components of name, do next level at dirloop
* return the answer in ni_vp, locked if LOCKLEAF set
* if LOCKPARENT set, return locked parent in ni_dvp
*/
/*
* Internal state for a namei operation.
*
* cnp is always equal to &ndp->ni_cnp.
*/
struct namei_state {
struct nameidata *ndp;
struct componentname *cnp;
int docache; /* == 0 do not cache last component */
int rdonly; /* lookup read-only flag bit */
int slashes;
unsigned attempt_retry:1; /* true if error allows emul retry */
unsigned root_referenced:1; /* true if ndp->ni_rootdir and
ndp->ni_erootdir were referenced */
};
/*
* Initialize the namei working state.
*/
static void
namei_init(struct namei_state *state, struct nameidata *ndp)
{
state->ndp = ndp;
state->cnp = &ndp->ni_cnd;
state->docache = 0;
state->rdonly = 0;
state->slashes = 0;
state->root_referenced = 0;
KASSERTMSG((state->cnp->cn_cred != NULL), "namei: bad cred/proc");
KASSERTMSG(((state->cnp->cn_nameiop & (~OPMASK)) == 0),
"namei: nameiop contaminated with flags: %08"PRIx32,
state->cnp->cn_nameiop);
KASSERTMSG(((state->cnp->cn_flags & OPMASK) == 0),
"name: flags contaminated with nameiops: %08"PRIx32,
state->cnp->cn_flags);
/*
* The buffer for name translation shall be the one inside the
* pathbuf.
*/
state->ndp->ni_pnbuf = state->ndp->ni_pathbuf->pb_path;
}
/*
* Clean up the working namei state, leaving things ready for return
* from namei.
*/
static void
namei_cleanup(struct namei_state *state)
{
KASSERT(state->cnp == &state->ndp->ni_cnd);
if (state->root_referenced) {
if (state->ndp->ni_rootdir != NULL)
vrele(state->ndp->ni_rootdir);
if (state->ndp->ni_erootdir != NULL)
vrele(state->ndp->ni_erootdir);
}
}
//////////////////////////////
/*
* Get the directory context.
* Initializes the rootdir and erootdir state and returns a reference
* to the starting dir.
*/
static struct vnode *
namei_getstartdir(struct namei_state *state)
{
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
struct cwdinfo *cwdi; /* pointer to cwd state */
struct lwp *self = curlwp; /* thread doing namei() */
struct vnode *rootdir, *erootdir, *curdir, *startdir;
if (state->root_referenced) {
if (state->ndp->ni_rootdir != NULL)
vrele(state->ndp->ni_rootdir);
if (state->ndp->ni_erootdir != NULL)
vrele(state->ndp->ni_erootdir);
state->root_referenced = 0;
}
cwdi = self->l_proc->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
/* root dir */
if (cwdi->cwdi_rdir == NULL || (cnp->cn_flags & NOCHROOT)) {
rootdir = rootvnode;
} else {
rootdir = cwdi->cwdi_rdir;
}
/* emulation root dir, if any */
if ((cnp->cn_flags & TRYEMULROOT) == 0) {
/* if we don't want it, don't fetch it */
erootdir = NULL;
} else if (cnp->cn_flags & EMULROOTSET) {
/* explicitly set emulroot; "/../" doesn't override this */
erootdir = ndp->ni_erootdir;
} else if (!strncmp(ndp->ni_pnbuf, "/../", 4)) {
/* explicit reference to real rootdir */
erootdir = NULL;
} else {
/* may be null */
erootdir = cwdi->cwdi_edir;
}
/* current dir */
curdir = cwdi->cwdi_cdir;
if (ndp->ni_pnbuf[0] != '/') {
if (ndp->ni_atdir != NULL) {
startdir = ndp->ni_atdir;
} else {
startdir = curdir;
}
erootdir = NULL;
} else if (cnp->cn_flags & TRYEMULROOT && erootdir != NULL) {
startdir = erootdir;
} else {
startdir = rootdir;
erootdir = NULL;
}
state->ndp->ni_rootdir = rootdir;
state->ndp->ni_erootdir = erootdir;
/*
* Get a reference to the start dir so we can safely unlock cwdi.
*
* Must hold references to rootdir and erootdir while we're running.
* A multithreaded process may chroot during namei.
*/
if (startdir != NULL)
vref(startdir);
if (state->ndp->ni_rootdir != NULL)
vref(state->ndp->ni_rootdir);
if (state->ndp->ni_erootdir != NULL)
vref(state->ndp->ni_erootdir);
state->root_referenced = 1;
rw_exit(&cwdi->cwdi_lock);
return startdir;
}
/*
* Get the directory context for the nfsd case, in parallel to
* getstartdir. Initializes the rootdir and erootdir state and
* returns a reference to the passed-in starting dir.
*/
static struct vnode *
namei_getstartdir_for_nfsd(struct namei_state *state)
{
KASSERT(state->ndp->ni_atdir != NULL);
/* always use the real root, and never set an emulation root */
if (rootvnode == NULL) {
return NULL;
}
state->ndp->ni_rootdir = rootvnode;
state->ndp->ni_erootdir = NULL;
vref(state->ndp->ni_atdir);
KASSERT(! state->root_referenced);
vref(state->ndp->ni_rootdir);
state->root_referenced = 1;
return state->ndp->ni_atdir;
}
/*
* Ktrace the namei operation.
*/
static void
namei_ktrace(struct namei_state *state)
{
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
struct lwp *self = curlwp; /* thread doing namei() */
const char *emul_path;
if (ktrpoint(KTR_NAMEI)) {
if (ndp->ni_erootdir != NULL) {
/*
* To make any sense, the trace entry needs to have the
* text of the emulation path prepended.
* Usually we can get this from the current process,
* but when called from emul_find_interp() it is only
* in the exec_package - so we get it passed in ni_next
* (this is a hack).
*/
if (cnp->cn_flags & EMULROOTSET)
emul_path = ndp->ni_next;
else
emul_path = self->l_proc->p_emul->e_path;
ktrnamei2(emul_path, strlen(emul_path), ndp->ni_pnbuf, ndp->ni_pathlen);
} else
ktrnamei(ndp->ni_pnbuf, ndp->ni_pathlen);
}
}
/*
* Start up namei. Find the root dir and cwd, establish the starting
* directory for lookup, and lock it. Also calls ktrace when
* appropriate.
*/
static int
namei_start(struct namei_state *state, int isnfsd,
struct vnode **startdir_ret)
{
struct nameidata *ndp = state->ndp;
struct vnode *startdir;
/* length includes null terminator (was originally from copyinstr) */
ndp->ni_pathlen = strlen(ndp->ni_pnbuf) + 1;
/*
* POSIX.1 requirement: "" is not a valid file name.
*/
if (ndp->ni_pathlen == 1) {
ndp->ni_erootdir = NULL;
return ENOENT;
}
ndp->ni_loopcnt = 0;
/* Get starting directory, set up root, and ktrace. */
if (isnfsd) {
startdir = namei_getstartdir_for_nfsd(state);
/* no ktrace */
} else {
startdir = namei_getstartdir(state);
namei_ktrace(state);
}
if (startdir == NULL) {
return ENOENT;
}
/* NDAT may feed us a non-directory start vnode via namei_getstartdir */
if (startdir->v_type != VDIR) {
vrele(startdir);
return ENOTDIR;
}
*startdir_ret = startdir;
return 0;
}
/*
* Check for being at a symlink that we're going to follow.
*/
static inline int
namei_atsymlink(struct namei_state *state, struct vnode *foundobj)
{
return (foundobj->v_type == VLNK) &&
(state->cnp->cn_flags & (FOLLOW|REQUIREDIR));
}
/*
* Follow a symlink.
*
* Updates searchdir. inhibitmagic causes magic symlinks to not be
* interpreted; this is used by nfsd.
*
* Unlocks foundobj on success (ugh)
*/
static inline int
namei_follow(struct namei_state *state, int inhibitmagic,
struct vnode *searchdir, struct vnode *foundobj,
struct vnode **newsearchdir_ret)
{
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
struct lwp *self = curlwp; /* thread doing namei() */
struct iovec aiov; /* uio for reading symbolic links */
struct uio auio;
char *cp; /* pointer into pathname argument */
size_t linklen;
int error;
if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
return ELOOP;
}
vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
if (foundobj->v_mount->mnt_flag & MNT_SYMPERM) {
error = VOP_ACCESS(foundobj, VEXEC, cnp->cn_cred);
if (error != 0) {
VOP_UNLOCK(foundobj);
return error;
}
}
/* FUTURE: fix this to not use a second buffer */
cp = PNBUF_GET();
aiov.iov_base = cp;
aiov.iov_len = MAXPATHLEN;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
auio.uio_rw = UIO_READ;
auio.uio_resid = MAXPATHLEN;
UIO_SETUP_SYSSPACE(&auio);
error = VOP_READLINK(foundobj, &auio, cnp->cn_cred);
VOP_UNLOCK(foundobj);
if (error) {
PNBUF_PUT(cp);
return error;
}
linklen = MAXPATHLEN - auio.uio_resid;
if (linklen == 0) {
PNBUF_PUT(cp);
return ENOENT;
}
/*
* Do symlink substitution, if appropriate, and
* check length for potential overflow.
*
* Inhibit symlink substitution for nfsd.
* XXX: This is how it was before; is that a bug or a feature?
*/
if ((!inhibitmagic && vfs_magiclinks && symlink_magic(self->l_proc, cp, &linklen)) ||
(linklen + ndp->ni_pathlen >= MAXPATHLEN)) {
PNBUF_PUT(cp);
return ENAMETOOLONG;
}
if (ndp->ni_pathlen > 1) {
/* includes a null-terminator */
memcpy(cp + linklen, ndp->ni_next, ndp->ni_pathlen);
} else {
cp[linklen] = '\0';
}
ndp->ni_pathlen += linklen;
memcpy(ndp->ni_pnbuf, cp, ndp->ni_pathlen);
PNBUF_PUT(cp);
/* we're now starting from the beginning of the buffer again */
cnp->cn_nameptr = ndp->ni_pnbuf;
/*
* Check if root directory should replace current directory.
*/
if (ndp->ni_pnbuf[0] == '/') {
vrele(searchdir);
/* Keep absolute symbolic links inside emulation root */
searchdir = ndp->ni_erootdir;
if (searchdir == NULL ||
(ndp->ni_pnbuf[1] == '.'
&& ndp->ni_pnbuf[2] == '.' && ndp->ni_pnbuf[3] == '/')) {
ndp->ni_erootdir = NULL;
searchdir = ndp->ni_rootdir;
}
vref(searchdir);
while (cnp->cn_nameptr[0] == '/') {
cnp->cn_nameptr++;
ndp->ni_pathlen--;
}
}
*newsearchdir_ret = searchdir;
return 0;
}
//////////////////////////////
/*
* Inspect the leading path component and update the state accordingly.
*/
static int
lookup_parsepath(struct namei_state *state, struct vnode *searchdir)
{
const char *cp; /* pointer into pathname argument */
int error;
struct componentname *cnp = state->cnp;
struct nameidata *ndp = state->ndp;
KASSERT(cnp == &ndp->ni_cnd);
/*
* Search a new directory.
*
* The last component of the filename is left accessible via
* cnp->cn_nameptr for callers that need the name. Callers needing
* the name set the SAVENAME flag. When done, they assume
* responsibility for freeing the pathname buffer.
*
* At this point, our only vnode state is that the search dir
* is held.
*/
error = VOP_PARSEPATH(searchdir, cnp->cn_nameptr, &cnp->cn_namelen);
if (error) {
return error;
}
cp = cnp->cn_nameptr + cnp->cn_namelen;
if (cnp->cn_namelen > KERNEL_NAME_MAX) {
return ENAMETOOLONG;
}
#ifdef NAMEI_DIAGNOSTIC
{ char c = *cp;
*(char *)cp = '\0';
printf("{%s}: ", cnp->cn_nameptr);
*(char *)cp = c; }
#endif /* NAMEI_DIAGNOSTIC */
ndp->ni_pathlen -= cnp->cn_namelen;
ndp->ni_next = cp;
/*
* If this component is followed by a slash, then move the pointer to
* the next component forward, and remember that this component must be
* a directory.
*/
if (*cp == '/') {
do {
cp++;
} while (*cp == '/');
state->slashes = cp - ndp->ni_next;
ndp->ni_pathlen -= state->slashes;
ndp->ni_next = cp;
cnp->cn_flags |= REQUIREDIR;
} else {
state->slashes = 0;
cnp->cn_flags &= ~REQUIREDIR;
}
/*
* We do special processing on the last component, whether or not it's
* a directory. Cache all intervening lookups, but not the final one.
*/
if (*cp == '\0') {
if (state->docache)
cnp->cn_flags |= MAKEENTRY;
else
cnp->cn_flags &= ~MAKEENTRY;
cnp->cn_flags |= ISLASTCN;
} else {
cnp->cn_flags |= MAKEENTRY;
cnp->cn_flags &= ~ISLASTCN;
}
if (cnp->cn_namelen == 2 &&
cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
cnp->cn_flags |= ISDOTDOT;
else
cnp->cn_flags &= ~ISDOTDOT;
return 0;
}
/*
* Take care of crossing a mounted-on vnode. On error, foundobj_ret will be
* vrele'd, but searchdir is left alone.
*/
static int
lookup_crossmount(struct namei_state *state,
struct vnode **searchdir_ret,
struct vnode **foundobj_ret,
bool *searchdir_locked)
{
struct componentname *cnp = state->cnp;
struct vnode *foundobj, *vp;
struct vnode *searchdir;
struct mount *mp;
int error, lktype;
searchdir = *searchdir_ret;
foundobj = *foundobj_ret;
error = 0;
KASSERT((cnp->cn_flags & NOCROSSMOUNT) == 0);
/* First, unlock searchdir (oof). */
if (*searchdir_locked) {
KASSERT(searchdir != NULL);
lktype = VOP_ISLOCKED(searchdir);
VOP_UNLOCK(searchdir);
*searchdir_locked = false;
} else {
lktype = LK_NONE;
}
/*
* Do an unlocked check to see if the vnode has been mounted on; if
* so find the root of the mounted file system.
*/
while (foundobj->v_type == VDIR && (mp = foundobj->v_mountedhere) != NULL &&
(cnp->cn_flags & NOCROSSMOUNT) == 0) {
/*
* Try the namecache first. If that doesn't work, do
* it the hard way.
*/
if (cache_lookup_mount(foundobj, &vp)) {
vrele(foundobj);
foundobj = vp;
} else {
/* First get the vnodes mount stable. */
while ((mp = foundobj->v_mountedhere) != NULL) {
fstrans_start(mp);
if (fstrans_held(mp) &&
mp == foundobj->v_mountedhere) {
break;
}
fstrans_done(mp);
}
if (mp == NULL) {
break;
}
/*
* Now get a reference on the root vnode.
* XXX Future - maybe allow only VDIR here.
*/
error = VFS_ROOT(mp, LK_NONE, &vp);
/*
* If successful, enter it into the cache while
* holding the mount busy (competing with unmount).
*/
if (error == 0) {
cache_enter_mount(foundobj, vp);
}
/* Finally, drop references to foundobj & mountpoint. */
vrele(foundobj);
fstrans_done(mp);
if (error) {
foundobj = NULL;
break;
}
foundobj = vp;
}
/*
* Avoid locking vnodes from two filesystems because
* it's prone to deadlock, e.g. when using puffs.
* Also, it isn't a good idea to propagate slowness of
* a filesystem up to the root directory. For now,
* only handle the common case, where foundobj is
* VDIR.
*
* In this case set searchdir to null to avoid using
* it again. It is not correct to set searchdir ==
* foundobj here as that will confuse the caller.
* (See PR 40740.)
*/
if (searchdir == NULL) {
/* already been here once; do nothing further */
} else if (foundobj->v_type == VDIR) {
vrele(searchdir);
*searchdir_ret = searchdir = NULL;
lktype = LK_NONE;
}
}
/* If searchdir is still around, re-lock it. */
if (error == 0 && lktype != LK_NONE) {
vn_lock(searchdir, lktype | LK_RETRY);
*searchdir_locked = true;
}
*foundobj_ret = foundobj;
return error;
}
/*
* Determine the desired locking mode for the directory of a lookup.
*/
static int
lookup_lktype(struct vnode *searchdir, struct componentname *cnp)
{
/*
* If the file system supports VOP_LOOKUP() with a shared lock, and
* we are not making any modifications (nameiop LOOKUP) or this is
* not the last component then get a shared lock. Where we can't do
* fast-forwarded lookups (for example with layered file systems)
* then this is the fallback for reducing lock contention.
*/
if ((searchdir->v_mount->mnt_iflag & IMNT_SHRLOOKUP) != 0 &&
(cnp->cn_nameiop == LOOKUP || (cnp->cn_flags & ISLASTCN) == 0)) {
return LK_SHARED;
} else {
return LK_EXCLUSIVE;
}
}
/*
* Call VOP_LOOKUP for a single lookup; return a new search directory
* (used when crossing mountpoints up or searching union mounts down) and
* the found object, which for create operations may be NULL on success.
*
* Note that the new search directory may be null, which means the
* searchdir was unlocked and released. This happens in the common case
* when crossing a mount point downwards, in order to avoid coupling
* locks between different file system volumes. Importantly, this can
* happen even if the call fails. (XXX: this is gross and should be
* tidied somehow.)
*/
static int
lookup_once(struct namei_state *state,
struct vnode *searchdir,
struct vnode **newsearchdir_ret,
struct vnode **foundobj_ret,
bool *newsearchdir_locked_ret)
{
struct vnode *tmpvn; /* scratch vnode */
struct vnode *foundobj; /* result */
struct lwp *l = curlwp;
bool searchdir_locked = false;
int error, lktype;
struct componentname *cnp = state->cnp;
struct nameidata *ndp = state->ndp;
KASSERT(cnp == &ndp->ni_cnd);
*newsearchdir_ret = searchdir;
/*
* Handle "..": two special cases.
* 1. If at root directory (e.g. after chroot)
* or at absolute root directory
* then ignore it so can't get out.
* 1a. If at the root of the emulation filesystem go to the real
* root. So "/../<path>" is always absolute.
* 1b. If we have somehow gotten out of a jail, warn
* and also ignore it so we can't get farther out.
* 2. If this vnode is the root of a mounted
* filesystem, then replace it with the
* vnode which was mounted on so we take the
* .. in the other file system.
*/
if (cnp->cn_flags & ISDOTDOT) {
struct proc *p = l->l_proc;
for (;;) {
if (searchdir == ndp->ni_rootdir ||
searchdir == rootvnode) {
foundobj = searchdir;
vref(foundobj);
*foundobj_ret = foundobj;
if (cnp->cn_flags & LOCKPARENT) {
lktype = lookup_lktype(searchdir, cnp);
vn_lock(searchdir, lktype | LK_RETRY);
searchdir_locked = true;
}
error = 0;
goto done;
}
if (ndp->ni_rootdir != rootvnode) {
int retval;
retval = vn_isunder(searchdir, ndp->ni_rootdir, l);
if (!retval) {
/* Oops! We got out of jail! */
log(LOG_WARNING,
"chrooted pid %d uid %d (%s) "
"detected outside of its chroot\n",
p->p_pid, kauth_cred_geteuid(l->l_cred),
p->p_comm);
/* Put us at the jail root. */
vrele(searchdir);
searchdir = NULL;
foundobj = ndp->ni_rootdir;
vref(foundobj);
vref(foundobj);
*newsearchdir_ret = foundobj;
*foundobj_ret = foundobj;
error = 0;
goto done;
}
}
if ((searchdir->v_vflag & VV_ROOT) == 0 ||
(cnp->cn_flags & NOCROSSMOUNT))
break;
tmpvn = searchdir;
searchdir = searchdir->v_mount->mnt_vnodecovered;
vref(searchdir);
vrele(tmpvn);
*newsearchdir_ret = searchdir;
}
}
lktype = lookup_lktype(searchdir, cnp);
/*
* We now have a segment name to search for, and a directory to search.
* Our vnode state here is that "searchdir" is held.
*/
unionlookup:
foundobj = NULL;
if (!searchdir_locked) {
vn_lock(searchdir, lktype | LK_RETRY);
searchdir_locked = true;
}
error = VOP_LOOKUP(searchdir, &foundobj, cnp);
if (error != 0) {
KASSERTMSG((foundobj == NULL),
"leaf `%s' should be empty but is %p",
cnp->cn_nameptr, foundobj);
#ifdef NAMEI_DIAGNOSTIC
printf("not found\n");
#endif /* NAMEI_DIAGNOSTIC */
/*
* If ENOLCK, the file system needs us to retry the lookup
* with an exclusive lock. It's likely nothing was found in
* cache and/or modifications need to be made.
*/
if (error == ENOLCK) {
KASSERT(VOP_ISLOCKED(searchdir) == LK_SHARED);
KASSERT(searchdir_locked);
if (vn_lock(searchdir, LK_UPGRADE | LK_NOWAIT)) {
VOP_UNLOCK(searchdir);
searchdir_locked = false;
}
lktype = LK_EXCLUSIVE;
goto unionlookup;
}
if ((error == ENOENT) && (searchdir->v_vflag & VV_ROOT) &&
(searchdir->v_mount->mnt_flag & MNT_UNION)) {
tmpvn = searchdir;
searchdir = searchdir->v_mount->mnt_vnodecovered;
vref(searchdir);
vput(tmpvn);
searchdir_locked = false;
*newsearchdir_ret = searchdir;
goto unionlookup;
}
if (error != EJUSTRETURN)
goto done;
/*
* If this was not the last component, or there were trailing
* slashes, and we are not going to create a directory,
* then the name must exist.
*/
if ((cnp->cn_flags & (REQUIREDIR | CREATEDIR)) == REQUIREDIR) {
error = ENOENT;
goto done;
}
/*
* If creating and at end of pathname, then can consider
* allowing file to be created.
*/
if (state->rdonly) {
error = EROFS;
goto done;
}
/*
* We return success and a NULL foundobj to indicate
* that the entry doesn't currently exist, leaving a
* pointer to the (normally, locked) directory vnode
* as searchdir.
*/
*foundobj_ret = NULL;
error = 0;
goto done;
}
#ifdef NAMEI_DIAGNOSTIC
printf("found\n");
#endif /* NAMEI_DIAGNOSTIC */
/* Unlock, unless the caller needs the parent locked. */
if (searchdir != NULL) {
KASSERT(searchdir_locked);
if ((cnp->cn_flags & (ISLASTCN | LOCKPARENT)) !=
(ISLASTCN | LOCKPARENT)) {
VOP_UNLOCK(searchdir);
searchdir_locked = false;
}
} else {
KASSERT(!searchdir_locked);
}
*foundobj_ret = foundobj;
error = 0;
done:
*newsearchdir_locked_ret = searchdir_locked;
return error;
}
/*
* Parse out the first path name component that we need to consider.
*
* While doing this, attempt to use the name cache to fast-forward through
* as many "easy" to find components of the path as possible.
*
* We use the namecache's node locks to form a chain, and avoid as many
* vnode references and locks as possible. In the ideal case, only the
* final vnode will have its reference count adjusted and lock taken.
*/
static int
lookup_fastforward(struct namei_state *state, struct vnode **searchdir_ret,
struct vnode **foundobj_ret)
{
struct componentname *cnp = state->cnp;
struct nameidata *ndp = state->ndp;
krwlock_t *plock;
struct vnode *foundobj, *searchdir;
int error, error2;
size_t oldpathlen;
const char *oldnameptr;
bool terminal;
/*
* Eat as many path name components as possible before giving up and
* letting lookup_once() handle it. Remember the starting point in
* case we can't get vnode references and need to roll back.
*/
plock = NULL;
searchdir = *searchdir_ret;
oldnameptr = cnp->cn_nameptr;
oldpathlen = ndp->ni_pathlen;
terminal = false;
for (;;) {
foundobj = NULL;
/*
* Get the next component name. There should be no slashes
* here, and we shouldn't have looped around if we were
* done.
*/
KASSERT(cnp->cn_nameptr[0] != '/');
KASSERT(cnp->cn_nameptr[0] != '\0');
if ((error = lookup_parsepath(state, searchdir)) != 0) {
break;
}
/*
* Can't deal with DOTDOT lookups if NOCROSSMOUNT or the
* lookup is chrooted.
*/
if ((cnp->cn_flags & ISDOTDOT) != 0) {
if ((searchdir->v_vflag & VV_ROOT) != 0 &&
(cnp->cn_flags & NOCROSSMOUNT)) {
error = EOPNOTSUPP;
break;
}
if (ndp->ni_rootdir != rootvnode) {
error = EOPNOTSUPP;
break;
}
}
/*
* Can't deal with last component when modifying; this needs
* searchdir locked and VOP_LOOKUP() called (which can and
* does modify state, despite the name). NB: this case means
* terminal is never set true when LOCKPARENT.
*/
if ((cnp->cn_flags & ISLASTCN) != 0) {
if (cnp->cn_nameiop != LOOKUP ||
(cnp->cn_flags & LOCKPARENT) != 0) {
error = EOPNOTSUPP;
break;
}
}
/*
* Good, now look for it in cache. cache_lookup_linked()
* will fail if there's nothing there, or if there's no
* ownership info for the directory, or if the user doesn't
* have permission to look up files in this directory.
*/
if (!cache_lookup_linked(searchdir, cnp->cn_nameptr,
cnp->cn_namelen, &foundobj, &plock, cnp->cn_cred)) {
error = EOPNOTSUPP;
break;
}
KASSERT(plock != NULL);
KASSERT(rw_lock_held(plock));
/*
* Scored a hit. Negative is good too (ENOENT). If there's
* a '-o union' mount here, punt and let lookup_once() deal
* with it.
*/
if (foundobj == NULL) {
if ((searchdir->v_vflag & VV_ROOT) != 0 &&
(searchdir->v_mount->mnt_flag & MNT_UNION) != 0) {
error = EOPNOTSUPP;
} else {
error = ENOENT;
terminal = ((cnp->cn_flags & ISLASTCN) != 0);
}
break;
}
/*
* Stop and get a hold on the vnode if we've encountered
* something other than a directory.
*/
if (foundobj->v_type != VDIR) {
error = vcache_tryvget(foundobj);
if (error != 0) {
foundobj = NULL;
error = EOPNOTSUPP;
} else {
terminal = (foundobj->v_type != VLNK &&
(cnp->cn_flags & ISLASTCN) != 0);
}
break;
}
/*
* Try to cross mountpoints, bearing in mind that they can
* be stacked. If at any point we can't go further, stop
* and try to get a reference on the vnode. If we are able
* to get a ref then lookup_crossmount() will take care of
* it, otherwise we'll fall through to lookup_once().
*/
if (foundobj->v_mountedhere != NULL) {
while (foundobj->v_mountedhere != NULL &&
(cnp->cn_flags & NOCROSSMOUNT) == 0 &&
cache_cross_mount(&foundobj, &plock)) {
KASSERT(foundobj != NULL);
KASSERT(foundobj->v_type == VDIR);
}
if (foundobj->v_mountedhere != NULL) {
error = vcache_tryvget(foundobj);
if (error != 0) {
foundobj = NULL;
error = EOPNOTSUPP;
}
break;
} else {
searchdir = NULL;
}
}
/*
* Time to stop if we found the last component & traversed
* all mounts.
*/
if ((cnp->cn_flags & ISLASTCN) != 0) {
error = vcache_tryvget(foundobj);
if (error != 0) {
foundobj = NULL;
error = EOPNOTSUPP;
} else {
terminal = (foundobj->v_type != VLNK);
}
break;
}
/*
* Otherwise, we're still in business. Set the found VDIR
* vnode as the search dir for the next component and
* continue on to it.
*/
cnp->cn_nameptr = ndp->ni_next;
searchdir = foundobj;
}
if (terminal) {
/*
* If we exited the loop above having successfully located
* the last component with a zero error code, and it's not a
* symbolic link, then the parent directory is not needed.
* Release reference to the starting parent and make the
* terminal parent disappear into thin air.
*/
KASSERT(plock != NULL);
rw_exit(plock);
vrele(*searchdir_ret);
*searchdir_ret = NULL;
} else if (searchdir != *searchdir_ret) {
/*
* Otherwise we need to return the parent. If we ended up
* with a new search dir, ref it before dropping the
* namecache's lock. The lock prevents both searchdir and
* foundobj from disappearing. If we can't ref the new
* searchdir, we have a bit of a problem. Roll back the
* fastforward to the beginning and let lookup_once() take
* care of it.
*/
if (searchdir == NULL) {
/*
* It's possible for searchdir to be NULL in the
* case of a root vnode being reclaimed while
* trying to cross a mount.
*/
error2 = EOPNOTSUPP;
} else {
error2 = vcache_tryvget(searchdir);
}
KASSERT(plock != NULL);
rw_exit(plock);
if (__predict_true(error2 == 0)) {
/* Returning new searchdir, and maybe new foundobj. */
vrele(*searchdir_ret);
*searchdir_ret = searchdir;
} else {
/* Returning nothing. */
if (foundobj != NULL) {
vrele(foundobj);
foundobj = NULL;
}
cnp->cn_nameptr = oldnameptr;
ndp->ni_pathlen = oldpathlen;
error = lookup_parsepath(state, *searchdir_ret);
if (error == 0) {
error = EOPNOTSUPP;
}
}
} else if (plock != NULL) {
/* Drop any namecache lock still held. */
rw_exit(plock);
}
KASSERT(error == 0 ? foundobj != NULL : foundobj == NULL);
*foundobj_ret = foundobj;
return error;
}
//////////////////////////////
/*
* Do a complete path search from a single root directory.
* (This is called up to twice if TRYEMULROOT is in effect.)
*/
static int
namei_oneroot(struct namei_state *state,
int neverfollow, int inhibitmagic, int isnfsd)
{
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
struct vnode *searchdir, *foundobj;
bool searchdir_locked = false;
int error;
error = namei_start(state, isnfsd, &searchdir);
if (error) {
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
return error;
}
KASSERT(searchdir->v_type == VDIR);
/*
* Setup: break out flag bits into variables.
*/
state->docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
if (cnp->cn_nameiop == DELETE)
state->docache = 0;
state->rdonly = cnp->cn_flags & RDONLY;
/*
* Keep going until we run out of path components.
*/
cnp->cn_nameptr = ndp->ni_pnbuf;
/* drop leading slashes (already used them to choose startdir) */
while (cnp->cn_nameptr[0] == '/') {
cnp->cn_nameptr++;
ndp->ni_pathlen--;
}
/* was it just "/"? */
if (cnp->cn_nameptr[0] == '\0') {
foundobj = searchdir;
searchdir = NULL;
cnp->cn_flags |= ISLASTCN;
/* bleh */
goto skiploop;
}
for (;;) {
KASSERT(searchdir != NULL);
KASSERT(!searchdir_locked);
/*
* Parse out the first path name component that we need
* to consider. While doing this, attempt to use the name
* cache to fast-forward through as many "easy" to find
* components of the path as possible.
*/
error = lookup_fastforward(state, &searchdir, &foundobj);
/*
* If we didn't get a good answer from the namecache, then
* go directly to the file system.
*/
if (error == EOPNOTSUPP) {
error = lookup_once(state, searchdir, &searchdir,
&foundobj, &searchdir_locked);
}
/*
* If the vnode we found is mounted on, then cross the mount
* and get the root vnode in foundobj. If this encounters
* an error, it will dispose of foundobj, but searchdir is
* untouched.
*/
if (error == 0 && foundobj != NULL &&
foundobj->v_type == VDIR &&
foundobj->v_mountedhere != NULL &&
(cnp->cn_flags & NOCROSSMOUNT) == 0) {
error = lookup_crossmount(state, &searchdir,
&foundobj, &searchdir_locked);
}
if (error) {
if (searchdir != NULL) {
if (searchdir_locked) {
searchdir_locked = false;
vput(searchdir);
} else {
vrele(searchdir);
}
}
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
/*
* Note that if we're doing TRYEMULROOT we can
* retry with the normal root. Where this is
* currently set matches previous practice,
* but the previous practice didn't make much
* sense and somebody should sit down and
* figure out which cases should cause retry
* and which shouldn't. XXX.
*/
state->attempt_retry = 1;
return (error);
}
if (foundobj == NULL) {
/*
* Success with no object returned means we're
* creating something and it isn't already
* there. Break out of the main loop now so
* the code below doesn't have to test for
* foundobj == NULL.
*/
/* lookup_once can't have dropped the searchdir */
KASSERT(searchdir != NULL ||
(cnp->cn_flags & ISLASTCN) != 0);
break;
}
/*
* Check for symbolic link. If we've reached one,
* follow it, unless we aren't supposed to. Back up
* over any slashes that we skipped, as we will need
* them again.
*/
if (namei_atsymlink(state, foundobj)) {
/* Don't need searchdir locked any more. */
if (searchdir_locked) {
searchdir_locked = false;
VOP_UNLOCK(searchdir);
}
ndp->ni_pathlen += state->slashes;
ndp->ni_next -= state->slashes;
if (neverfollow) {
error = EINVAL;
} else if (searchdir == NULL) {
/*
* dholland 20160410: lookup_once only
* drops searchdir if it crossed a
* mount point. Therefore, if we get
* here it means we crossed a mount
* point to a mounted filesystem whose
* root vnode is a symlink. In theory
* we could continue at this point by
* using the pre-crossing searchdir
* (e.g. just take out an extra
* reference on it before calling
* lookup_once so we still have it),
* but this will make an ugly mess and
* it should never happen in practice
* as only badly broken filesystems
* have non-directory root vnodes. (I
* have seen this sort of thing with
* NFS occasionally but even then it
* means something's badly wrong.)
*/
error = ENOTDIR;
} else {
/*
* dholland 20110410: if we're at a
* union mount it might make sense to
* use the top of the union stack here
* rather than the layer we found the
* symlink in. (FUTURE)
*/
error = namei_follow(state, inhibitmagic,
searchdir, foundobj,
&searchdir);
}
if (error) {
KASSERT(searchdir != foundobj);
if (searchdir != NULL) {
vrele(searchdir);
}
vrele(foundobj);
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
return error;
}
vrele(foundobj);
foundobj = NULL;
/*
* If we followed a symlink to `/' and there
* are no more components after the symlink,
* we're done with the loop and what we found
* is the searchdir.
*/
if (cnp->cn_nameptr[0] == '\0') {
KASSERT(searchdir != NULL);
foundobj = searchdir;
searchdir = NULL;
cnp->cn_flags |= ISLASTCN;
break;
}
continue;
}
/*
* Not a symbolic link.
*
* Check for directory, if the component was
* followed by a series of slashes.
*/
if ((foundobj->v_type != VDIR) &&
(cnp->cn_flags & REQUIREDIR)) {
KASSERT(foundobj != searchdir);
if (searchdir) {
if (searchdir_locked) {
searchdir_locked = false;
vput(searchdir);
} else {
vrele(searchdir);
}
} else {
KASSERT(!searchdir_locked);
}
vrele(foundobj);
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
state->attempt_retry = 1;
return ENOTDIR;
}
/*
* Stop if we've reached the last component.
*/
if (cnp->cn_flags & ISLASTCN) {
break;
}
/*
* Continue with the next component.
*/
cnp->cn_nameptr = ndp->ni_next;
if (searchdir != NULL) {
if (searchdir_locked) {
searchdir_locked = false;
vput(searchdir);
} else {
vrele(searchdir);
}
}
searchdir = foundobj;
foundobj = NULL;
}
KASSERT((cnp->cn_flags & LOCKPARENT) == 0 || searchdir == NULL ||
VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE);
skiploop:
if (foundobj != NULL) {
if (foundobj == ndp->ni_erootdir) {
/*
* We are about to return the emulation root.
* This isn't a good idea because code might
* repeatedly lookup ".." until the file
* matches that returned for "/" and loop
* forever. So convert it to the real root.
*/
if (searchdir != NULL) {
if (searchdir_locked) {
vput(searchdir);
searchdir_locked = false;
} else {
vrele(searchdir);
}
searchdir = NULL;
}
vrele(foundobj);
foundobj = ndp->ni_rootdir;
vref(foundobj);
}
/*
* If the caller requested the parent node (i.e. it's
* a CREATE, DELETE, or RENAME), and we don't have one
* (because this is the root directory, or we crossed
* a mount point), then we must fail.
*
* 20210604 dholland when NONEXCLHACK is set (open
* with O_CREAT but not O_EXCL) skip this logic. Since
* we have a foundobj, open will not be creating, so
* it doesn't actually need or use the searchdir, so
* it's ok to return it even if it's on a different
* volume, and it's also ok to return NULL; by setting
* NONEXCLHACK the open code promises to cope with
* those cases correctly. (That is, it should do what
* it would do anyway, that is, just release the
* searchdir, except not crash if it's null.) This is
* needed because otherwise opening mountpoints with
* O_CREAT but not O_EXCL fails... which is a silly
* thing to do but ought to work. (This whole issue
* came to light because 3rd party code wanted to open
* certain procfs nodes with O_CREAT for some 3rd
* party reason, and it failed.)
*
* Note that NONEXCLHACK is properly a different
* nameiop (it is partway between LOOKUP and CREATE)
* but it was stuffed in as a flag instead to make the
* resulting patch less invasive for pullup. Blah.
*/
if (cnp->cn_nameiop != LOOKUP &&
(searchdir == NULL ||
searchdir->v_mount != foundobj->v_mount) &&
(cnp->cn_flags & NONEXCLHACK) == 0) {
if (searchdir) {
if (searchdir_locked) {
vput(searchdir);
searchdir_locked = false;
} else {
vrele(searchdir);
}
searchdir = NULL;
}
vrele(foundobj);
foundobj = NULL;
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
state->attempt_retry = 1;
switch (cnp->cn_nameiop) {
case CREATE:
return EEXIST;
case DELETE:
case RENAME:
return EBUSY;
default:
break;
}
panic("Invalid nameiop\n");
}
/*
* Disallow directory write attempts on read-only lookups.
* Prefers EEXIST over EROFS for the CREATE case.
*/
if (state->rdonly &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
if (searchdir) {
if (searchdir_locked) {
vput(searchdir);
searchdir_locked = false;
} else {
vrele(searchdir);
}
searchdir = NULL;
}
vrele(foundobj);
foundobj = NULL;
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
state->attempt_retry = 1;
return EROFS;
}
/* Lock the leaf node if requested. */
if ((cnp->cn_flags & (LOCKLEAF | LOCKPARENT)) == LOCKPARENT &&
searchdir == foundobj) {
/*
* Note: if LOCKPARENT but not LOCKLEAF is
* set, and searchdir == foundobj, this code
* necessarily unlocks the parent as well as
* the leaf. That is, just because you specify
* LOCKPARENT doesn't mean you necessarily get
* a locked parent vnode. The code in
* vfs_syscalls.c, and possibly elsewhere,
* that uses this combination "knows" this, so
* it can't be safely changed. Feh. XXX
*/
KASSERT(searchdir_locked);
VOP_UNLOCK(searchdir);
searchdir_locked = false;
} else if ((cnp->cn_flags & LOCKLEAF) != 0 && (searchdir != foundobj ||
(cnp->cn_flags & LOCKPARENT) == 0)) {
const int lktype = (cnp->cn_flags & LOCKSHARED) != 0 ?
LK_SHARED : LK_EXCLUSIVE;
vn_lock(foundobj, lktype | LK_RETRY);
}
}
/*
* Done.
*/
/*
* If LOCKPARENT is not set, the parent directory isn't returned.
*/
if ((cnp->cn_flags & LOCKPARENT) == 0 && searchdir != NULL) {
vrele(searchdir);
searchdir = NULL;
}
ndp->ni_dvp = searchdir;
ndp->ni_vp = foundobj;
return 0;
}
/*
* Do namei; wrapper layer that handles TRYEMULROOT.
*/
static int
namei_tryemulroot(struct namei_state *state,
int neverfollow, int inhibitmagic, int isnfsd)
{
int error;
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
const char *savepath = NULL;
KASSERT(cnp == &ndp->ni_cnd);
if (cnp->cn_flags & TRYEMULROOT) {
savepath = pathbuf_stringcopy_get(ndp->ni_pathbuf);
}
emul_retry:
state->attempt_retry = 0;
error = namei_oneroot(state, neverfollow, inhibitmagic, isnfsd);
if (error) {
/*
* Once namei has started up, the existence of ni_erootdir
* tells us whether we're working from an emulation root.
* The TRYEMULROOT flag isn't necessarily authoritative.
*/
if (ndp->ni_erootdir != NULL && state->attempt_retry) {
/* Retry the whole thing using the normal root */
cnp->cn_flags &= ~TRYEMULROOT;
state->attempt_retry = 0;
/* kinda gross */
strcpy(ndp->ni_pathbuf->pb_path, savepath);
pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
savepath = NULL;
goto emul_retry;
}
}
if (savepath != NULL) {
pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
}
return error;
}
/*
* External interface.
*/
int
namei(struct nameidata *ndp)
{
struct namei_state state;
int error;
namei_init(&state, ndp);
error = namei_tryemulroot(&state,
0/*!neverfollow*/, 0/*!inhibitmagic*/,
0/*isnfsd*/);
namei_cleanup(&state);
if (error) {
/* make sure no stray refs leak out */
KASSERT(ndp->ni_dvp == NULL);
KASSERT(ndp->ni_vp == NULL);
}
return error;
}
////////////////////////////////////////////////////////////
/*
* External interface used by nfsd. This is basically different from
* namei only in that it has the ability to pass in the "current
* directory", and uses an extra flag "neverfollow" for which there's
* no physical flag defined in namei.h. (There used to be a cut&paste
* copy of about half of namei in nfsd to allow these minor
* adjustments to exist.)
*
* XXX: the namei interface should be adjusted so nfsd can just use
* ordinary namei().
*/
int
lookup_for_nfsd(struct nameidata *ndp, struct vnode *forcecwd, int neverfollow)
{
struct namei_state state;
int error;
KASSERT(ndp->ni_atdir == NULL);
ndp->ni_atdir = forcecwd;
namei_init(&state, ndp);
error = namei_tryemulroot(&state,
neverfollow, 1/*inhibitmagic*/, 1/*isnfsd*/);
namei_cleanup(&state);
if (error) {
/* make sure no stray refs leak out */
KASSERT(ndp->ni_dvp == NULL);
KASSERT(ndp->ni_vp == NULL);
}
return error;
}
/*
* A second external interface used by nfsd. This turns out to be a
* single lookup used by the WebNFS code (ha!) to get "index.html" or
* equivalent when asked for a directory. It should eventually evolve
* into some kind of namei_once() call; for the time being it's kind
* of a mess. XXX.
*
* dholland 20110109: I don't think it works, and I don't think it
* worked before I started hacking and slashing either, and I doubt
* anyone will ever notice.
*/
/*
* Internals. This calls lookup_once() after setting up the assorted
* pieces of state the way they ought to be.
*/
static int
do_lookup_for_nfsd_index(struct namei_state *state)
{
int error;
struct componentname *cnp = state->cnp;
struct nameidata *ndp = state->ndp;
struct vnode *startdir;
struct vnode *foundobj;
bool startdir_locked;
const char *cp; /* pointer into pathname argument */
KASSERT(cnp == &ndp->ni_cnd);
startdir = state->ndp->ni_atdir;
cnp->cn_nameptr = ndp->ni_pnbuf;
state->docache = 1;
state->rdonly = cnp->cn_flags & RDONLY;
ndp->ni_dvp = NULL;
error = VOP_PARSEPATH(startdir, cnp->cn_nameptr, &cnp->cn_namelen);
if (error) {
return error;
}
cp = cnp->cn_nameptr + cnp->cn_namelen;
KASSERT(cnp->cn_namelen <= KERNEL_NAME_MAX);
ndp->ni_pathlen -= cnp->cn_namelen;
ndp->ni_next = cp;
state->slashes = 0;
cnp->cn_flags &= ~REQUIREDIR;
cnp->cn_flags |= MAKEENTRY|ISLASTCN;
if (cnp->cn_namelen == 2 &&
cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
cnp->cn_flags |= ISDOTDOT;
else
cnp->cn_flags &= ~ISDOTDOT;
/*
* Because lookup_once can change the startdir, we need our
* own reference to it to avoid consuming the caller's.
*/
vref(startdir);
error = lookup_once(state, startdir, &startdir, &foundobj,
&startdir_locked);
KASSERT((cnp->cn_flags & LOCKPARENT) == 0);
if (startdir_locked) {
VOP_UNLOCK(startdir);
startdir_locked = false;
}
/*
* If the vnode we found is mounted on, then cross the mount and get
* the root vnode in foundobj. If this encounters an error, it will
* dispose of foundobj, but searchdir is untouched.
*/
if (error == 0 && foundobj != NULL &&
foundobj->v_type == VDIR &&
foundobj->v_mountedhere != NULL &&
(cnp->cn_flags & NOCROSSMOUNT) == 0) {
error = lookup_crossmount(state, &startdir, &foundobj,
&startdir_locked);
}
/* Now toss startdir and see if we have an error. */
if (startdir != NULL)
vrele(startdir);
if (error)
foundobj = NULL;
else if (foundobj != NULL && (cnp->cn_flags & LOCKLEAF) != 0)
vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
ndp->ni_vp = foundobj;
return (error);
}
/*
* External interface. The partitioning between this function and the
* above isn't very clear - the above function exists mostly so code
* that uses "state->" can be shuffled around without having to change
* it to "state.".
*/
int
lookup_for_nfsd_index(struct nameidata *ndp, struct vnode *startdir)
{
struct namei_state state;
int error;
KASSERT(ndp->ni_atdir == NULL);
ndp->ni_atdir = startdir;
/*
* Note: the name sent in here (is not|should not be) allowed
* to contain a slash.
*/
if (strlen(ndp->ni_pathbuf->pb_path) > KERNEL_NAME_MAX) {
return ENAMETOOLONG;
}
if (strchr(ndp->ni_pathbuf->pb_path, '/')) {
return EINVAL;
}
ndp->ni_pathlen = strlen(ndp->ni_pathbuf->pb_path) + 1;
ndp->ni_pnbuf = NULL;
ndp->ni_cnd.cn_nameptr = NULL;
namei_init(&state, ndp);
error = do_lookup_for_nfsd_index(&state);
namei_cleanup(&state);
return error;
}
////////////////////////////////////////////////////////////
/*
* Reacquire a path name component.
* dvp is locked on entry and exit.
* *vpp is locked on exit unless it's NULL.
*/
int
relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int dummy)
{
int rdonly; /* lookup read-only flag bit */
int error = 0;
#ifdef DEBUG
size_t newlen; /* DEBUG: check name len */
const char *cp; /* DEBUG: check name ptr */
#endif /* DEBUG */
(void)dummy;
/*
* Setup: break out flag bits into variables.
*/
rdonly = cnp->cn_flags & RDONLY;
/*
* Search a new directory.
*
* The cn_hash value is for use by vfs_cache.
* The last component of the filename is left accessible via
* cnp->cn_nameptr for callers that need the name. Callers needing
* the name set the SAVENAME flag. When done, they assume
* responsibility for freeing the pathname buffer.
*/
#ifdef DEBUG
#if 0
cp = NULL;
newhash = namei_hash(cnp->cn_nameptr, &cp);
if ((uint32_t)newhash != (uint32_t)cnp->cn_hash)
panic("relookup: bad hash");
#endif
error = VOP_PARSEPATH(dvp, cnp->cn_nameptr, &newlen);
if (error) {
panic("relookup: parsepath failed with error %d", error);
}
if (cnp->cn_namelen != newlen)
panic("relookup: bad len");
cp = cnp->cn_nameptr + cnp->cn_namelen;
while (*cp == '/')
cp++;
if (*cp != 0)
panic("relookup: not last component");
#endif /* DEBUG */
/*
* Check for degenerate name (e.g. / or "")
* which is a way of talking about a directory,
* e.g. like "/." or ".".
*/
if (cnp->cn_nameptr[0] == '\0')
panic("relookup: null name");
if (cnp->cn_flags & ISDOTDOT)
panic("relookup: lookup on dot-dot");
/*
* We now have a segment name to search for, and a directory to search.
*/
*vpp = NULL;
error = VOP_LOOKUP(dvp, vpp, cnp);
if (error != 0) {
KASSERTMSG((*vpp == NULL),
"leaf `%s' should be empty but is %p",
cnp->cn_nameptr, *vpp);
if (error != EJUSTRETURN)
goto bad;
}
/*
* Check for symbolic link
*/
KASSERTMSG((*vpp == NULL || (*vpp)->v_type != VLNK ||
(cnp->cn_flags & FOLLOW) == 0),
"relookup: symlink found");
/*
* Check for read-only lookups.
*/
if (rdonly && cnp->cn_nameiop != LOOKUP) {
error = EROFS;
if (*vpp) {
vrele(*vpp);
}
goto bad;
}
/*
* Lock result.
*/
if (*vpp && *vpp != dvp) {
error = vn_lock(*vpp, LK_EXCLUSIVE);
if (error != 0) {
vrele(*vpp);
goto bad;
}
}
return (0);
bad:
*vpp = NULL;
return (error);
}
/*
* namei_simple - simple forms of namei.
*
* These are wrappers to allow the simple case callers of namei to be
* left alone while everything else changes under them.
*/
/* Flags */
struct namei_simple_flags_type {
int dummy;
};
static const struct namei_simple_flags_type ns_nn, ns_nt, ns_fn, ns_ft;
const namei_simple_flags_t NSM_NOFOLLOW_NOEMULROOT = &ns_nn;
const namei_simple_flags_t NSM_NOFOLLOW_TRYEMULROOT = &ns_nt;
const namei_simple_flags_t NSM_FOLLOW_NOEMULROOT = &ns_fn;
const namei_simple_flags_t NSM_FOLLOW_TRYEMULROOT = &ns_ft;
static
int
namei_simple_convert_flags(namei_simple_flags_t sflags)
{
if (sflags == NSM_NOFOLLOW_NOEMULROOT)
return NOFOLLOW | 0;
if (sflags == NSM_NOFOLLOW_TRYEMULROOT)
return NOFOLLOW | TRYEMULROOT;
if (sflags == NSM_FOLLOW_NOEMULROOT)
return FOLLOW | 0;
if (sflags == NSM_FOLLOW_TRYEMULROOT)
return FOLLOW | TRYEMULROOT;
panic("namei_simple_convert_flags: bogus sflags\n");
return 0;
}
int
namei_simple_kernel(const char *path, namei_simple_flags_t sflags,
struct vnode **vp_ret)
{
return nameiat_simple_kernel(NULL, path, sflags, vp_ret);
}
int
nameiat_simple_kernel(struct vnode *dvp, const char *path,
namei_simple_flags_t sflags, struct vnode **vp_ret)
{
struct nameidata nd;
struct pathbuf *pb;
int err;
pb = pathbuf_create(path);
if (pb == NULL) {
return ENOMEM;
}
NDINIT(&nd,
LOOKUP,
namei_simple_convert_flags(sflags),
pb);
if (dvp != NULL)
NDAT(&nd, dvp);
err = namei(&nd);
if (err != 0) {
pathbuf_destroy(pb);
return err;
}
*vp_ret = nd.ni_vp;
pathbuf_destroy(pb);
return 0;
}
int
namei_simple_user(const char *path, namei_simple_flags_t sflags,
struct vnode **vp_ret)
{
return nameiat_simple_user(NULL, path, sflags, vp_ret);
}
int
nameiat_simple_user(struct vnode *dvp, const char *path,
namei_simple_flags_t sflags, struct vnode **vp_ret)
{
struct pathbuf *pb;
struct nameidata nd;
int err;
err = pathbuf_copyin(path, &pb);
if (err) {
return err;
}
NDINIT(&nd,
LOOKUP,
namei_simple_convert_flags(sflags),
pb);
if (dvp != NULL)
NDAT(&nd, dvp);
err = namei(&nd);
if (err != 0) {
pathbuf_destroy(pb);
return err;
}
*vp_ret = nd.ni_vp;
pathbuf_destroy(pb);
return 0;
}
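/*
 * Illustrative sketch (not part of the original source): roughly how an
 * in-kernel caller might use namei_simple_kernel() to resolve a fixed
 * path to a vnode.  The path string, the function name and the use made
 * of the vnode are hypothetical; namei_simple returns the vnode
 * referenced and unlocked, so the caller must vrele() it when done.
 */
#if 0
static int
example_lookup_conf(struct vnode **vpp)
{
int error;

/* Follow symlinks; do not consider any emulation root. */
error = namei_simple_kernel("/etc/example.conf",
NSM_FOLLOW_NOEMULROOT, vpp);
if (error != 0)
return error;

/* ... vn_lock() and inspect *vpp as needed, then ... */
vrele(*vpp);
*vpp = NULL;
return 0;
}
#endif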
/* $NetBSD: genfs_vnops.c,v 1.220 2023/03/03 10:02:51 hannken Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.220 2023/03/03 10:02:51 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/fstrans.h>
#include <sys/namei.h>
#include <sys/vnode_impl.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/poll.h>
#include <sys/mman.h>
#include <sys/file.h>
#include <sys/kauth.h>
#include <sys/stat.h>
#include <sys/extattr.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
#include <miscfs/specfs/specdev.h>
static void filt_genfsdetach(struct knote *);
static int filt_genfsread(struct knote *, long);
static int filt_genfsvnode(struct knote *, long);
/*
* Find the end of the first path component in NAME and return its
* length.
*/
int
genfs_parsepath(void *v)
{
struct vop_parsepath_args /* {
struct vnode *a_dvp;
const char *a_name;
size_t *a_retval;
} */ *ap = v;
const char *name = ap->a_name;
size_t pos;
(void)ap->a_dvp;
pos = 0;
while (name[pos] != '\0' && name[pos] != '/') {
pos++;
}
*ap->a_retval = pos;
return 0;
}
int
genfs_poll(void *v)
{
struct vop_poll_args /* {
struct vnode *a_vp;
int a_events;
struct lwp *a_l;
} */ *ap = v;
return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
int
genfs_seek(void *v)
{
struct vop_seek_args /* {
struct vnode *a_vp;
off_t a_oldoff;
off_t a_newoff;
kauth_cred_t cred;
} */ *ap = v;
if (ap->a_newoff < 0)
return (EINVAL);
return (0);
}
int
genfs_abortop(void *v)
{
struct vop_abortop_args /* {
struct vnode *a_dvp;
struct componentname *a_cnp;
} */ *ap = v;
(void)ap;
return (0);
}
int
genfs_fcntl(void *v)
{
struct vop_fcntl_args /* {
struct vnode *a_vp;
u_int a_command;
void *a_data;
int a_fflag;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
if (ap->a_command == F_SETFL)
return (0);
else
return (EOPNOTSUPP);
}
/*ARGSUSED*/
int
genfs_badop(void *v)
{
panic("genfs: bad op");
}
/*ARGSUSED*/
int
genfs_nullop(void *v)
{
return (0);
}
/*ARGSUSED*/
int
genfs_einval(void *v)
{
return (EINVAL);
}
int
genfs_erofs_link(void *v)
{
/* also for symlink */
struct vop_link_v2_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap = v;
VOP_ABORTOP(ap->a_dvp, ap->a_cnp);
return EROFS;
}
/*
* Called when an fs doesn't support a particular vop.
* This takes care to vrele, vput, or vunlock passed in vnodes
* and calls VOP_ABORTOP for a componentname (in non-rename VOP).
*/
int
genfs_eopnotsupp(void *v)
{
struct vop_generic_args /*
struct vnodeop_desc *a_desc;
/ * other random data follows, presumably * /
} */ *ap = v;
struct vnodeop_desc *desc = ap->a_desc;
struct vnode *vp, *vp_last = NULL;
int flags, i, j, offset_cnp, offset_vp;
KASSERT(desc->vdesc_offset != VOP_LOOKUP_DESCOFFSET);
KASSERT(desc->vdesc_offset != VOP_ABORTOP_DESCOFFSET);
/*
* Abort any componentname that lookup potentially left state in.
*
* As is logical, componentnames for VOP_RENAME are handled by
* the caller of VOP_RENAME. Yay, rename!
*/
if (desc->vdesc_offset != VOP_RENAME_DESCOFFSET &&
(offset_vp = desc->vdesc_vp_offsets[0]) != VDESC_NO_OFFSET &&
(offset_cnp = desc->vdesc_componentname_offset) != VDESC_NO_OFFSET){
struct componentname *cnp;
struct vnode *dvp;
dvp = *VOPARG_OFFSETTO(struct vnode **, offset_vp, ap);
cnp = *VOPARG_OFFSETTO(struct componentname **, offset_cnp, ap);
VOP_ABORTOP(dvp, cnp);
}
flags = desc->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; flags >>=1, i++) {
if ((offset_vp = desc->vdesc_vp_offsets[i]) == VDESC_NO_OFFSET)
break; /* stop at end of list */
if ((j = flags & VDESC_VP0_WILLPUT)) {
vp = *VOPARG_OFFSETTO(struct vnode **, offset_vp, ap);
/* Skip if NULL */
if (!vp)
continue;
switch (j) {
case VDESC_VP0_WILLPUT:
/* Check for dvp == vp cases */
if (vp == vp_last)
vrele(vp);
else {
vput(vp);
vp_last = vp;
}
break;
case VDESC_VP0_WILLRELE:
vrele(vp);
break;
}
}
}
return (EOPNOTSUPP);
}
/*ARGSUSED*/
int
genfs_ebadf(void *v)
{
return (EBADF);
}
/* ARGSUSED */
int
genfs_enoioctl(void *v)
{
return (EPASSTHROUGH);
}
/*
* Eliminate all activity associated with the requested vnode
* and with all vnodes aliased to the requested vnode.
*/
int
genfs_revoke(void *v)
{
struct vop_revoke_args /* {
struct vnode *a_vp;
int a_flags;
} */ *ap = v;
#ifdef DIAGNOSTIC
if ((ap->a_flags & REVOKEALL) == 0)
panic("genfs_revoke: not revokeall");
#endif
vrevoke(ap->a_vp);
return (0);
}
/*
* Lock the node (for deadfs).
*/
int
genfs_deadlock(void *v)
{
struct vop_lock_args /* {
struct vnode *a_vp;
int a_flags;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
int flags = ap->a_flags;
krw_t op;
if (! ISSET(flags, LK_RETRY))
return ENOENT;
if (ISSET(flags, LK_DOWNGRADE)) {
rw_downgrade(&vip->vi_lock);
} else if (ISSET(flags, LK_UPGRADE)) {
KASSERT(ISSET(flags, LK_NOWAIT));
if (!rw_tryupgrade(&vip->vi_lock)) {
return EBUSY;
}
} else if ((flags & (LK_EXCLUSIVE | LK_SHARED)) != 0) {
op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER);
if (ISSET(flags, LK_NOWAIT)) {
if (!rw_tryenter(&vip->vi_lock, op))
return EBUSY;
} else {
rw_enter(&vip->vi_lock, op);
}
}
VSTATE_ASSERT_UNLOCKED(vp, VS_RECLAIMED);
return 0;
}
/*
* Unlock the node (for deadfs).
*/
int
genfs_deadunlock(void *v)
{
struct vop_unlock_args /* {
struct vnode *a_vp;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
rw_exit(&vip->vi_lock);
return 0;
}
/*
* Lock the node.
*/
int
genfs_lock(void *v)
{
struct vop_lock_args /* {
struct vnode *a_vp;
int a_flags;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
int flags = ap->a_flags;
krw_t op;
if (ISSET(flags, LK_DOWNGRADE)) {
rw_downgrade(&vip->vi_lock);
} else if (ISSET(flags, LK_UPGRADE)) {
KASSERT(ISSET(flags, LK_NOWAIT));
if (!rw_tryupgrade(&vip->vi_lock)) {
return EBUSY;
}
} else if ((flags & (LK_EXCLUSIVE | LK_SHARED)) != 0) {
op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER);
if (ISSET(flags, LK_NOWAIT)) {
if (!rw_tryenter(&vip->vi_lock, op))
return EBUSY;
} else {
rw_enter(&vip->vi_lock, op);
}
}
VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
return 0;
}
/*
* Unlock the node.
*/
int
genfs_unlock(void *v)
{
struct vop_unlock_args /* {
struct vnode *a_vp;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
rw_exit(&vip->vi_lock);
return 0;
}
/*
* Return whether or not the node is locked.
*/
int
genfs_islocked(void *v)
{
struct vop_islocked_args /* {
struct vnode *a_vp;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
if (rw_write_held(&vip->vi_lock))
return LK_EXCLUSIVE;
if (rw_read_held(&vip->vi_lock))
return LK_SHARED;
return 0;
}
int
genfs_mmap(void *v)
{
return (0);
}
/*
* VOP_PUTPAGES() for vnodes which never have pages.
*/
int
genfs_null_putpages(void *v)
{
struct vop_putpages_args /* {
struct vnode *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
KASSERT(vp->v_uobj.uo_npages == 0);
rw_exit(vp->v_uobj.vmobjlock);
return (0);
}
void
genfs_node_init(struct vnode *vp, const struct genfs_ops *ops)
{
struct genfs_node *gp = VTOG(vp);
rw_init(&gp->g_glock);
gp->g_op = ops;
}
void
genfs_node_destroy(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
rw_destroy(&gp->g_glock);
}
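/*
 * Round "size" up to the end of the enclosing filesystem block
 * (as given by mnt_fs_bshift) and return the result via *eobp;
 * "flags" is unused here.
 */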
void
genfs_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
{
int bsize;
bsize = 1 << vp->v_mount->mnt_fs_bshift;
*eobp = (size + bsize - 1) & ~(bsize - 1);
}
static void
filt_genfsdetach(struct knote *kn)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
vn_knote_detach(vp, kn);
}
static int
filt_genfsread(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
int rv;
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
switch (hint) {
case NOTE_REVOKE:
KASSERT(mutex_owned(vp->v_interlock));
knote_set_eof(kn, EV_ONESHOT);
return (1);
case 0:
mutex_enter(vp->v_interlock);
kn->kn_data = vp->v_size - ((file_t *)kn->kn_obj)->f_offset;
rv = (kn->kn_data != 0);
mutex_exit(vp->v_interlock);
return rv;
default:
KASSERT(mutex_owned(vp->v_interlock));
kn->kn_data = vp->v_size - ((file_t *)kn->kn_obj)->f_offset;
return (kn->kn_data != 0);
}
}
static int
filt_genfswrite(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
switch (hint) {
case NOTE_REVOKE:
KASSERT(mutex_owned(vp->v_interlock));
knote_set_eof(kn, EV_ONESHOT);
return (1);
case 0:
mutex_enter(vp->v_interlock);
kn->kn_data = 0;
mutex_exit(vp->v_interlock);
return 1;
default:
KASSERT(mutex_owned(vp->v_interlock));
kn->kn_data = 0;
return 1;
}
}
static int
filt_genfsvnode(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
int fflags;
switch (hint) {
case NOTE_REVOKE:
KASSERT(mutex_owned(vp->v_interlock));
knote_set_eof(kn, 0);
if ((kn->kn_sfflags & hint) != 0)
kn->kn_fflags |= hint;
return (1);
case 0:
mutex_enter(vp->v_interlock);
fflags = kn->kn_fflags;
mutex_exit(vp->v_interlock);
break;
default:
KASSERT(mutex_owned(vp->v_interlock));
if ((kn->kn_sfflags & hint) != 0)
kn->kn_fflags |= hint;
fflags = kn->kn_fflags;
break;
}
return (fflags != 0);
}
static const struct filterops genfsread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_genfsdetach,
.f_event = filt_genfsread,
};
static const struct filterops genfswrite_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_genfsdetach,
.f_event = filt_genfswrite,
};
static const struct filterops genfsvnode_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_genfsdetach,
.f_event = filt_genfsvnode,
};
int
genfs_kqfilter(void *v)
{
struct vop_kqfilter_args /* {
struct vnode *a_vp;
struct knote *a_kn;
} */ *ap = v;
struct vnode *vp;
struct knote *kn;
vp = ap->a_vp;
kn = ap->a_kn;
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &genfsread_filtops;
break;
case EVFILT_WRITE:
kn->kn_fop = &genfswrite_filtops;
break;
case EVFILT_VNODE:
kn->kn_fop = &genfsvnode_filtops;
break;
default:
return (EINVAL);
}
kn->kn_hook = vp;
vn_knote_attach(vp, kn);
return (0);
}
void
genfs_node_wrlock(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
rw_enter(&gp->g_glock, RW_WRITER);
}
void
genfs_node_rdlock(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
rw_enter(&gp->g_glock, RW_READER);
}
int
genfs_node_rdtrylock(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
return rw_tryenter(&gp->g_glock, RW_READER);
}
void
genfs_node_unlock(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
rw_exit(&gp->g_glock);
}
int
genfs_node_wrlocked(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
return rw_write_held(&gp->g_glock);
}
/*
* Common filesystem object access control check routine. Accepts a
* vnode, cred, uid, gid, mode, acl, requested access mode.
* Returns 0 on success, or an errno on failure.
*/
int
genfs_can_access(vnode_t *vp, kauth_cred_t cred, uid_t file_uid, gid_t file_gid,
mode_t file_mode, struct acl *acl, accmode_t accmode)
{
accmode_t dac_granted;
int error;
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0);
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE));
/*
* Look for a normal, non-privileged way to access the file/directory
* as requested. If it exists, go with that.
*/
dac_granted = 0;
/* Check the owner. */
if (kauth_cred_geteuid(cred) == file_uid) {
dac_granted |= VADMIN;
if (file_mode & S_IXUSR)
dac_granted |= VEXEC;
if (file_mode & S_IRUSR)
dac_granted |= VREAD;
if (file_mode & S_IWUSR)
dac_granted |= (VWRITE | VAPPEND);
goto privchk;
}
/* Otherwise, check the groups. */
error = kauth_cred_groupmember(cred, file_gid);
if (error > 0)
return error;
if (error == 0) {
if (file_mode & S_IXGRP)
dac_granted |= VEXEC;
if (file_mode & S_IRGRP)
dac_granted |= VREAD;
if (file_mode & S_IWGRP)
dac_granted |= (VWRITE | VAPPEND);
goto privchk;
}
/* Otherwise, check everyone else. */
if (file_mode & S_IXOTH)
dac_granted |= VEXEC;
if (file_mode & S_IROTH)
dac_granted |= VREAD;
if (file_mode & S_IWOTH)
dac_granted |= (VWRITE | VAPPEND);
privchk:
if ((accmode & dac_granted) == accmode)
return 0;
return (accmode & VADMIN) ? EPERM : EACCES;
}
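/*
 * Illustrative sketch (not part of the original source): a minimal way
 * a file system's access check could lean on genfs_can_access() for the
 * plain mode-bit decision.  The uid/gid/mode parameters stand in for
 * whatever the file system keeps in its own inode structure.
 */
#if 0
static int
example_access(vnode_t *vp, accmode_t accmode, kauth_cred_t cred,
    uid_t uid, gid_t gid, mode_t mode)
{
/*
 * Plain discretionary decision; file systems normally feed this
 * result into kauth_authorize_vnode() as the fall-back decision so
 * that the active secmodel (e.g. suser) may still grant access.
 */
return genfs_can_access(vp, cred, uid, gid, mode, NULL, accmode);
}
#endif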
/*
* Implement a version of genfs_can_access() that understands POSIX.1e ACL
* semantics;
* the access ACL has already been prepared for evaluation by the file system
* and is passed via 'uid', 'gid', and 'acl'. Return 0 on success, else an
* errno value.
*/
int
genfs_can_access_acl_posix1e(vnode_t *vp, kauth_cred_t cred, uid_t file_uid,
gid_t file_gid, mode_t file_mode, struct acl *acl, accmode_t accmode)
{
struct acl_entry *acl_other, *acl_mask;
accmode_t dac_granted;
accmode_t acl_mask_granted;
int group_matched, i;
int error;
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0);
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE));
/*
* The owner matches if the effective uid associated with the
* credential matches that of the ACL_USER_OBJ entry. While we're
* doing the first scan, also cache the location of the ACL_MASK and
* ACL_OTHER entries, preventing some future iterations.
*/
acl_mask = acl_other = NULL;
for (i = 0; i < acl->acl_cnt; i++) {
struct acl_entry *ae = &acl->acl_entry[i];
switch (ae->ae_tag) {
case ACL_USER_OBJ:
if (kauth_cred_geteuid(cred) != file_uid)
break;
dac_granted = 0;
dac_granted |= VADMIN;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
goto out;
case ACL_MASK:
acl_mask = ae;
break;
case ACL_OTHER:
acl_other = ae;
break;
default:
break;
}
}
/*
* An ACL_OTHER entry should always exist in a valid access ACL. If
* it doesn't, then generate a serious failure. For now, this means
* a debugging message and EPERM, but in the future should probably
* be a panic.
*/
if (acl_other == NULL) {
/*
* XXX This should never happen
*/
printf("%s: ACL_OTHER missing\n", __func__);
return EPERM;
}
/*
* Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields are
* masked by an ACL_MASK entry, if any. As such, first identify the
* ACL_MASK field, then iterate through identifying potential user
* matches, then group matches. If there is no ACL_MASK, assume that
* the mask allows all requests to succeed.
*/
if (acl_mask != NULL) {
acl_mask_granted = 0;
if (acl_mask->ae_perm & ACL_EXECUTE)
acl_mask_granted |= VEXEC;
if (acl_mask->ae_perm & ACL_READ)
acl_mask_granted |= VREAD;
if (acl_mask->ae_perm & ACL_WRITE)
acl_mask_granted |= (VWRITE | VAPPEND);
} else
acl_mask_granted = VEXEC | VREAD | VWRITE | VAPPEND;
/*
* Check ACL_USER ACL entries. There will either be one or no
* matches; if there is one, we accept or reject based on the
* match; otherwise, we continue on to groups.
*/
for (i = 0; i < acl->acl_cnt; i++) {
struct acl_entry *ae = &acl->acl_entry[i];
switch (ae->ae_tag) {
case ACL_USER:
if (kauth_cred_geteuid(cred) != ae->ae_id)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
goto out;
}
}
/*
* Group match is best-match, not first-match, so find a "best"
* match. Iterate across, testing each potential group match. Make
* sure we keep track of whether we found a match or not, so that we
* know if we should try again with any available privilege, or if we
* should move on to ACL_OTHER.
*/
group_matched = 0;
for (i = 0; i < acl->acl_cnt; i++) {
struct acl_entry *ae = &acl->acl_entry[i];
switch (ae->ae_tag) {
case ACL_GROUP_OBJ:
error = kauth_cred_groupmember(cred, file_gid);
if (error > 0)
return error;
if (error)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
if ((accmode & dac_granted) == accmode)
return 0;
group_matched = 1;
break;
case ACL_GROUP:
error = kauth_cred_groupmember(cred, ae->ae_id);
if (error > 0)
return error;
if (error)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
if ((accmode & dac_granted) == accmode)
return 0;
group_matched = 1;
break;
default:
break;
}
}
if (group_matched == 1) {
/*
* There was a match, but it did not grant rights via pure
* DAC. Try again, this time with privilege.
*/
for (i = 0; i < acl->acl_cnt; i++) {
struct acl_entry *ae = &acl->acl_entry[i];
switch (ae->ae_tag) {
case ACL_GROUP_OBJ:
error = kauth_cred_groupmember(cred, file_gid);
if (error > 0)
return error;
if (error)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
goto out;
case ACL_GROUP:
error = kauth_cred_groupmember(cred, ae->ae_id);
if (error > 0)
return error;
if (error)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
goto out;
default:
break;
}
}
/*
* Even with privilege, group membership was not sufficient.
* Return failure.
*/
dac_granted = 0;
goto out;
}
/*
* Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER.
*/
dac_granted = 0;
if (acl_other->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (acl_other->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (acl_other->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
out:
if ((accmode & dac_granted) == accmode)
return 0;
return (accmode & VADMIN) ? EPERM : EACCES;
}
static struct {
accmode_t accmode;
int mask;
} accmode2mask[] = {
{ VREAD, ACL_READ_DATA },
{ VWRITE, ACL_WRITE_DATA },
{ VAPPEND, ACL_APPEND_DATA },
{ VEXEC, ACL_EXECUTE },
{ VREAD_NAMED_ATTRS, ACL_READ_NAMED_ATTRS },
{ VWRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS },
{ VDELETE_CHILD, ACL_DELETE_CHILD },
{ VREAD_ATTRIBUTES, ACL_READ_ATTRIBUTES },
{ VWRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES },
{ VDELETE, ACL_DELETE },
{ VREAD_ACL, ACL_READ_ACL },
{ VWRITE_ACL, ACL_WRITE_ACL },
{ VWRITE_OWNER, ACL_WRITE_OWNER },
{ VSYNCHRONIZE, ACL_SYNCHRONIZE },
{ 0, 0 },
};
static int
_access_mask_from_accmode(accmode_t accmode)
{
int access_mask = 0, i;
for (i = 0; accmode2mask[i].accmode != 0; i++) {
if (accmode & accmode2mask[i].accmode)
access_mask |= accmode2mask[i].mask;
}
/*
* VAPPEND is just a modifier for VWRITE; if the caller asked
* for 'VAPPEND | VWRITE', we want to check for ACL_APPEND_DATA only.
*/
if (access_mask & ACL_APPEND_DATA)
access_mask &= ~ACL_WRITE_DATA;
return (access_mask);
}
/*
* Return 0, iff access is allowed, 1 otherwise.
*/
static int
_acl_denies(const struct acl *aclp, int access_mask, kauth_cred_t cred,
int file_uid, int file_gid, int *denied_explicitly)
{
int i, error;
const struct acl_entry *ae;
if (denied_explicitly != NULL)
*denied_explicitly = 0;
KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES);
for (i = 0; i < aclp->acl_cnt; i++) {
ae = &(aclp->acl_entry[i]);
if (ae->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
ae->ae_entry_type != ACL_ENTRY_TYPE_DENY)
continue;
if (ae->ae_flags & ACL_ENTRY_INHERIT_ONLY)
continue;
switch (ae->ae_tag) {
case ACL_USER_OBJ:
if (kauth_cred_geteuid(cred) != file_uid)
continue;
break;
case ACL_USER:
if (kauth_cred_geteuid(cred) != ae->ae_id)
continue;
break;
case ACL_GROUP_OBJ:
error = kauth_cred_groupmember(cred, file_gid);
if (error > 0)
return error;
if (error != 0)
continue;
break;
case ACL_GROUP:
error = kauth_cred_groupmember(cred, ae->ae_id);
if (error > 0)
return error;
if (error != 0)
continue;
break;
default:
KASSERT(ae->ae_tag == ACL_EVERYONE);
}
if (ae->ae_entry_type == ACL_ENTRY_TYPE_DENY) {
if (ae->ae_perm & access_mask) {
if (denied_explicitly != NULL)
*denied_explicitly = 1;
return (1);
}
}
access_mask &= ~(ae->ae_perm);
if (access_mask == 0)
return (0);
}
if (access_mask == 0)
return (0);
return (1);
}
int
genfs_can_access_acl_nfs4(vnode_t *vp, kauth_cred_t cred, uid_t file_uid,
gid_t file_gid, mode_t file_mode, struct acl *aclp, accmode_t accmode)
{
int denied, explicitly_denied, access_mask, is_directory,
must_be_owner = 0;
file_mode = 0;
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND |
VEXPLICIT_DENY | VREAD_NAMED_ATTRS | VWRITE_NAMED_ATTRS |
VDELETE_CHILD | VREAD_ATTRIBUTES | VWRITE_ATTRIBUTES | VDELETE |
VREAD_ACL | VWRITE_ACL | VWRITE_OWNER | VSYNCHRONIZE)) == 0);
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE));
if (accmode & VADMIN)
must_be_owner = 1;
/*
* Ignore VSYNCHRONIZE permission.
*/
accmode &= ~VSYNCHRONIZE;
access_mask = _access_mask_from_accmode(accmode);
if (vp && vp->v_type == VDIR)
is_directory = 1;
else
is_directory = 0;
/*
* File owner is always allowed to read and write the ACL
* and basic attributes. This is to prevent a situation
* where user would change ACL in a way that prevents him
* from undoing the change.
*/
if (kauth_cred_geteuid(cred) == file_uid)
access_mask &= ~(ACL_READ_ACL | ACL_WRITE_ACL |
ACL_READ_ATTRIBUTES | ACL_WRITE_ATTRIBUTES);
/*
* Ignore append permission for regular files; use write
* permission instead.
*/
if (!is_directory && (access_mask & ACL_APPEND_DATA)) {
access_mask &= ~ACL_APPEND_DATA;
access_mask |= ACL_WRITE_DATA;
}
denied = _acl_denies(aclp, access_mask, cred, file_uid, file_gid,
&explicitly_denied);
if (must_be_owner) {
if (kauth_cred_geteuid(cred) != file_uid)
denied = EPERM;
}
/*
* For VEXEC, ensure that at least one execute bit is set for
* non-directories. We have to check the mode here to stay
* consistent with execve(2). See the test in
* exec_check_permissions().
*/
__acl_nfs4_sync_mode_from_acl(&file_mode, aclp);
if (!denied && !is_directory && (accmode & VEXEC) &&
(file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
denied = EACCES;
if (!denied)
return (0);
/*
* Access failed. Iff it was not denied explicitly and
* VEXPLICIT_DENY flag was specified, allow access.
*/
if ((accmode & VEXPLICIT_DENY) && explicitly_denied == 0)
return (0);
accmode &= ~VEXPLICIT_DENY;
if (accmode & (VADMIN_PERMS | VDELETE_CHILD | VDELETE))
denied = EPERM;
else
denied = EACCES;
return (denied);
}
/*
* Common routine to check if chmod() is allowed.
*
* Policy:
* - You must own the file, and
* - You must not set the "sticky" bit (meaningless, see chmod(2))
* - You must be a member of the group if you're trying to set the
* SGID bit
*
* vp - vnode of the file-system object
* cred - credentials of the invoker
* cur_uid, cur_gid - current uid/gid of the file-system object
* new_mode - new mode for the file-system object
*
* Returns 0 if the change is allowed, or an error value otherwise.
*/
int
genfs_can_chmod(vnode_t *vp, kauth_cred_t cred, uid_t cur_uid,
gid_t cur_gid, mode_t new_mode)
{
int error;
/*
* To modify the permissions on a file, must possess VADMIN
* for that file.
*/
if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred)) != 0)
return (error);
/*
* Unprivileged users can't set the sticky bit on files.
*/
if ((vp->v_type != VDIR) && (new_mode & S_ISTXT))
return (EFTYPE);
/*
* If the invoker is trying to set the SGID bit on the file,
* check group membership.
*/
if (new_mode & S_ISGID) {
int ismember;
error = kauth_cred_ismember_gid(cred, cur_gid,
&ismember);
if (error || !ismember)
return (EPERM);
}
/*
* Deny setting setuid if we are not the file owner.
*/
if ((new_mode & S_ISUID) && cur_uid != kauth_cred_geteuid(cred))
return (EPERM);
return (0);
}
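/*
 * Illustrative sketch (not part of the original source, and the exact
 * call pattern is an assumption): a file system's chmod path would
 * typically combine genfs_can_chmod() with the kauth(9) vnode scope,
 * passing its verdict as the fall-back decision so a secmodel can
 * override the plain ownership policy.  The uid/gid/mode arguments
 * stand in for whatever the file system stores in its inode.
 */
#if 0
static int
example_chmod_check(vnode_t *vp, kauth_cred_t cred,
    uid_t cur_uid, gid_t cur_gid, mode_t new_mode)
{
return kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY,
    vp, NULL, genfs_can_chmod(vp, cred, cur_uid, cur_gid, new_mode));
}
#endif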
/*
* Common routine to check if chown() is allowed.
*
* Policy:
* - You must own the file, and
* - You must not try to change ownership, and
* - You must be member of the new group
*
* vp - vnode
* cred - credentials of the invoker
* cur_uid, cur_gid - current uid/gid of the file-system object
* new_uid, new_gid - target uid/gid of the file-system object
*
* Returns 0 if the change is allowed, or an error value otherwise.
*/
int
genfs_can_chown(vnode_t *vp, kauth_cred_t cred, uid_t cur_uid,
gid_t cur_gid, uid_t new_uid, gid_t new_gid)
{
int error, ismember;
/*
* To modify the ownership of a file, must possess VADMIN for that
* file.
*/
if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred)) != 0)
return (error);
/*
* You can only change ownership of a file if:
* You own the file and...
*/
if (kauth_cred_geteuid(cred) == cur_uid) {
/*
* You don't try to change ownership, and...
*/
if (new_uid != cur_uid)
return (EPERM);
/*
* You don't try to change group (no-op), or...
*/
if (new_gid == cur_gid)
return (0);
/*
* Your effective gid is the new gid, or...
*/
if (kauth_cred_getegid(cred) == new_gid)
return (0);
/*
* The new gid is one you're a member of.
*/
ismember = 0;
error = kauth_cred_ismember_gid(cred, new_gid,
&ismember);
if (!error && ismember)
return (0);
}
return (EPERM);
}
int
genfs_can_chtimes(vnode_t *vp, kauth_cred_t cred, uid_t owner_uid,
u_int vaflags)
{
int error;
/*
* Grant permission if the caller is the owner of the file, or
* the super-user, or has ACL_WRITE_ATTRIBUTES permission on
* the file. If the time pointer is null, then write
* permission on the file is also sufficient.
*
* From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
* A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
* will be allowed to set the times [..] to the current
* server time.
*/
error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred);
if (error != 0 && (vaflags & VA_UTIMES_NULL) != 0)
error = VOP_ACCESS(vp, VWRITE, cred);
if (error)
return (vaflags & VA_UTIMES_NULL) == 0 ? EPERM : EACCES;
return 0;
}
/*
* Common routine to check if chflags() is allowed.
*
* Policy:
* - You must own the file, and
* - You must not change system flags, and
* - You must not change flags on character/block devices.
*
* vp - vnode
* cred - credentials of the invoker
* owner_uid - uid of the file-system object
* changing_sysflags - true if the invoker wants to change system flags
*/
int
genfs_can_chflags(vnode_t *vp, kauth_cred_t cred,
uid_t owner_uid, bool changing_sysflags)
{
/* The user must own the file. */
if (kauth_cred_geteuid(cred) != owner_uid) {
return EPERM;
}
if (changing_sysflags) {
return EPERM;
}
/*
* Unprivileged users cannot change the flags on devices, even if they
* own them.
*/
if (vp->v_type == VCHR || vp->v_type == VBLK) {
return EPERM;
}
return 0;
}
/*
* Common "sticky" policy.
*
* When a directory is "sticky" (as determined by the caller), this
* function may help implement the following policy:
* - Renaming a file in it is only possible if the user owns the directory
* or the file being renamed.
* - Deleting a file from it is only possible if the user owns the
* directory or the file being deleted.
*/
int
genfs_can_sticky(vnode_t *vp, kauth_cred_t cred, uid_t dir_uid, uid_t file_uid)
{
if (kauth_cred_geteuid(cred) != dir_uid &&
kauth_cred_geteuid(cred) != file_uid)
return EPERM;
return 0;
}
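/*
 * Illustrative sketch, not part of this file: a remove/rename path on a
 * sticky directory might combine genfs_can_sticky() with the
 * KAUTH_VNODE_DELETE action.  example_can_remove() and its dir_uid,
 * file_uid and dir_is_sticky parameters are hypothetical.
 */
#if 0	/* example only */
static int
example_can_remove(vnode_t *dvp, vnode_t *vp, kauth_cred_t cred,
    uid_t dir_uid, uid_t file_uid, bool dir_is_sticky)
{
	int fs_decision = 0;

	if (dir_is_sticky)
		fs_decision = genfs_can_sticky(vp, cred, dir_uid, file_uid);
	return kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, vp, dvp,
	    fs_decision);
}
#endif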
int
genfs_can_extattr(vnode_t *vp, kauth_cred_t cred, accmode_t accmode,
int attrnamespace)
{
/*
	 * Kernel-invoked (NOCRED) requests always succeed.
*/
if (cred == NOCRED)
return 0;
switch (attrnamespace) {
case EXTATTR_NAMESPACE_SYSTEM:
return kauth_authorize_system(cred, KAUTH_SYSTEM_FS_EXTATTR,
0, vp->v_mount, NULL, NULL);
case EXTATTR_NAMESPACE_USER:
return VOP_ACCESS(vp, accmode, cred);
default:
return EPERM;
}
}
int
genfs_access(void *v)
{
struct vop_access_args *ap = v;
KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN |
VAPPEND)) == 0);
return VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred);
}
int
genfs_accessx(void *v)
{
struct vop_accessx_args *ap = v;
int error;
accmode_t accmode = ap->a_accmode;
error = vfs_unixify_accmode(&accmode);
if (error != 0)
return error;
if (accmode == 0)
return 0;
return VOP_ACCESS(ap->a_vp, accmode, ap->a_cred);
}
/*
* genfs_pathconf:
*
* Standard implementation of POSIX pathconf, to get information about limits
* for a filesystem.
* Override per filesystem for the case where the filesystem has smaller
* limits.
*/
int
genfs_pathconf(void *v)
{
struct vop_pathconf_args *ap = v;
switch (ap->a_name) {
case _PC_PATH_MAX:
*ap->a_retval = PATH_MAX;
return 0;
case _PC_ACL_EXTENDED:
case _PC_ACL_NFS4:
*ap->a_retval = 0;
return 0;
default:
return EINVAL;
}
}
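/*
 * Illustrative sketch, not part of this file: the genfs helpers above are
 * meant to be wired into a file system's vnode operations vector.  A file
 * system that implements VOP_ACCESS natively can use genfs_accessx() to
 * map VOP_ACCESSX onto it (or the reverse with genfs_access()), and
 * genfs_pathconf() to provide the generic limits.  The examplefs_* names
 * are hypothetical.
 */
#if 0	/* example only */
int (**examplefs_vnodeop_p)(void *);
const struct vnodeopv_entry_desc examplefs_vnodeop_entries[] = {
	{ &vop_default_desc, vn_default_error },
	{ &vop_access_desc, examplefs_access },	/* fs's own check */
	{ &vop_accessx_desc, genfs_accessx },	/* map onto VOP_ACCESS */
	{ &vop_pathconf_desc, genfs_pathconf },	/* generic limits */
	{ NULL, NULL }
};
const struct vnodeopv_desc examplefs_vnodeop_opv_desc =
	{ &examplefs_vnodeop_p, examplefs_vnodeop_entries };
#endif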
/* $NetBSD: uvm_fault_i.h,v 1.33 2020/02/23 15:46:43 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_fault_i.h,v 1.1.6.1 1997/12/08 16:07:12 chuck Exp
*/
#ifndef _UVM_UVM_FAULT_I_H_
#define _UVM_UVM_FAULT_I_H_
/*
* uvm_fault_i.h: fault inline functions
*/
void uvmfault_update_stats(struct uvm_faultinfo *);
/*
* uvmfault_unlockmaps: unlock the maps
*/
static __inline void
uvmfault_unlockmaps(struct uvm_faultinfo *ufi, bool write_locked)
{
/*
* ufi can be NULL when this isn't really a fault,
* but merely paging in anon data.
*/
if (ufi == NULL) {
return;
}
#ifndef __HAVE_NO_PMAP_STATS
uvmfault_update_stats(ufi);
#endif
if (write_locked) {
vm_map_unlock(ufi->map);
} else {
vm_map_unlock_read(ufi->map);
}
}
/*
* uvmfault_unlockall: unlock everything passed in.
*
* => maps must be read-locked (not write-locked).
*/
static __inline void
uvmfault_unlockall(struct uvm_faultinfo *ufi, struct vm_amap *amap,
struct uvm_object *uobj)
{
	if (uobj)
		rw_exit(uobj->vmobjlock);
	if (amap)
		amap_unlock(amap);
	uvmfault_unlockmaps(ufi, false);
}
/*
* uvmfault_lookup: lookup a virtual address in a map
*
* => caller must provide a uvm_faultinfo structure with the IN
* params properly filled in
* => we will lookup the map entry (handling submaps) as we go
* => if the lookup is a success we will return with the maps locked
* => if "write_lock" is true, we write_lock the map, otherwise we only
* get a read lock.
* => note that submaps can only appear in the kernel and they are
* required to use the same virtual addresses as the map they
* are referenced by (thus address translation between the main
* map and the submap is unnecessary).
*/
static __inline bool
uvmfault_lookup(struct uvm_faultinfo *ufi, bool write_lock)
{
struct vm_map *tmpmap;
/*
* init ufi values for lookup.
*/
ufi->map = ufi->orig_map;
ufi->size = ufi->orig_size;
/*
* keep going down levels until we are done. note that there can
* only be two levels so we won't loop very long.
*/
for (;;) {
/*
* lock map
*/
if (write_lock) {
vm_map_lock(ufi->map);
} else {
vm_map_lock_read(ufi->map);
}
/*
* lookup
*/
if (!uvm_map_lookup_entry(ufi->map, ufi->orig_rvaddr,
&ufi->entry)) {
uvmfault_unlockmaps(ufi, write_lock);
return(false);
}
/*
* reduce size if necessary
*/
		if (ufi->entry->end - ufi->orig_rvaddr < ufi->size)
			ufi->size = ufi->entry->end - ufi->orig_rvaddr;
/*
* submap? replace map with the submap and lookup again.
* note: VAs in submaps must match VAs in main map.
*/
if (UVM_ET_ISSUBMAP(ufi->entry)) {
tmpmap = ufi->entry->object.sub_map;
if (write_lock) {
vm_map_unlock(ufi->map);
} else {
vm_map_unlock_read(ufi->map);
}
ufi->map = tmpmap;
continue;
}
/*
* got it!
*/
ufi->mapv = ufi->map->timestamp;
return(true);
} /* while loop */
/*NOTREACHED*/
}
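/*
 * Illustrative sketch, not part of this file: the caller fills in only the
 * IN parameters of struct uvm_faultinfo before uvmfault_lookup(), roughly
 * as uvm_fault() does.  "map" and "vaddr" are hypothetical locals.
 */
#if 0	/* example only */
	struct uvm_faultinfo ufi;

	ufi.orig_map = map;
	ufi.orig_rvaddr = trunc_page(vaddr);
	ufi.orig_size = PAGE_SIZE;

	if (uvmfault_lookup(&ufi, false) == false)
		return EFAULT;		/* no entry maps the address */

	/* ufi.map is now read-locked; ufi.entry and ufi.size are valid. */

	uvmfault_unlockmaps(&ufi, false);
#endif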
/*
* uvmfault_relock: attempt to relock the same version of the map
*
* => fault data structures should be unlocked before calling.
 * => on success (true), the maps will be locked after the call.
*/
static __inline bool
uvmfault_relock(struct uvm_faultinfo *ufi)
{
/*
* ufi can be NULL when this isn't really a fault,
* but merely paging in anon data.
*/
if (ufi == NULL) {
return true;
}
cpu_count(CPU_COUNT_FLTRELCK, 1);
/*
* relock map. fail if version mismatch (in which case nothing
* gets locked).
*/
vm_map_lock_read(ufi->map);
if (ufi->mapv != ufi->map->timestamp) {
vm_map_unlock_read(ufi->map);
return(false);
}
cpu_count(CPU_COUNT_FLTRELCKOK, 1);
return(true);
}
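/*
 * Illustrative sketch, not part of this file: the usual pattern is to drop
 * all fault locks around a sleeping operation and then relock, restarting
 * the fault if the map version changed.  "amap" and "uobj" are
 * hypothetical locals.
 */
#if 0	/* example only */
	uvmfault_unlockall(ufi, amap, uobj);

	/* ... perform I/O or wait for memory here ... */

	if (uvmfault_relock(ufi) == false)
		return ERESTART;	/* map changed; restart the fault */
#endif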
#endif /* _UVM_UVM_FAULT_I_H_ */
/* $NetBSD: uipc_usrreq.c,v 1.203 2022/05/28 22:08:46 andvar Exp $ */
/*-
* Copyright (c) 1998, 2000, 2004, 2008, 2009, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
*/
/*
* Copyright (c) 1997 Christopher G. Demetriou. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.203 2022/05/28 22:08:46 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/unpcb.h>
#include <sys/un.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/mbuf.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/uidinfo.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/compat_stub.h>
#include <compat/sys/socket.h>
#include <compat/net/route_70.h>
/*
* Unix communications domain.
*
* TODO:
* RDM
* rethink name space problems
* need a proper out-of-band
*
* Notes on locking:
*
* The generic rules noted in uipc_socket2.c apply. In addition:
*
* o We have a global lock, uipc_lock.
*
* o All datagram sockets are locked by uipc_lock.
*
* o For stream socketpairs, the two endpoints are created sharing the same
* independent lock. Sockets presented to PRU_CONNECT2 must already have
* matching locks.
*
* o Stream sockets created via socket() start life with their own
* independent lock.
*
* o Stream connections to a named endpoint are slightly more complicated.
* Sockets that have called listen() have their lock pointer mutated to
* the global uipc_lock. When establishing a connection, the connecting
* socket also has its lock mutated to uipc_lock, which matches the head
* (listening socket). We create a new socket for accept() to return, and
* that also shares the head's lock. Until the connection is completely
* done on both ends, all three sockets are locked by uipc_lock. Once the
* connection is complete, the association with the head's lock is broken.
* The connecting socket and the socket returned from accept() have their
* lock pointers mutated away from uipc_lock, and back to the connecting
* socket's original, independent lock. The head continues to be locked
* by uipc_lock.
*
* o If uipc_lock is determined to be a significant source of contention,
* it could easily be hashed out. It is difficult to simply make it an
* independent lock because of visibility / garbage collection issues:
* if a socket has been associated with a lock at any point, that lock
* must remain valid until the socket is no longer visible in the system.
* The lock must not be freed or otherwise destroyed until any sockets
* that had referenced it have also been destroyed.
*/
const struct sockaddr_un sun_noname = {
.sun_len = offsetof(struct sockaddr_un, sun_path),
.sun_family = AF_LOCAL,
};
ino_t unp_ino; /* prototype for fake inode numbers */
static struct mbuf * unp_addsockcred(struct lwp *, struct mbuf *);
static void unp_discard_later(file_t *);
static void unp_discard_now(file_t *);
static void unp_disconnect1(struct unpcb *);
static bool unp_drop(struct unpcb *, int);
static int unp_internalize(struct mbuf **);
static void unp_mark(file_t *);
static void unp_scan(struct mbuf *, void (*)(file_t *), int);
static void unp_shutdown1(struct unpcb *);
static void unp_thread(void *);
static void unp_thread_kick(void);
static kmutex_t *uipc_lock;
static kcondvar_t unp_thread_cv;
static lwp_t *unp_thread_lwp;
static SLIST_HEAD(,file) unp_thread_discard;
static int unp_defer;
static struct sysctllog *usrreq_sysctllog;
static void unp_sysctl_create(void);
/* Compat interface */
struct mbuf * stub_compat_70_unp_addsockcred(lwp_t *, struct mbuf *);
struct mbuf * stub_compat_70_unp_addsockcred(struct lwp *lwp,
struct mbuf *control)
{
/* just copy our initial argument */
return control;
}
bool compat70_ocreds_valid = false;
/*
* Initialize Unix protocols.
*/
void
uipc_init(void)
{
int error;
unp_sysctl_create();
uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
cv_init(&unp_thread_cv, "unpgc");
error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, unp_thread,
NULL, &unp_thread_lwp, "unpgc");
if (error != 0)
panic("uipc_init %d", error);
}
static void
unp_connid(struct lwp *l, struct unpcb *unp, int flags)
{
unp->unp_connid.unp_pid = l->l_proc->p_pid;
unp->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
unp->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
unp->unp_flags |= flags;
}
/*
* A connection succeeded: disassociate both endpoints from the head's
* lock, and make them share their own lock. There is a race here: for
* a very brief time one endpoint will be locked by a different lock
* than the other end. However, since the current thread holds the old
* lock (the listening socket's lock, the head) access can still only be
* made to one side of the connection.
*/
static void
unp_setpeerlocks(struct socket *so, struct socket *so2)
{
struct unpcb *unp;
kmutex_t *lock;
KASSERT(solocked2(so, so2));
/*
* Bail out if either end of the socket is not yet fully
* connected or accepted. We only break the lock association
* with the head when the pair of sockets stand completely
* on their own.
*/
KASSERT(so->so_head == NULL);
if (so2->so_head != NULL)
return;
/*
* Drop references to old lock. A third reference (from the
* queue head) must be held as we still hold its lock. Bonus:
* we don't need to worry about garbage collecting the lock.
*/
lock = so->so_lock;
KASSERT(lock == uipc_lock);
mutex_obj_free(lock);
mutex_obj_free(lock);
/*
* Grab stream lock from the initiator and share between the two
* endpoints. Issue memory barrier to ensure all modifications
* become globally visible before the lock change. so2 is
* assumed not to have a stream lock, because it was created
* purely for the server side to accept this connection and
* started out life using the domain-wide lock.
*/
unp = sotounpcb(so);
KASSERT(unp->unp_streamlock != NULL);
KASSERT(sotounpcb(so2)->unp_streamlock == NULL);
lock = unp->unp_streamlock;
unp->unp_streamlock = NULL;
mutex_obj_hold(lock);
/*
* Ensure lock is initialized before publishing it with
* solockreset. Pairs with atomic_load_consume in solock and
* various loops to reacquire lock after wakeup.
*/
membar_release();
/*
* possible race if lock is not held - see comment in
* uipc_usrreq(PRU_ACCEPT).
*/
KASSERT(mutex_owned(lock));
solockreset(so, lock);
solockreset(so2, lock);
}
/*
* Reset a socket's lock back to the domain-wide lock.
*/
static void
unp_resetlock(struct socket *so)
{
kmutex_t *olock, *nlock;
struct unpcb *unp;
KASSERT(solocked(so));
olock = so->so_lock;
nlock = uipc_lock;
if (olock == nlock)
return;
unp = sotounpcb(so);
KASSERT(unp->unp_streamlock == NULL);
unp->unp_streamlock = olock;
mutex_obj_hold(nlock);
mutex_enter(nlock);
solockreset(so, nlock);
mutex_exit(olock);
}
static void
unp_free(struct unpcb *unp)
{
	if (unp->unp_addr)
		free(unp->unp_addr, M_SONAME);
	if (unp->unp_streamlock != NULL)
		mutex_obj_free(unp->unp_streamlock);
kmem_free(unp, sizeof(*unp));
}
static int
unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp)
{
struct socket *so2;
const struct sockaddr_un *sun;
/* XXX: server side closed the socket */
if (unp->unp_conn == NULL)
return ECONNREFUSED;
so2 = unp->unp_conn->unp_socket;
KASSERT(solocked(so2));
if (unp->unp_addr)
sun = unp->unp_addr;
else
sun = &sun_noname;
	if (unp->unp_conn->unp_flags & UNP_WANTCRED)
		control = unp_addsockcred(curlwp, control);
	if (unp->unp_conn->unp_flags & UNP_OWANTCRED)
		MODULE_HOOK_CALL(uipc_unp_70_hook, (curlwp, control),
		    stub_compat_70_unp_addsockcred(curlwp, control), control);
if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m,
control) == 0) {
unp_dispose(control);
m_freem(control);
m_freem(m);
/* Don't call soroverflow because we're returning this
* error directly to the sender. */
so2->so_rcv.sb_overflowed++;
return ENOBUFS;
} else {
sorwakeup(so2);
return 0;
}
}
static void
unp_setaddr(struct socket *so, struct sockaddr *nam, bool peeraddr)
{
const struct sockaddr_un *sun = NULL;
struct unpcb *unp;
KASSERT(solocked(so));
unp = sotounpcb(so);
if (peeraddr) {
if (unp->unp_conn && unp->unp_conn->unp_addr)
sun = unp->unp_conn->unp_addr;
} else {
if (unp->unp_addr)
sun = unp->unp_addr;
}
if (sun == NULL)
sun = &sun_noname;
memcpy(nam, sun, sun->sun_len);
}
static int
unp_rcvd(struct socket *so, int flags, struct lwp *l)
{
struct unpcb *unp = sotounpcb(so);
struct socket *so2;
u_int newhiwat;
KASSERT(solocked(so));
KASSERT(unp != NULL);
switch (so->so_type) {
case SOCK_DGRAM:
panic("uipc 1");
/*NOTREACHED*/
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
#define rcv (&so->so_rcv)
#define snd (&so2->so_snd)
if (unp->unp_conn == 0)
break;
so2 = unp->unp_conn->unp_socket;
KASSERT(solocked2(so, so2));
/*
* Adjust backpressure on sender
* and wakeup any waiting to write.
*/
snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
unp->unp_mbcnt = rcv->sb_mbcnt;
newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc;
(void)chgsbsize(so2->so_uidinfo,
&snd->sb_hiwat, newhiwat, RLIM_INFINITY);
unp->unp_cc = rcv->sb_cc;
sowwakeup(so2);
#undef snd
#undef rcv
break;
default:
panic("uipc 2");
}
return 0;
}
static int
unp_recvoob(struct socket *so, struct mbuf *m, int flags)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
unp_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct lwp *l)
{
struct unpcb *unp = sotounpcb(so);
int error = 0;
u_int newhiwat;
struct socket *so2;
	KASSERT(solocked(so));
	KASSERT(unp != NULL);
	KASSERT(m != NULL);
/*
* Note: unp_internalize() rejects any control message
* other than SCM_RIGHTS, and only allows one. This
* has the side-effect of preventing a caller from
* forging SCM_CREDS.
*/
if (control) {
sounlock(so);
		error = unp_internalize(&control);
		solock(so);
		if (error != 0) {
m_freem(control);
m_freem(m);
return error;
}
}
switch (so->so_type) {
case SOCK_DGRAM: {
KASSERT(so->so_lock == uipc_lock);
if (nam) {
if ((so->so_state & SS_ISCONNECTED) != 0)
error = EISCONN;
else {
/*
* Note: once connected, the
* socket's lock must not be
* dropped until we have sent
* the message and disconnected.
* This is necessary to prevent
* intervening control ops, like
* another connection.
*/
error = unp_connect(so, nam, l);
}
} else {
if ((so->so_state & SS_ISCONNECTED) == 0)
error = ENOTCONN;
}
if (error) {
unp_dispose(control);
m_freem(control);
m_freem(m);
return error;
}
		error = unp_output(m, control, unp);
		if (nam)
			unp_disconnect1(unp);
break;
}
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
#define rcv (&so2->so_rcv)
#define snd (&so->so_snd)
if (unp->unp_conn == NULL) {
error = ENOTCONN;
break;
}
so2 = unp->unp_conn->unp_socket;
		KASSERT(solocked2(so, so2));
		if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
/*
* Credentials are passed only once on
* SOCK_STREAM and SOCK_SEQPACKET.
*/
unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
control = unp_addsockcred(l, control);
}
if (unp->unp_conn->unp_flags & UNP_OWANTCRED) {
/*
* Credentials are passed only once on
* SOCK_STREAM and SOCK_SEQPACKET.
*/
unp->unp_conn->unp_flags &= ~UNP_OWANTCRED;
MODULE_HOOK_CALL(uipc_unp_70_hook, (curlwp, control),
stub_compat_70_unp_addsockcred(curlwp, control),
control);
}
/*
* Send to paired receive port, and then reduce
* send buffer hiwater marks to maintain backpressure.
* Wake up readers.
*/
if (control) {
if (sbappendcontrol(rcv, m, control) != 0)
control = NULL;
} else {
switch(so->so_type) {
case SOCK_SEQPACKET:
sbappendrecord(rcv, m);
break;
case SOCK_STREAM:
sbappend(rcv, m);
break;
default:
panic("uipc_usrreq");
break;
}
}
snd->sb_mbmax -=
rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
newhiwat = snd->sb_hiwat -
(rcv->sb_cc - unp->unp_conn->unp_cc);
(void)chgsbsize(so->so_uidinfo,
&snd->sb_hiwat, newhiwat, RLIM_INFINITY);
unp->unp_conn->unp_cc = rcv->sb_cc;
sorwakeup(so2);
#undef snd
#undef rcv
		if (control != NULL) {
			unp_dispose(control);
m_freem(control);
}
break;
default:
panic("uipc 4");
}
return error;
}
static int
unp_sendoob(struct socket *so, struct mbuf *m, struct mbuf * control)
{
KASSERT(solocked(so));
m_freem(m);
m_freem(control);
return EOPNOTSUPP;
}
/*
* Unix domain socket option processing.
*/
int
uipc_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
struct unpcb *unp = sotounpcb(so);
int optval = 0, error = 0;
KASSERT(solocked(so));
if (sopt->sopt_level != SOL_LOCAL) {
error = ENOPROTOOPT;
} else switch (op) {
case PRCO_SETOPT:
switch (sopt->sopt_name) {
case LOCAL_OCREDS:
if (!compat70_ocreds_valid) {
error = ENOPROTOOPT;
break;
}
/* FALLTHROUGH */
case LOCAL_CREDS:
case LOCAL_CONNWAIT:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (sopt->sopt_name) {
#define OPTSET(bit) \
if (optval) \
unp->unp_flags |= (bit); \
else \
unp->unp_flags &= ~(bit);
case LOCAL_CREDS:
OPTSET(UNP_WANTCRED);
break;
case LOCAL_CONNWAIT:
OPTSET(UNP_CONNWAIT);
break;
case LOCAL_OCREDS:
OPTSET(UNP_OWANTCRED);
break;
}
break;
#undef OPTSET
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
sounlock(so);
switch (sopt->sopt_name) {
case LOCAL_PEEREID:
if (unp->unp_flags & UNP_EIDSVALID) {
error = sockopt_set(sopt, &unp->unp_connid,
sizeof(unp->unp_connid));
} else {
error = EINVAL;
}
break;
case LOCAL_CREDS:
#define OPTBIT(bit) (unp->unp_flags & (bit) ? 1 : 0)
optval = OPTBIT(UNP_WANTCRED);
error = sockopt_setint(sopt, optval);
break;
case LOCAL_OCREDS:
if (compat70_ocreds_valid) {
optval = OPTBIT(UNP_OWANTCRED);
error = sockopt_setint(sopt, optval);
break;
}
#undef OPTBIT
/* FALLTHROUGH */
default:
error = ENOPROTOOPT;
break;
}
solock(so);
break;
}
return (error);
}
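/*
 * Userland sketch of the LOCAL_CREDS option handled above, assuming the
 * SOL_LOCAL and LOCAL_CREDS definitions from <sys/un.h>: the receiver
 * enables the option and the kernel then attaches an SCM_CREDS control
 * message (built by unp_addsockcred() below) to incoming data.
 */
#if 0	/* userland example only */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdio.h>

static void
want_peer_creds(int s)
{
	int on = 1;

	if (setsockopt(s, SOL_LOCAL, LOCAL_CREDS, &on, sizeof(on)) == -1)
		perror("setsockopt(LOCAL_CREDS)");
}
#endif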
/*
* Both send and receive buffers are allocated PIPSIZ bytes of buffering
* for stream sockets, although the total for sender and receiver is
* actually only PIPSIZ.
* Datagram sockets really use the sendspace as the maximum datagram size,
* and don't really want to reserve the sendspace. Their recvspace should
* be large enough for at least one max-size datagram plus address.
*/
#ifndef PIPSIZ
#define PIPSIZ 8192
#endif
u_long unpst_sendspace = PIPSIZ;
u_long unpst_recvspace = PIPSIZ;
u_long unpdg_sendspace = 2*1024; /* really max datagram size */
u_long unpdg_recvspace = 16*1024;
u_int unp_rights; /* files in flight */
u_int unp_rights_ratio = 2; /* limit, fraction of maxfiles */
static int
unp_attach(struct socket *so, int proto)
{
struct unpcb *unp = sotounpcb(so);
u_long sndspc, rcvspc;
int error;
	KASSERT(unp == NULL);

	switch (so->so_type) {
case SOCK_SEQPACKET:
/* FALLTHROUGH */
case SOCK_STREAM:
if (so->so_lock == NULL) {
so->so_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
solock(so);
}
sndspc = unpst_sendspace;
rcvspc = unpst_recvspace;
break;
case SOCK_DGRAM:
if (so->so_lock == NULL) {
mutex_obj_hold(uipc_lock);
so->so_lock = uipc_lock;
solock(so);
}
sndspc = unpdg_sendspace;
rcvspc = unpdg_recvspace;
break;
default:
panic("unp_attach");
}
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
error = soreserve(so, sndspc, rcvspc);
if (error) {
return error;
}
}
unp = kmem_zalloc(sizeof(*unp), KM_SLEEP);
nanotime(&unp->unp_ctime);
unp->unp_socket = so;
so->so_pcb = unp;
KASSERT(solocked(so));
return 0;
}
static void
unp_detach(struct socket *so)
{
struct unpcb *unp;
vnode_t *vp;
unp = sotounpcb(so);
	KASSERT(unp != NULL);
	KASSERT(solocked(so));
retry:
if ((vp = unp->unp_vnode) != NULL) {
sounlock(so);
/* Acquire v_interlock to protect against unp_connect(). */
/* XXXAD racy */
mutex_enter(vp->v_interlock);
vp->v_socket = NULL;
mutex_exit(vp->v_interlock);
vrele(vp);
solock(so);
unp->unp_vnode = NULL;
}
	if (unp->unp_conn)
		unp_disconnect1(unp);
while (unp->unp_refs) {
		KASSERT(solocked2(so, unp->unp_refs->unp_socket));
		if (unp_drop(unp->unp_refs, ECONNRESET)) {
			solock(so);
goto retry;
}
}
soisdisconnected(so);
so->so_pcb = NULL;
if (unp_rights) {
/*
* Normally the receive buffer is flushed later, in sofree,
* but if our receive buffer holds references to files that
* are now garbage, we will enqueue those file references to
* the garbage collector and kick it into action.
*/
sorflush(so);
		unp_free(unp);
		unp_thread_kick();
} else
unp_free(unp);
}
static int
unp_accept(struct socket *so, struct sockaddr *nam)
{
struct unpcb *unp = sotounpcb(so);
struct socket *so2;
KASSERT(solocked(so));
KASSERT(nam != NULL);
/* XXX code review required to determine if unp can ever be NULL */
if (unp == NULL)
return EINVAL;
KASSERT(so->so_lock == uipc_lock);
/*
* Mark the initiating STREAM socket as connected *ONLY*
* after it's been accepted. This prevents a client from
* overrunning a server and receiving ECONNREFUSED.
*/
if (unp->unp_conn == NULL) {
/*
* This will use the empty socket and will not
* allocate.
*/
unp_setaddr(so, nam, true);
return 0;
}
so2 = unp->unp_conn->unp_socket;
if (so2->so_state & SS_ISCONNECTING) {
KASSERT(solocked2(so, so->so_head));
KASSERT(solocked2(so2, so->so_head));
soisconnected(so2);
}
/*
* If the connection is fully established, break the
* association with uipc_lock and give the connected
* pair a separate lock to share.
* There is a race here: sotounpcb(so2)->unp_streamlock
* is not locked, so when changing so2->so_lock
* another thread can grab it while so->so_lock is still
* pointing to the (locked) uipc_lock.
	 * This should be harmless, except that it makes
	 * solocked2() and solocked() unreliable.
	 * Another problem is that unp_setaddr() expects the
	 * socket locked. Grabbing sotounpcb(so2)->unp_streamlock
* fixes both issues.
*/
mutex_enter(sotounpcb(so2)->unp_streamlock);
unp_setpeerlocks(so2, so);
/*
* Only now return peer's address, as we may need to
* block in order to allocate memory.
*
* XXX Minor race: connection can be broken while
* lock is dropped in unp_setaddr(). We will return
* error == 0 and sun_noname as the peer address.
*/
unp_setaddr(so, nam, true);
/* so_lock now points to unp_streamlock */
mutex_exit(so2->so_lock);
return 0;
}
static int
unp_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp)
{
return EOPNOTSUPP;
}
static int
unp_stat(struct socket *so, struct stat *ub)
{
struct unpcb *unp;
struct socket *so2;
KASSERT(solocked(so));
unp = sotounpcb(so);
if (unp == NULL)
return EINVAL;
ub->st_blksize = so->so_snd.sb_hiwat;
switch (so->so_type) {
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
if (unp->unp_conn == 0)
break;
so2 = unp->unp_conn->unp_socket;
KASSERT(solocked2(so, so2));
ub->st_blksize += so2->so_rcv.sb_cc;
break;
default:
break;
}
ub->st_dev = NODEV;
if (unp->unp_ino == 0)
unp->unp_ino = unp_ino++;
ub->st_atimespec = ub->st_mtimespec = ub->st_ctimespec = unp->unp_ctime;
ub->st_ino = unp->unp_ino;
ub->st_uid = so->so_uidinfo->ui_uid;
ub->st_gid = so->so_egid;
return (0);
}
static int
unp_peeraddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
KASSERT(nam != NULL);
unp_setaddr(so, nam, true);
return 0;
}
static int
unp_sockaddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
KASSERT(nam != NULL);
unp_setaddr(so, nam, false);
return 0;
}
/*
* we only need to perform this allocation until syscalls other than
* bind are adjusted to use sockaddr_big.
*/
static struct sockaddr_un *
makeun_sb(struct sockaddr *nam, size_t *addrlen)
{
struct sockaddr_un *sun;
*addrlen = nam->sa_len + 1;
sun = malloc(*addrlen, M_SONAME, M_WAITOK);
memcpy(sun, nam, nam->sa_len);
*(((char *)sun) + nam->sa_len) = '\0';
return sun;
}
static int
unp_bind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct sockaddr_un *sun;
struct unpcb *unp;
vnode_t *vp;
struct vattr vattr;
size_t addrlen;
int error;
struct pathbuf *pb;
struct nameidata nd;
proc_t *p;
unp = sotounpcb(so);
	KASSERT(solocked(so));
	KASSERT(unp != NULL);
	KASSERT(nam != NULL);

	if (unp->unp_vnode != NULL)
return (EINVAL);
if ((unp->unp_flags & UNP_BUSY) != 0) {
/*
* EALREADY may not be strictly accurate, but since this
* is a major application error it's hardly a big deal.
*/
return (EALREADY);
}
unp->unp_flags |= UNP_BUSY;
sounlock(so);
p = l->l_proc;
sun = makeun_sb(nam, &addrlen);
pb = pathbuf_create(sun->sun_path);
if (pb == NULL) {
error = ENOMEM;
goto bad;
}
NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT | TRYEMULROOT, pb);
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
goto bad;
}
vp = nd.ni_vp;
if (vp != NULL) {
VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
if (nd.ni_dvp == vp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(vp);
pathbuf_destroy(pb);
error = EADDRINUSE;
goto bad;
}
vattr_null(&vattr);
vattr.va_type = VSOCK;
vattr.va_mode = ACCESSPERMS & ~(p->p_cwdi->cwdi_cmask);
error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
if (error) {
vput(nd.ni_dvp);
pathbuf_destroy(pb);
goto bad;
}
vp = nd.ni_vp;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
solock(so);
vp->v_socket = unp->unp_socket;
unp->unp_vnode = vp;
unp->unp_addrlen = addrlen;
unp->unp_addr = sun;
VOP_UNLOCK(vp);
vput(nd.ni_dvp);
unp->unp_flags &= ~UNP_BUSY;
pathbuf_destroy(pb);
return (0);
bad:
free(sun, M_SONAME);
solock(so);
unp->unp_flags &= ~UNP_BUSY;
return (error);
}
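/*
 * Userland counterpart of unp_bind(), as a minimal sketch: the path in
 * sun_path becomes a VSOCK node on the file system, and an existing node
 * yields EADDRINUSE, which is why servers usually unlink() first.  Error
 * handling is trimmed and the path is a placeholder.
 */
#if 0	/* userland example only */
#include <sys/socket.h>
#include <sys/un.h>
#include <string.h>
#include <unistd.h>

static int
listen_on(const char *path)
{
	struct sockaddr_un sun;
	int s;

	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1)
		return -1;
	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_LOCAL;
	strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
	(void)unlink(path);
	if (bind(s, (struct sockaddr *)&sun, sizeof(sun)) == -1 ||
	    listen(s, 5) == -1) {
		close(s);
		return -1;
	}
	return s;
}
#endif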
static int
unp_listen(struct socket *so, struct lwp *l)
{
struct unpcb *unp = sotounpcb(so);
	KASSERT(solocked(so));
	KASSERT(unp != NULL);
/*
* If the socket can accept a connection, it must be
* locked by uipc_lock.
*/
unp_resetlock(so);
if (unp->unp_vnode == NULL)
return EINVAL;
unp_connid(l, unp, UNP_EIDSBIND);
return 0;
}
static int
unp_disconnect(struct socket *so)
{
	KASSERT(solocked(so));
	KASSERT(sotounpcb(so) != NULL);
unp_disconnect1(sotounpcb(so));
return 0;
}
static int
unp_shutdown(struct socket *so)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
socantsendmore(so);
unp_shutdown1(sotounpcb(so));
return 0;
}
static int
unp_abort(struct socket *so)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
(void)unp_drop(sotounpcb(so), ECONNABORTED);
KASSERT(so->so_head == NULL);
KASSERT(so->so_pcb != NULL);
unp_detach(so);
return 0;
}
static int
unp_connect1(struct socket *so, struct socket *so2, struct lwp *l)
{
struct unpcb *unp = sotounpcb(so);
struct unpcb *unp2;
if (so2->so_type != so->so_type)
return EPROTOTYPE;
/*
* All three sockets involved must be locked by same lock:
*
* local endpoint (so)
* remote endpoint (so2)
* queue head (so2->so_head, only if PR_CONNREQUIRED)
*/
	KASSERT(solocked2(so, so2));
	KASSERT(so->so_head == NULL);
	if (so2->so_head != NULL) {
		KASSERT(so2->so_lock == uipc_lock);
		KASSERT(solocked2(so2, so2->so_head));
}
unp2 = sotounpcb(so2);
unp->unp_conn = unp2;
switch (so->so_type) {
case SOCK_DGRAM:
unp->unp_nextref = unp2->unp_refs;
unp2->unp_refs = unp;
soisconnected(so);
break;
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
/*
* SOCK_SEQPACKET and SOCK_STREAM cases are handled by callers
* which are unp_connect() or unp_connect2().
*/
break;
default:
panic("unp_connect1");
}
return 0;
}
int
unp_connect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct sockaddr_un *sun;
vnode_t *vp;
struct socket *so2, *so3;
struct unpcb *unp, *unp2, *unp3;
size_t addrlen;
int error;
struct pathbuf *pb;
struct nameidata nd;
unp = sotounpcb(so);
if ((unp->unp_flags & UNP_BUSY) != 0) {
/*
* EALREADY may not be strictly accurate, but since this
* is a major application error it's hardly a big deal.
*/
return (EALREADY);
}
unp->unp_flags |= UNP_BUSY;
sounlock(so);
sun = makeun_sb(nam, &addrlen);
pb = pathbuf_create(sun->sun_path);
if (pb == NULL) {
error = ENOMEM;
goto bad2;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
goto bad2;
}
vp = nd.ni_vp;
pathbuf_destroy(pb);
if (vp->v_type != VSOCK) {
error = ENOTSOCK;
goto bad;
}
if ((error = VOP_ACCESS(vp, VWRITE, l->l_cred)) != 0)
goto bad;
/* Acquire v_interlock to protect against unp_detach(). */
mutex_enter(vp->v_interlock);
so2 = vp->v_socket;
if (so2 == NULL) {
mutex_exit(vp->v_interlock);
error = ECONNREFUSED;
goto bad;
}
if (so->so_type != so2->so_type) {
mutex_exit(vp->v_interlock);
error = EPROTOTYPE;
goto bad;
}
solock(so);
unp_resetlock(so);
mutex_exit(vp->v_interlock);
if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
/*
* This may seem somewhat fragile but is OK: if we can
* see SO_ACCEPTCONN set on the endpoint, then it must
* be locked by the domain-wide uipc_lock.
*/
KASSERT((so2->so_options & SO_ACCEPTCONN) == 0 ||
so2->so_lock == uipc_lock);
if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
(so3 = sonewconn(so2, false)) == NULL) {
error = ECONNREFUSED;
sounlock(so);
goto bad;
}
unp2 = sotounpcb(so2);
unp3 = sotounpcb(so3);
if (unp2->unp_addr) {
unp3->unp_addr = malloc(unp2->unp_addrlen,
M_SONAME, M_WAITOK);
memcpy(unp3->unp_addr, unp2->unp_addr,
unp2->unp_addrlen);
unp3->unp_addrlen = unp2->unp_addrlen;
}
unp3->unp_flags = unp2->unp_flags;
so2 = so3;
/*
* The connector's (client's) credentials are copied from its
* process structure at the time of connect() (which is now).
*/
unp_connid(l, unp3, UNP_EIDSVALID);
/*
* The receiver's (server's) credentials are copied from the
* unp_peercred member of socket on which the former called
* listen(); unp_listen() cached that process's credentials
* at that time so we can use them now.
*/
if (unp2->unp_flags & UNP_EIDSBIND) {
memcpy(&unp->unp_connid, &unp2->unp_connid,
sizeof(unp->unp_connid));
unp->unp_flags |= UNP_EIDSVALID;
}
}
error = unp_connect1(so, so2, l);
if (error) {
sounlock(so);
goto bad;
}
unp2 = sotounpcb(so2);
switch (so->so_type) {
/*
* SOCK_DGRAM and default cases are handled in prior call to
* unp_connect1(), do not add a default case without fixing
* unp_connect1().
*/
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
unp2->unp_conn = unp;
if ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)
soisconnecting(so);
else
soisconnected(so);
soisconnected(so2);
/*
* If the connection is fully established, break the
* association with uipc_lock and give the connected
* pair a separate lock to share.
*/
KASSERT(so2->so_head != NULL);
unp_setpeerlocks(so, so2);
break;
}
sounlock(so);
bad:
vput(vp);
bad2:
free(sun, M_SONAME);
solock(so);
unp->unp_flags &= ~UNP_BUSY;
return (error);
}
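/*
 * Userland counterpart of unp_connect(), as a fragment: namei() resolves
 * the path to the socket node and VOP_ACCESS(VWRITE) must pass, so the
 * caller needs write permission on the socket file.  The path is a
 * placeholder and error handling is trimmed.
 */
#if 0	/* userland example only */
	struct sockaddr_un sun;
	int s = socket(AF_LOCAL, SOCK_STREAM, 0);

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_LOCAL;
	strlcpy(sun.sun_path, "/tmp/example.sock", sizeof(sun.sun_path));
	if (connect(s, (struct sockaddr *)&sun, sizeof(sun)) == -1)
		warn("connect");	/* e.g. ECONNREFUSED, EPROTOTYPE */
#endif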
int
unp_connect2(struct socket *so, struct socket *so2)
{
struct unpcb *unp = sotounpcb(so);
struct unpcb *unp2;
int error = 0;
KASSERT(solocked2(so, so2));
error = unp_connect1(so, so2, curlwp);
if (error)
return error;
unp2 = sotounpcb(so2);
switch (so->so_type) {
/*
* SOCK_DGRAM and default cases are handled in prior call to
* unp_connect1(), do not add a default case without fixing
* unp_connect1().
*/
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
unp2->unp_conn = unp;
soisconnected(so);
soisconnected(so2);
break;
}
return error;
}
static void
unp_disconnect1(struct unpcb *unp)
{
struct unpcb *unp2 = unp->unp_conn;
struct socket *so;
if (unp2 == 0)
return;
unp->unp_conn = 0;
so = unp->unp_socket;
switch (so->so_type) {
case SOCK_DGRAM:
if (unp2->unp_refs == unp)
unp2->unp_refs = unp->unp_nextref;
else {
unp2 = unp2->unp_refs;
for (;;) {
KASSERT(solocked2(so, unp2->unp_socket));
if (unp2 == 0)
panic("unp_disconnect1");
if (unp2->unp_nextref == unp)
break;
unp2 = unp2->unp_nextref;
}
unp2->unp_nextref = unp->unp_nextref;
}
unp->unp_nextref = 0;
so->so_state &= ~SS_ISCONNECTED;
break;
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
KASSERT(solocked2(so, unp2->unp_socket));
soisdisconnected(so);
unp2->unp_conn = 0;
soisdisconnected(unp2->unp_socket);
break;
}
}
static void
unp_shutdown1(struct unpcb *unp)
{
struct socket *so;
switch(unp->unp_socket->so_type) {
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
socantrcvmore(so);
break;
default:
break;
}
}
static bool
unp_drop(struct unpcb *unp, int errno)
{
struct socket *so = unp->unp_socket;
KASSERT(solocked(so));
so->so_error = errno;
unp_disconnect1(unp);
if (so->so_head) {
so->so_pcb = NULL;
/* sofree() drops the socket lock */
sofree(so);
unp_free(unp);
return true;
}
return false;
}
#ifdef notdef
static void
unp_drain(void)
{
}
#endif
int
unp_externalize(struct mbuf *rights, struct lwp *l, int flags)
{
struct cmsghdr * const cm = mtod(rights, struct cmsghdr *);
struct proc * const p = l->l_proc;
file_t **rp;
int error = 0;
const size_t nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
sizeof(file_t *);
if (nfds == 0)
goto noop;
int * const fdp = kmem_alloc(nfds * sizeof(int), KM_SLEEP);
rw_enter(&p->p_cwdi->cwdi_lock, RW_READER);
/* Make sure the recipient should be able to see the files.. */
rp = (file_t **)CMSG_DATA(cm);
for (size_t i = 0; i < nfds; i++) {
file_t * const fp = *rp++;
if (fp == NULL) {
error = EINVAL;
goto out;
}
/*
* If we are in a chroot'ed directory, and
* someone wants to pass us a directory, make
* sure it's inside the subtree we're allowed
* to access.
*/
if (p->p_cwdi->cwdi_rdir != NULL && fp->f_type == DTYPE_VNODE) {
vnode_t *vp = fp->f_vnode;
if ((vp->v_type == VDIR) &&
!vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) {
error = EPERM;
goto out;
}
}
}
restart:
/*
* First loop -- allocate file descriptor table slots for the
* new files.
*/
for (size_t i = 0; i < nfds; i++) {
if ((error = fd_alloc(p, 0, &fdp[i])) != 0) {
/*
* Back out what we've done so far.
*/
while (i-- > 0) {
fd_abort(p, NULL, fdp[i]);
}
if (error == ENOSPC) {
fd_tryexpand(p);
error = 0;
goto restart;
}
/*
* This is the error that has historically
* been returned, and some callers may
* expect it.
*/
error = EMSGSIZE;
goto out;
}
}
/*
* Now that adding them has succeeded, update all of the
* file passing state and affix the descriptors.
*/
rp = (file_t **)CMSG_DATA(cm);
int *ofdp = (int *)CMSG_DATA(cm);
for (size_t i = 0; i < nfds; i++) {
file_t * const fp = *rp++;
const int fd = fdp[i];
atomic_dec_uint(&unp_rights);
fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
fd_affix(p, fp, fd);
/*
		 * Done with this file pointer, replace it with an fd.
*/
*ofdp++ = fd;
mutex_enter(&fp->f_lock);
fp->f_msgcount--;
mutex_exit(&fp->f_lock);
/*
* Note that fd_affix() adds a reference to the file.
* The file may already have been closed by another
* LWP in the process, so we must drop the reference
* added by unp_internalize() with closef().
*/
closef(fp);
}
/*
* Adjust length, in case of transition from large file_t
* pointers to ints.
*/
if (sizeof(file_t *) != sizeof(int)) {
cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
rights->m_len = CMSG_SPACE(nfds * sizeof(int));
}
out:
if (__predict_false(error != 0)) {
file_t **const fpp = (file_t **)CMSG_DATA(cm);
for (size_t i = 0; i < nfds; i++)
unp_discard_now(fpp[i]);
/*
* Truncate the array so that nobody will try to interpret
* what is now garbage in it.
*/
cm->cmsg_len = CMSG_LEN(0);
rights->m_len = CMSG_SPACE(0);
}
rw_exit(&p->p_cwdi->cwdi_lock);
kmem_free(fdp, nfds * sizeof(int));
noop:
/*
* Don't disclose kernel memory in the alignment space.
*/
KASSERT(cm->cmsg_len <= rights->m_len);
memset(&mtod(rights, char *)[cm->cmsg_len], 0, rights->m_len -
cm->cmsg_len);
return error;
}
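/*
 * Userland view of what unp_externalize() produces, as a minimal sketch:
 * the SCM_RIGHTS control message now carries descriptor numbers usable by
 * the receiver.  This assumes one descriptor per message; error handling
 * is trimmed.
 */
#if 0	/* userland example only */
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
recv_fd(int s)
{
	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
	char byte;
	struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
	struct msghdr msg;
	struct cmsghdr *cmsg;
	int fd = -1;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cm.buf;
	msg.msg_controllen = sizeof(cm.buf);
	if (recvmsg(s, &msg, 0) == -1)
		return -1;
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS)
			memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
	}
	return fd;
}
#endif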
static int
unp_internalize(struct mbuf **controlp)
{
filedesc_t *fdescp = curlwp->l_fd;
fdtab_t *dt;
struct mbuf *control = *controlp;
struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
file_t **rp, **files;
file_t *fp;
int i, fd, *fdp;
int nfds, error;
u_int maxmsg;
error = 0;
newcm = NULL;
/* Sanity check the control message header. */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    cm->cmsg_len > control->m_len ||
cm->cmsg_len < CMSG_ALIGN(sizeof(*cm)))
return (EINVAL);
/*
* Verify that the file descriptors are valid, and acquire
* a reference to each.
*/
nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
fdp = (int *)CMSG_DATA(cm);
maxmsg = maxfiles / unp_rights_ratio;
for (i = 0; i < nfds; i++) {
fd = *fdp++;
if (atomic_inc_uint_nv(&unp_rights) > maxmsg) {
atomic_dec_uint(&unp_rights);
nfds = i;
error = EAGAIN;
goto out;
}
if ((fp = fd_getfile(fd)) == NULL
|| fp->f_type == DTYPE_KQUEUE) {
if (fp)
fd_putfile(fd);
atomic_dec_uint(&unp_rights);
nfds = i;
error = EBADF;
goto out;
}
}
/* Allocate new space and copy header into it. */
newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK);
if (newcm == NULL) {
error = E2BIG;
goto out;
}
memcpy(newcm, cm, sizeof(struct cmsghdr));
memset(newcm + 1, 0, CMSG_LEN(0) - sizeof(struct cmsghdr));
files = (file_t **)CMSG_DATA(newcm);
/*
* Transform the file descriptors into file_t pointers, in
* reverse order so that if pointers are bigger than ints, the
	 * int won't get overwritten until we're done. No need to lock, as we have
* already validated the descriptors with fd_getfile().
*/
fdp = (int *)CMSG_DATA(cm) + nfds;
rp = files + nfds;
	for (i = 0; i < nfds; i++) {
		dt = atomic_load_consume(&fdescp->fd_dt);
		fp = atomic_load_consume(&dt->dt_ff[*--fdp]->ff_file);
		KASSERT(fp != NULL);
mutex_enter(&fp->f_lock);
*--rp = fp;
fp->f_count++;
fp->f_msgcount++;
mutex_exit(&fp->f_lock);
}
out:
/* Release descriptor references. */
fdp = (int *)CMSG_DATA(cm);
for (i = 0; i < nfds; i++) {
fd_putfile(*fdp++);
		if (error != 0) {
			atomic_dec_uint(&unp_rights);
}
}
	if (error == 0) {
		if (control->m_flags & M_EXT) {
m_freem(control);
*controlp = control = m_get(M_WAIT, MT_CONTROL);
}
MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)),
M_MBUF, NULL, NULL);
cm = newcm;
/*
* Adjust message & mbuf to note amount of space
* actually used.
*/
cm->cmsg_len = CMSG_LEN(nfds * sizeof(file_t *));
control->m_len = CMSG_SPACE(nfds * sizeof(file_t *));
}
return error;
}
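/*
 * Userland view of what unp_internalize() consumes, as a minimal sketch:
 * a single SCM_RIGHTS control message at SOL_SOCKET carrying an array of
 * ints (here just one).  Error handling is trimmed.
 */
#if 0	/* userland example only */
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
send_fd(int s, int fd)
{
	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
	char byte = 0;
	struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cm.buf;
	msg.msg_controllen = sizeof(cm.buf);
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));
	return sendmsg(s, &msg, 0) == -1 ? -1 : 0;
}
#endif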
struct mbuf *
unp_addsockcred(struct lwp *l, struct mbuf *control)
{
struct sockcred *sc;
struct mbuf *m;
void *p;
m = sbcreatecontrol1(&p, SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)),
SCM_CREDS, SOL_SOCKET, M_WAITOK);
if (m == NULL)
return control;
sc = p;
sc->sc_pid = l->l_proc->p_pid;
sc->sc_uid = kauth_cred_getuid(l->l_cred);
sc->sc_euid = kauth_cred_geteuid(l->l_cred);
sc->sc_gid = kauth_cred_getgid(l->l_cred);
sc->sc_egid = kauth_cred_getegid(l->l_cred);
sc->sc_ngroups = kauth_cred_ngroups(l->l_cred);
for (int i = 0; i < sc->sc_ngroups; i++)
sc->sc_groups[i] = kauth_cred_group(l->l_cred, i);
return m_add(control, m);
}
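/*
 * Userland sketch of reading the struct sockcred built above, as a
 * fragment continuing the recvmsg() loop from the SCM_RIGHTS example:
 * the message is variable-length (SOCKCREDSIZE()), so the group list
 * follows the fixed fields.
 */
#if 0	/* userland example only */
	/* "cmsg" is a struct cmsghdr * obtained from recvmsg(), as above. */
	if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDS) {
		const struct sockcred *sc = (const void *)CMSG_DATA(cmsg);

		printf("peer pid %d euid %d egid %d, %d groups\n",
		    (int)sc->sc_pid, (int)sc->sc_euid, (int)sc->sc_egid,
		    sc->sc_ngroups);
	}
#endif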
/*
* Do a mark-sweep GC of files in the system, to free up any which are
* caught in flight to an about-to-be-closed socket. Additionally,
* process deferred file closures.
*/
static void
unp_gc(file_t *dp)
{
extern struct domain unixdomain;
file_t *fp, *np;
struct socket *so, *so1;
u_int i, oflags, rflags;
bool didwork;
KASSERT(curlwp == unp_thread_lwp);
KASSERT(mutex_owned(&filelist_lock));
/*
* First, process deferred file closures.
*/
while (!SLIST_EMPTY(&unp_thread_discard)) {
fp = SLIST_FIRST(&unp_thread_discard);
KASSERT(fp->f_unpcount > 0);
KASSERT(fp->f_count > 0);
KASSERT(fp->f_msgcount > 0);
KASSERT(fp->f_count >= fp->f_unpcount);
KASSERT(fp->f_count >= fp->f_msgcount);
KASSERT(fp->f_msgcount >= fp->f_unpcount);
SLIST_REMOVE_HEAD(&unp_thread_discard, f_unplist);
i = fp->f_unpcount;
fp->f_unpcount = 0;
mutex_exit(&filelist_lock);
for (; i != 0; i--) {
unp_discard_now(fp);
}
mutex_enter(&filelist_lock);
}
/*
* Clear mark bits. Ensure that we don't consider new files
* entering the file table during this loop (they will not have
* FSCAN set).
*/
unp_defer = 0;
LIST_FOREACH(fp, &filehead, f_list) {
for (oflags = fp->f_flag;; oflags = rflags) {
rflags = atomic_cas_uint(&fp->f_flag, oflags,
(oflags | FSCAN) & ~(FMARK|FDEFER));
if (__predict_true(oflags == rflags)) {
break;
}
}
}
/*
* Iterate over the set of sockets, marking ones believed (based on
* refcount) to be referenced from a process, and marking for rescan
 * sockets which are queued on a socket. Rescan continues descending
* and searching for sockets referenced by sockets (FDEFER), until
* there are no more socket->socket references to be discovered.
*/
do {
didwork = false;
for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
KASSERT(mutex_owned(&filelist_lock));
np = LIST_NEXT(fp, f_list);
mutex_enter(&fp->f_lock);
if ((fp->f_flag & FDEFER) != 0) {
atomic_and_uint(&fp->f_flag, ~FDEFER);
unp_defer--;
if (fp->f_count == 0) {
/*
* XXX: closef() doesn't pay attention
* to FDEFER
*/
mutex_exit(&fp->f_lock);
continue;
}
} else {
if (fp->f_count == 0 ||
(fp->f_flag & FMARK) != 0 ||
fp->f_count == fp->f_msgcount ||
fp->f_unpcount != 0) {
mutex_exit(&fp->f_lock);
continue;
}
}
atomic_or_uint(&fp->f_flag, FMARK);
if (fp->f_type != DTYPE_SOCKET ||
(so = fp->f_socket) == NULL ||
so->so_proto->pr_domain != &unixdomain ||
(so->so_proto->pr_flags & PR_RIGHTS) == 0) {
mutex_exit(&fp->f_lock);
continue;
}
/* Gain file ref, mark our position, and unlock. */
didwork = true;
LIST_INSERT_AFTER(fp, dp, f_list);
fp->f_count++;
mutex_exit(&fp->f_lock);
mutex_exit(&filelist_lock);
/*
* Mark files referenced from sockets queued on the
* accept queue as well.
*/
solock(so);
unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
if ((so->so_options & SO_ACCEPTCONN) != 0) {
TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
}
TAILQ_FOREACH(so1, &so->so_q, so_qe) {
unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
}
}
sounlock(so);
/* Re-lock and restart from where we left off. */
closef(fp);
mutex_enter(&filelist_lock);
np = LIST_NEXT(dp, f_list);
LIST_REMOVE(dp, f_list);
}
/*
* Bail early if we did nothing in the loop above. Could
* happen because of concurrent activity causing unp_defer
* to get out of sync.
*/
} while (unp_defer != 0 && didwork);
/*
* Sweep pass.
*
* We grab an extra reference to each of the files that are
* not otherwise accessible and then free the rights that are
* stored in messages on them.
*/
for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
KASSERT(mutex_owned(&filelist_lock));
np = LIST_NEXT(fp, f_list);
mutex_enter(&fp->f_lock);
/*
* Ignore non-sockets.
* Ignore dead sockets, or sockets with pending close.
* Ignore sockets obviously referenced elsewhere.
* Ignore sockets marked as referenced by our scan.
* Ignore new sockets that did not exist during the scan.
*/
if (fp->f_type != DTYPE_SOCKET ||
fp->f_count == 0 || fp->f_unpcount != 0 ||
fp->f_count != fp->f_msgcount ||
(fp->f_flag & (FMARK | FSCAN)) != FSCAN) {
mutex_exit(&fp->f_lock);
continue;
}
/* Gain file ref, mark our position, and unlock. */
LIST_INSERT_AFTER(fp, dp, f_list);
fp->f_count++;
mutex_exit(&fp->f_lock);
mutex_exit(&filelist_lock);
/*
* Flush all data from the socket's receive buffer.
* This will cause files referenced only by the
* socket to be queued for close.
*/
so = fp->f_socket;
solock(so);
sorflush(so);
sounlock(so);
/* Re-lock and restart from where we left off. */
closef(fp);
mutex_enter(&filelist_lock);
np = LIST_NEXT(dp, f_list);
LIST_REMOVE(dp, f_list);
}
}
/*
* Garbage collector thread. While SCM_RIGHTS messages are in transit,
* wake once per second to garbage collect. Run continually while we
* have deferred closes to process.
*/
static void
unp_thread(void *cookie)
{
file_t *dp;
/* Allocate a dummy file for our scans. */
if ((dp = fgetdummy()) == NULL) {
panic("unp_thread");
}
mutex_enter(&filelist_lock);
for (;;) {
KASSERT(mutex_owned(&filelist_lock));
if (SLIST_EMPTY(&unp_thread_discard)) {
if (unp_rights != 0) {
(void)cv_timedwait(&unp_thread_cv,
&filelist_lock, hz);
} else {
cv_wait(&unp_thread_cv, &filelist_lock);
}
}
unp_gc(dp);
}
/* NOTREACHED */
}
/*
* Kick the garbage collector into action if there is something for
* it to process.
*/
static void
unp_thread_kick(void)
{
	if (!SLIST_EMPTY(&unp_thread_discard) || unp_rights != 0) {
		mutex_enter(&filelist_lock);
cv_signal(&unp_thread_cv);
mutex_exit(&filelist_lock);
}
}
void
unp_dispose(struct mbuf *m)
{
	if (m)
		unp_scan(m, unp_discard_later, 1);
}
void
unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard)
{
struct mbuf *m;
file_t **rp, *fp;
struct cmsghdr *cm;
int i, qfds;
	while (m0) {
		for (m = m0; m; m = m->m_next) {
			if (m->m_type != MT_CONTROL ||
m->m_len < sizeof(*cm)) {
continue;
}
cm = mtod(m, struct cmsghdr *);
if (cm->cmsg_level != SOL_SOCKET ||
cm->cmsg_type != SCM_RIGHTS)
continue;
qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
/ sizeof(file_t *);
rp = (file_t **)CMSG_DATA(cm);
for (i = 0; i < qfds; i++) {
fp = *rp;
if (discard) {
*rp = 0;
}
(*op)(fp);
rp++;
}
}
m0 = m0->m_nextpkt;
}
}
void
unp_mark(file_t *fp)
{
if (fp == NULL)
return;
/* If we're already deferred, don't screw up the defer count */
mutex_enter(&fp->f_lock);
if (fp->f_flag & (FMARK | FDEFER)) {
mutex_exit(&fp->f_lock);
return;
}
/*
* Minimize the number of deferrals... Sockets are the only type of
* file which can hold references to another file, so just mark
* other files, and defer unmarked sockets for the next pass.
*/
if (fp->f_type == DTYPE_SOCKET) {
unp_defer++;
KASSERT(fp->f_count != 0);
atomic_or_uint(&fp->f_flag, FDEFER);
} else {
atomic_or_uint(&fp->f_flag, FMARK);
}
mutex_exit(&fp->f_lock);
}
static void
unp_discard_now(file_t *fp)
{
if (fp == NULL)
return;
KASSERT(fp->f_count > 0);
KASSERT(fp->f_msgcount > 0);
mutex_enter(&fp->f_lock);
fp->f_msgcount--;
mutex_exit(&fp->f_lock);
atomic_dec_uint(&unp_rights);
(void)closef(fp);
}
static void
unp_discard_later(file_t *fp)
{
if (fp == NULL)
return;
	KASSERT(fp->f_count > 0);
	KASSERT(fp->f_msgcount > 0);
mutex_enter(&filelist_lock);
	if (fp->f_unpcount++ == 0) {
		SLIST_INSERT_HEAD(&unp_thread_discard, fp, f_unplist);
}
mutex_exit(&filelist_lock);
}
static void
unp_sysctl_create(void)
{
KASSERT(usrreq_sysctllog == NULL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "sendspace",
SYSCTL_DESCR("Default stream send space"),
NULL, 0, &unpst_sendspace, 0,
CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "recvspace",
SYSCTL_DESCR("Default stream recv space"),
NULL, 0, &unpst_recvspace, 0,
CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "sendspace",
SYSCTL_DESCR("Default datagram send space"),
NULL, 0, &unpdg_sendspace, 0,
CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "recvspace",
SYSCTL_DESCR("Default datagram recv space"),
NULL, 0, &unpdg_recvspace, 0,
CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_INT, "inflight",
SYSCTL_DESCR("File descriptors in flight"),
NULL, 0, &unp_rights, 0,
CTL_NET, PF_LOCAL, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_INT, "deferred",
SYSCTL_DESCR("File descriptors deferred for close"),
NULL, 0, &unp_defer, 0,
CTL_NET, PF_LOCAL, CTL_CREATE, CTL_EOL);
}
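/*
 * The nodes created above land under net.local.* (PF_LOCAL, then the
 * socket type).  A userland sketch of reading one of them with
 * sysctlbyname(3); error handling is trimmed.
 */
#if 0	/* userland example only */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>

static void
show_stream_sendspace(void)
{
	u_long space;
	size_t len = sizeof(space);

	if (sysctlbyname("net.local.stream.sendspace", &space, &len,
	    NULL, 0) == 0)
		printf("net.local.stream.sendspace = %lu\n", space);
}
#endif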
const struct pr_usrreqs unp_usrreqs = {
.pr_attach = unp_attach,
.pr_detach = unp_detach,
.pr_accept = unp_accept,
.pr_bind = unp_bind,
.pr_listen = unp_listen,
.pr_connect = unp_connect,
.pr_connect2 = unp_connect2,
.pr_disconnect = unp_disconnect,
.pr_shutdown = unp_shutdown,
.pr_abort = unp_abort,
.pr_ioctl = unp_ioctl,
.pr_stat = unp_stat,
.pr_peeraddr = unp_peeraddr,
.pr_sockaddr = unp_sockaddr,
.pr_rcvd = unp_rcvd,
.pr_recvoob = unp_recvoob,
.pr_send = unp_send,
.pr_sendoob = unp_sendoob,
};
/* $NetBSD: pktqueue.c,v 1.22 2023/05/28 08:09:34 andvar Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* The packet queue (pktqueue) interface is a lockless IP input queue
* which also abstracts and handles network ISR scheduling. It provides
* a mechanism to enable receiver-side packet steering (RPS).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: pktqueue.c,v 1.22 2023/05/28 08:09:34 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/pcq.h>
#include <sys/intr.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/percpu.h>
#include <sys/xcall.h>
#include <sys/once.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <net/pktqueue.h>
#include <net/rss_config.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
struct pktqueue {
/*
* The lock used for a barrier mechanism. The barrier counter,
* as well as the drop counter, are managed atomically though.
* Ensure this group is in a separate cache line.
*/
union {
struct {
kmutex_t pq_lock;
volatile u_int pq_barrier;
};
uint8_t _pad[COHERENCY_UNIT];
};
/* The size of the queue, counters and the interrupt handler. */
u_int pq_maxlen;
percpu_t * pq_counters;
void * pq_sih;
/* The per-CPU queues. */
struct percpu * pq_pcq; /* struct pcq * */
/* The linkage on the list of all pktqueues. */
LIST_ENTRY(pktqueue) pq_list;
};
/* The counters of the packet queue. */
#define PQCNT_ENQUEUE 0
#define PQCNT_DEQUEUE 1
#define PQCNT_DROP 2
#define PQCNT_NCOUNTERS 3
typedef struct {
uint64_t count[PQCNT_NCOUNTERS];
} pktq_counters_t;
/* Special marker value used by pktq_barrier() mechanism. */
#define PKTQ_MARKER ((void *)(~0ULL))
/*
* This is a list of all pktqueues. This list is used by
* pktq_ifdetach() to issue a barrier on every pktqueue.
*
* The r/w lock is acquired for writing in pktq_create() and
* pktq_destroy(), and for reading in pktq_ifdetach().
*
* This list is not performance critical, and will seldom be
* accessed.
*/
static LIST_HEAD(, pktqueue) pktqueue_list __read_mostly;
static krwlock_t pktqueue_list_lock __read_mostly;
static once_t pktqueue_list_init_once __read_mostly;
static int
pktqueue_list_init(void)
{
LIST_INIT(&pktqueue_list);
rw_init(&pktqueue_list_lock);
return 0;
}
static void
pktq_init_cpu(void *vqp, void *vpq, struct cpu_info *ci)
{
struct pcq **qp = vqp;
struct pktqueue *pq = vpq;
*qp = pcq_create(pq->pq_maxlen, KM_SLEEP);
}
static void
pktq_fini_cpu(void *vqp, void *vpq, struct cpu_info *ci)
{
struct pcq **qp = vqp, *q = *qp;
KASSERT(pcq_peek(q) == NULL);
pcq_destroy(q);
*qp = NULL; /* paranoia */
}
static struct pcq *
pktq_pcq(struct pktqueue *pq, struct cpu_info *ci)
{
struct pcq **qp, *q;
/*
* As long as preemption is disabled, the xcall to swap percpu
* buffers can't complete, so it is safe to read the pointer.
*/
KASSERT(kpreempt_disabled());
qp = percpu_getptr_remote(pq->pq_pcq, ci);
q = *qp;
return q;
}
pktqueue_t *
pktq_create(size_t maxlen, void (*intrh)(void *), void *sc)
{
const u_int sflags = SOFTINT_NET | SOFTINT_MPSAFE | SOFTINT_RCPU;
pktqueue_t *pq;
percpu_t *pc;
void *sih;
RUN_ONCE(&pktqueue_list_init_once, pktqueue_list_init);
pc = percpu_alloc(sizeof(pktq_counters_t));
if ((sih = softint_establish(sflags, intrh, sc)) == NULL) {
percpu_free(pc, sizeof(pktq_counters_t));
return NULL;
}
pq = kmem_zalloc(sizeof(*pq), KM_SLEEP);
mutex_init(&pq->pq_lock, MUTEX_DEFAULT, IPL_NONE);
pq->pq_maxlen = maxlen;
pq->pq_counters = pc;
pq->pq_sih = sih;
pq->pq_pcq = percpu_create(sizeof(struct pcq *),
pktq_init_cpu, pktq_fini_cpu, pq);
rw_enter(&pktqueue_list_lock, RW_WRITER);
LIST_INSERT_HEAD(&pktqueue_list, pq, pq_list);
rw_exit(&pktqueue_list_lock);
return pq;
}
void
pktq_destroy(pktqueue_t *pq)
{
KASSERT(pktqueue_list_init_once.o_status == ONCE_DONE);
rw_enter(&pktqueue_list_lock, RW_WRITER);
LIST_REMOVE(pq, pq_list);
rw_exit(&pktqueue_list_lock);
percpu_free(pq->pq_pcq, sizeof(struct pcq *));
percpu_free(pq->pq_counters, sizeof(pktq_counters_t));
softint_disestablish(pq->pq_sih);
mutex_destroy(&pq->pq_lock);
kmem_free(pq, sizeof(*pq));
}
/*
* - pktq_inc_count: increment the counter given an ID.
* - pktq_collect_counts: handler to sum up the counts from each CPU.
* - pktq_get_count: return the effective count given an ID.
*/
static inline void
pktq_inc_count(pktqueue_t *pq, u_int i)
{
percpu_t *pc = pq->pq_counters;
pktq_counters_t *c;
c = percpu_getref(pc);
c->count[i]++;
percpu_putref(pc);
}
static void
pktq_collect_counts(void *mem, void *arg, struct cpu_info *ci)
{
const pktq_counters_t *c = mem;
pktq_counters_t *sum = arg;
int s = splnet();
for (u_int i = 0; i < PQCNT_NCOUNTERS; i++) {
sum->count[i] += c->count[i];
}
splx(s);
}
static uint64_t
pktq_get_count(pktqueue_t *pq, pktq_count_t c)
{
pktq_counters_t sum;
if (c != PKTQ_MAXLEN) {
memset(&sum, 0, sizeof(sum));
percpu_foreach_xcall(pq->pq_counters,
XC_HIGHPRI_IPL(IPL_SOFTNET), pktq_collect_counts, &sum);
}
switch (c) {
case PKTQ_NITEMS:
return sum.count[PQCNT_ENQUEUE] - sum.count[PQCNT_DEQUEUE];
case PKTQ_DROPS:
return sum.count[PQCNT_DROP];
case PKTQ_MAXLEN:
return pq->pq_maxlen;
}
return 0;
}
uint32_t
pktq_rps_hash(const pktq_rps_hash_func_t *funcp, const struct mbuf *m)
{
pktq_rps_hash_func_t func = atomic_load_relaxed(funcp);
KASSERT(func != NULL);
return (*func)(m);
}
static uint32_t
pktq_rps_hash_zero(const struct mbuf *m __unused)
{
return 0;
}
static uint32_t
pktq_rps_hash_curcpu(const struct mbuf *m __unused)
{
return cpu_index(curcpu());
}
static uint32_t
pktq_rps_hash_toeplitz(const struct mbuf *m)
{
struct ip *ip;
/*
* Exclude the UDP port from the hash: IP fragments are not
* currently handled, so including it would split traffic into a
* mix of 2-tuple and 4-tuple hashed packets.
*/
const u_int flag = RSS_TOEPLITZ_USE_TCP_PORT;
/* Glance at the IP version. */
if ((m->m_flags & M_PKTHDR) == 0)
return 0;
ip = mtod(m, struct ip *);
if (ip->ip_v == IPVERSION) {
if (__predict_false(m->m_len < sizeof(struct ip)))
return 0;
return rss_toeplitz_hash_from_mbuf_ipv4(m, flag);
} else if (ip->ip_v == 6) {
if (__predict_false(m->m_len < sizeof(struct ip6_hdr)))
return 0;
return rss_toeplitz_hash_from_mbuf_ipv6(m, flag);
}
return 0;
}
/*
* Toeplitz hash that never selects the current CPU, i.e. packets are
* always steered to other CPUs. Generally, this performs better than
* plain toeplitz.
*/
static uint32_t
pktq_rps_hash_toeplitz_othercpus(const struct mbuf *m)
{
uint32_t hash;
if (ncpu == 1)
return 0;
hash = pktq_rps_hash_toeplitz(m);
hash %= ncpu - 1;
if (hash >= cpu_index(curcpu()))
return hash + 1;
else
return hash;
}
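/*
* Worked example of the remapping above (illustrative only): with
* ncpu = 4 and the current CPU having cpu_index 1, the Toeplitz hash
* is first reduced modulo ncpu - 1 = 3, giving a value in {0, 1, 2}.
* Values greater than or equal to the current CPU's index are shifted
* up by one, so the possible results are {0, 2, 3}, i.e. every CPU
* except the current one.
*/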
static struct pktq_rps_hash_table {
const char* prh_type;
pktq_rps_hash_func_t prh_func;
} const pktq_rps_hash_tab[] = {
{ "zero", pktq_rps_hash_zero },
{ "curcpu", pktq_rps_hash_curcpu },
{ "toeplitz", pktq_rps_hash_toeplitz },
{ "toeplitz-othercpus", pktq_rps_hash_toeplitz_othercpus },
};
const pktq_rps_hash_func_t pktq_rps_hash_default =
#ifdef NET_MPSAFE
pktq_rps_hash_curcpu;
#else
pktq_rps_hash_zero;
#endif
static const char *
pktq_get_rps_hash_type(pktq_rps_hash_func_t func)
{
for (int i = 0; i < __arraycount(pktq_rps_hash_tab); i++) {
if (func == pktq_rps_hash_tab[i].prh_func) {
return pktq_rps_hash_tab[i].prh_type;
}
}
return NULL;
}
static int
pktq_set_rps_hash_type(pktq_rps_hash_func_t *func, const char *type)
{
if (strcmp(type, pktq_get_rps_hash_type(*func)) == 0)
return 0;
for (int i = 0; i < __arraycount(pktq_rps_hash_tab); i++) {
if (strcmp(type, pktq_rps_hash_tab[i].prh_type) == 0) {
atomic_store_relaxed(func, pktq_rps_hash_tab[i].prh_func);
return 0;
}
}
return ENOENT;
}
int
sysctl_pktq_rps_hash_handler(SYSCTLFN_ARGS)
{
struct sysctlnode node;
pktq_rps_hash_func_t *func;
int error;
char type[PKTQ_RPS_HASH_NAME_LEN];
node = *rnode;
func = node.sysctl_data;
strlcpy(type, pktq_get_rps_hash_type(*func), PKTQ_RPS_HASH_NAME_LEN);
node.sysctl_data = &type;
node.sysctl_size = sizeof(type);
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
error = pktq_set_rps_hash_type(func, type);
return error;
}
/*
* pktq_enqueue: inject the packet into the end of the queue.
*
* => Must be called from interrupt context or with preemption disabled.
* => Consumes the packet and returns true on success.
* => Returns false on failure; the caller is responsible for freeing the packet.
*/
bool
pktq_enqueue(pktqueue_t *pq, struct mbuf *m, const u_int hash __unused)
{
#if defined(_RUMPKERNEL) || defined(_RUMP_NATIVE_ABI)
struct cpu_info *ci = curcpu();
#else
struct cpu_info *ci = cpu_lookup(hash % ncpu);
#endif
KASSERT(kpreempt_disabled());
if (__predict_false(!pcq_put(pktq_pcq(pq, ci), m))) {
pktq_inc_count(pq, PQCNT_DROP);
return false;
}
softint_schedule_cpu(pq->pq_sih, ci);
pktq_inc_count(pq, PQCNT_ENQUEUE);
return true;
}
/*
* pktq_dequeue: take a packet from the queue.
*
* => Must be called with preemption disabled.
* => Must ensure there are no concurrent dequeue calls.
*/
struct mbuf *
pktq_dequeue(pktqueue_t *pq)
{
struct cpu_info *ci = curcpu();
struct mbuf *m;
KASSERT(kpreempt_disabled());
m = pcq_get(pktq_pcq(pq, ci));
if (__predict_false(m == PKTQ_MARKER)) {
/* Note the marker entry. */
atomic_inc_uint(&pq->pq_barrier);
/* Get the next queue entry. */
m = pcq_get(pktq_pcq(pq, ci));
/*
* There can only be one barrier operation pending
* on a pktqueue at any given time, so we can assert
* that the next item is not a marker.
*/
KASSERT(m != PKTQ_MARKER);
}
if (__predict_true(m != NULL)) {
pktq_inc_count(pq, PQCNT_DEQUEUE);
}
return m;
}
/*
* pktq_barrier: wait for a grace period during which all packets
* enqueued at the moment of the call will have been processed. This is
* used to ensure that e.g. packets referencing some interface have been
* drained.
*/
void
pktq_barrier(pktqueue_t *pq)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
u_int pending = 0;
mutex_enter(&pq->pq_lock);
KASSERT(pq->pq_barrier == 0);
for (CPU_INFO_FOREACH(cii, ci)) {
struct pcq *q;
kpreempt_disable();
q = pktq_pcq(pq, ci);
kpreempt_enable();
/* If the queue is empty - nothing to do. */
if (pcq_peek(q) == NULL) {
continue;
}
/* Otherwise, insert the marker entry. */
while (!pcq_put(q, PKTQ_MARKER)) {
kpause("pktqsync", false, 1, NULL);
}
kpreempt_disable();
softint_schedule_cpu(pq->pq_sih, ci);
kpreempt_enable();
pending++;
}
/* Wait for each queue to process the markers. */
while (pq->pq_barrier != pending) {
kpause("pktqsync", false, 1, NULL);
}
pq->pq_barrier = 0;
mutex_exit(&pq->pq_lock);
}
/*
* pktq_ifdetach: issue a barrier on all pktqueues when a network
* interface is detached.
*/
void
pktq_ifdetach(void)
{
pktqueue_t *pq;
/* Just in case no pktqueues have been created yet... */
RUN_ONCE(&pktqueue_list_init_once, pktqueue_list_init);
rw_enter(&pktqueue_list_lock, RW_READER);
LIST_FOREACH(pq, &pktqueue_list, pq_list) {
pktq_barrier(pq);
}
rw_exit(&pktqueue_list_lock);
}
/*
* pktq_flush: free mbufs in all queues.
*
* => The caller must ensure there are no concurrent writers or flush calls.
*/
void
pktq_flush(pktqueue_t *pq)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct mbuf *m, *m0 = NULL;
ASSERT_SLEEPABLE();
/*
* Run a dummy softint at IPL_SOFTNET on all CPUs to ensure that any
* already running handler for this pktqueue is no longer running.
*/
xc_barrier(XC_HIGHPRI_IPL(IPL_SOFTNET));
/*
* Acquire the barrier lock. While the caller ensures that
* no explicit pktq_barrier() calls will be issued, this holds
* off any implicit pktq_barrier() calls that would happen
* as the result of pktq_ifdetach().
*/
mutex_enter(&pq->pq_lock);
for (CPU_INFO_FOREACH(cii, ci)) {
struct pcq *q;
kpreempt_disable();
q = pktq_pcq(pq, ci);
kpreempt_enable();
/*
* Pull the packets off the pcq and chain them into
* a list to be freed later.
*/
while ((m = pcq_get(q)) != NULL) {
pktq_inc_count(pq, PQCNT_DEQUEUE);
m->m_nextpkt = m0;
m0 = m;
}
}
mutex_exit(&pq->pq_lock);
/* Free the packets now that the critical section is over. */
while ((m = m0) != NULL) {
m0 = m->m_nextpkt;
m_freem(m);
}
}
static void
pktq_set_maxlen_cpu(void *vpq, void *vqs)
{
struct pktqueue *pq = vpq;
struct pcq **qp, *q, **qs = vqs;
unsigned i = cpu_index(curcpu());
int s;
s = splnet();
qp = percpu_getref(pq->pq_pcq);
q = *qp;
*qp = qs[i];
qs[i] = q;
percpu_putref(pq->pq_pcq);
splx(s);
}
/*
* pktq_set_maxlen: create per-CPU queues using a new size and replace
* the existing queues without losing any packets.
*
* XXX ncpu must remain stable throughout.
*/
int
pktq_set_maxlen(pktqueue_t *pq, size_t maxlen)
{
const u_int slotbytes = ncpu * sizeof(pcq_t *);
pcq_t **qs;
if (!maxlen || maxlen > PCQ_MAXLEN)
return EINVAL;
if (pq->pq_maxlen == maxlen)
return 0;
/* First, allocate the new queues. */
qs = kmem_zalloc(slotbytes, KM_SLEEP);
for (u_int i = 0; i < ncpu; i++) {
qs[i] = pcq_create(maxlen, KM_SLEEP);
}
/*
* Issue an xcall to replace the queue pointers on each CPU.
* This implies all the necessary memory barriers.
*/
mutex_enter(&pq->pq_lock);
xc_wait(xc_broadcast(XC_HIGHPRI, pktq_set_maxlen_cpu, pq, qs));
pq->pq_maxlen = maxlen;
mutex_exit(&pq->pq_lock);
/*
* At this point, the new packets are flowing into the new
* queues. However, the old queues may have some packets
* present which are no longer being processed. We are going
* to re-enqueue them. This may change the order of packet
* arrival, but it is not considered an issue.
*
* There may be in-flight interrupts calling pktq_dequeue()
* which reference the old queues. Issue a barrier to ensure
* that we are going to be the only pcq_get() callers on the
* old queues.
*/
pktq_barrier(pq);
for (u_int i = 0; i < ncpu; i++) {
struct pcq *q;
struct mbuf *m;
kpreempt_disable();
q = pktq_pcq(pq, cpu_lookup(i));
kpreempt_enable();
while ((m = pcq_get(qs[i])) != NULL) {
while (!pcq_put(q, m)) {
kpause("pktqrenq", false, 1, NULL);
}
}
pcq_destroy(qs[i]);
}
/* Well, that was fun. */
kmem_free(qs, slotbytes);
return 0;
}
static int
sysctl_pktq_maxlen(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
pktqueue_t * const pq = node.sysctl_data;
u_int nmaxlen = pktq_get_count(pq, PKTQ_MAXLEN);
int error;
node.sysctl_data = &nmaxlen;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
return pktq_set_maxlen(pq, nmaxlen);
}
static int
sysctl_pktq_count(SYSCTLFN_ARGS, u_int count_id)
{
struct sysctlnode node = *rnode;
pktqueue_t * const pq = node.sysctl_data;
uint64_t count = pktq_get_count(pq, count_id);
node.sysctl_data = &count;
return sysctl_lookup(SYSCTLFN_CALL(&node));
}
static int
sysctl_pktq_nitems(SYSCTLFN_ARGS)
{
return sysctl_pktq_count(SYSCTLFN_CALL(rnode), PKTQ_NITEMS);
}
static int
sysctl_pktq_drops(SYSCTLFN_ARGS)
{
return sysctl_pktq_count(SYSCTLFN_CALL(rnode), PKTQ_DROPS);
}
/*
* pktq_sysctl_setup: set up the sysctl nodes for a pktqueue
* using standardized names at the specified parent node and
* node ID (or CTL_CREATE).
*/
void
pktq_sysctl_setup(pktqueue_t * const pq, struct sysctllog ** const clog,
const struct sysctlnode * const parent_node, const int qid)
{
const struct sysctlnode *rnode = parent_node, *cnode;
KASSERT(pq != NULL);
KASSERT(parent_node != NULL);
KASSERT(qid == CTL_CREATE || qid >= 0);
/* Create the "ifq" node below the parent node. */
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "ifq",
SYSCTL_DESCR("Protocol input queue controls"),
NULL, 0, NULL, 0,
qid, CTL_EOL);
/* Now create the standard child nodes below "ifq". */
rnode = cnode;
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "len",
SYSCTL_DESCR("Current input queue length"),
sysctl_pktq_nitems, 0, (void *)pq, 0,
IFQCTL_LEN, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "maxlen",
SYSCTL_DESCR("Maximum allowed input queue length"),
sysctl_pktq_maxlen, 0, (void *)pq, 0,
IFQCTL_MAXLEN, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "drops",
SYSCTL_DESCR("Packets dropped due to full input queue"),
sysctl_pktq_drops, 0, (void *)pq, 0,
IFQCTL_DROPS, CTL_EOL);
}
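/*
* Illustrative call from a protocol's sysctl setup routine (a sketch
* only; "ip_pktq" and the parent "rnode" are assumptions about the
* caller, not anything defined in this file):
*/
#if 0
	pktq_sysctl_setup(ip_pktq, clog, rnode, CTL_CREATE);
#endif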
/* $NetBSD: wapbl.h,v 1.21 2018/12/10 21:19:33 jdolecek Exp $ */
/*-
* Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_WAPBL_H
#define _SYS_WAPBL_H
#include <sys/mutex.h>
#if defined(_KERNEL) || defined(_KMEMUSER)
#include <miscfs/specfs/specdev.h>
#endif
/*
* This header file describes the API and data structures for
* write-ahead physical block logging (WAPBL) support.
*/
#if defined(_KERNEL_OPT)
#include "opt_wapbl.h"
#endif
#ifdef WAPBL_DEBUG
#ifndef WAPBL_DEBUG_PRINT
#define WAPBL_DEBUG_PRINT (WAPBL_PRINT_REPLAY | WAPBL_PRINT_OPEN)
#endif
#if 0
#define WAPBL_DEBUG_BUFBYTES
#endif
#endif
#ifdef WAPBL_DEBUG_PRINT
enum {
WAPBL_PRINT_OPEN = 0x1,
WAPBL_PRINT_FLUSH = 0x2,
WAPBL_PRINT_TRUNCATE = 0x4,
WAPBL_PRINT_TRANSACTION = 0x8,
WAPBL_PRINT_BUFFER = 0x10,
WAPBL_PRINT_BUFFER2 = 0x20,
WAPBL_PRINT_ALLOC = 0x40,
WAPBL_PRINT_INODE = 0x80,
WAPBL_PRINT_WRITE = 0x100,
WAPBL_PRINT_IO = 0x200,
WAPBL_PRINT_REPLAY = 0x400,
WAPBL_PRINT_ERROR = 0x800,
WAPBL_PRINT_DISCARD = 0x1000,
WAPBL_PRINT_BIODONE = 0x2000,
};
#define WAPBL_PRINTF(mask, a) if (wapbl_debug_print & (mask)) printf a
extern int wapbl_debug_print;
#else
#define WAPBL_PRINTF(mask, a)
#endif
/****************************************************************/
#include <sys/queue.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#ifdef _KERNEL
struct wapbl_entry;
struct wapbl_replay;
struct wapbl;
struct wapbl_dealloc {
TAILQ_ENTRY(wapbl_dealloc) wd_entries;
daddr_t wd_blkno; /* address of block */
int wd_len; /* size of block */
};
typedef void (*wapbl_flush_fn_t)(struct mount *, struct wapbl_dealloc *);
/*
* This structure holds per-transaction log information.
*/
struct wapbl_entry {
struct wapbl *we_wapbl;
SIMPLEQ_ENTRY(wapbl_entry) we_entries;
size_t we_bufcount; /* Count of unsynced buffers */
size_t we_reclaimable_bytes; /* Number of on-disk bytes for this
transaction */
int we_error;
#ifdef WAPBL_DEBUG_BUFBYTES
size_t we_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif
};
/* Start using a log */
int wapbl_start(struct wapbl **, struct mount *, struct vnode *, daddr_t,
size_t, size_t, struct wapbl_replay *,
wapbl_flush_fn_t, wapbl_flush_fn_t);
/* Discard the current transaction, potentially dangerous */
void wapbl_discard(struct wapbl *);
/* Stop using a log */
int wapbl_stop(struct wapbl *, int);
/*
* Begin a new transaction or increment transaction recursion
* level if called while a transaction is already in progress
* by the current process.
*/
int wapbl_begin(struct wapbl *, const char *, int);
/* End a transaction or decrement the transaction recursion level */
void wapbl_end(struct wapbl *);
/*
* Add a new buffer to the current transaction. The buffer's
* data will be copied to the current transaction log and the
* buffer will be marked B_LOCKED so that it will not be
* flushed to disk by the syncer or reallocated.
*/
void wapbl_add_buf(struct wapbl *, struct buf *);
/* Remove a buffer from the current transaction. */
void wapbl_remove_buf(struct wapbl *, struct buf *);
void wapbl_resize_buf(struct wapbl *, struct buf *, long, long);
/*
* This will flush all completed transactions to disk and
* start asynchronous writes on the associated buffers.
*/
int wapbl_flush(struct wapbl *, int);
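/*
* Typical metadata-update pattern using the interface above (a sketch
* under assumptions: "wl" is the mount's struct wapbl and "bp" is a
* metadata buffer; the error handling is a placeholder):
*/
#if 0
	error = wapbl_begin(wl, __FILE__, __LINE__);
	if (error)
		return error;
	/* ... modify the metadata held in bp ... */
	wapbl_add_buf(wl, bp);
	wapbl_end(wl);
	/* Optionally push completed transactions out to the log. */
	error = wapbl_flush(wl, 0);
#endif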
/*
* Inodes that are allocated but have zero link count
* must be registered with the current transaction
* so they may be recorded in the log and cleaned up later.
* Registration or unregistration of inode numbers that are already registered is OK.
*/
void wapbl_register_inode(struct wapbl *, ino_t, mode_t);
void wapbl_unregister_inode(struct wapbl *, ino_t, mode_t);
/*
* Metadata block deallocations must be registered so
* that revocation records can be written and to prevent
* the corresponding blocks from being reused as data
* blocks until the log is on disk.
*/
int wapbl_register_deallocation(struct wapbl *, daddr_t, int, bool,
void **);
void wapbl_unregister_deallocation(struct wapbl *, void *);
void wapbl_jlock_assert(struct wapbl *wl);
void wapbl_junlock_assert(struct wapbl *wl);
void wapbl_print(struct wapbl *wl, int full, void (*pr)(const char *, ...)
__printflike(1, 2));
#if defined(WAPBL_DEBUG) || defined(DDB)
void wapbl_dump(struct wapbl *);
#endif
void wapbl_biodone(struct buf *);
extern const struct wapbl_ops wapbl_ops;
static __inline struct mount *
wapbl_vptomp(struct vnode *vp)
{
struct mount *mp;
mp = NULL;
if (vp != NULL) {
if (vp->v_type == VBLK)
mp = spec_node_getmountedfs(vp);
else
mp = vp->v_mount;
}
return mp;
}
static __inline bool
wapbl_vphaswapbl(struct vnode *vp)
{
struct mount *mp;
if (vp == NULL)
return false;
mp = wapbl_vptomp(vp);
return mp && mp->mnt_wapbl;
}
#endif /* _KERNEL */
/****************************************************************/
/* Replay support */
#ifdef WAPBL_INTERNAL
LIST_HEAD(wapbl_blk_head, wapbl_blk);
struct wapbl_replay {
struct vnode *wr_logvp;
struct vnode *wr_devvp;
daddr_t wr_logpbn;
int wr_log_dev_bshift;
int wr_fs_dev_bshift;
int64_t wr_circ_off;
int64_t wr_circ_size;
uint32_t wr_generation;
void *wr_scratch;
struct wapbl_blk_head *wr_blkhash;
u_long wr_blkhashmask;
int wr_blkhashcnt;
off_t wr_inodeshead;
off_t wr_inodestail;
int wr_inodescnt;
struct {
uint32_t wr_inumber;
uint32_t wr_imode;
} *wr_inodes;
};
#define wapbl_replay_isopen(wr) ((wr)->wr_scratch != 0)
/* Supply these to provide I/O support */
int wapbl_write(void *, size_t, struct vnode *, daddr_t);
int wapbl_read(void *, size_t, struct vnode *, daddr_t);
/****************************************************************/
#else
struct wapbl_replay;
#endif /* WAPBL_INTERNAL */
/****************************************************************/
int wapbl_replay_start(struct wapbl_replay **, struct vnode *,
daddr_t, size_t, size_t);
void wapbl_replay_stop(struct wapbl_replay *);
void wapbl_replay_free(struct wapbl_replay *);
int wapbl_replay_write(struct wapbl_replay *, struct vnode *);
int wapbl_replay_can_read(struct wapbl_replay *, daddr_t, long);
int wapbl_replay_read(struct wapbl_replay *, void *, daddr_t, long);
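/*
* Typical replay sequence at mount time (a sketch only; the argument
* values are placeholders derived from the filesystem's superblock and
* are not defined here):
*/
#if 0
	error = wapbl_replay_start(&wr, devvp, logpbn, count, blksize);
	if (error == 0) {
		error = wapbl_replay_write(wr, devvp);
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
	}
#endif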
/****************************************************************/
#endif /* !_SYS_WAPBL_H */
/* $NetBSD: kern_ntptime.c,v 1.64 2022/10/26 23:23:52 riastradh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
***********************************************************************
* *
* Copyright (c) David L. Mills 1993-2001 *
* *
* Permission to use, copy, modify, and distribute this software and *
* its documentation for any purpose and without fee is hereby *
* granted, provided that the above copyright notice appears in all *
* copies and that both the copyright notice and this permission *
* notice appear in supporting documentation, and that the name *
* University of Delaware not be used in advertising or publicity *
* pertaining to distribution of the software without specific, *
* written prior permission. The University of Delaware makes no *
* representations about the suitability this software for any *
* purpose. It is provided "as is" without express or implied *
* warranty. *
* *
**********************************************************************/
/*
* Adapted from the original sources for FreeBSD and timecounters by:
* Poul-Henning Kamp <phk@FreeBSD.org>.
*
* The 32bit version of the "LP" macros seems a bit past its "sell by"
* date so I have retained only the 64bit version and included it directly
* in this file.
*
* Only minor changes done to interface with the timecounters over in
* sys/kern/kern_clock.c. Some of the comments below may be (even more)
* confusing and/or plain wrong in that context.
*/
#include <sys/cdefs.h>
/* __FBSDID("$FreeBSD: src/sys/kern/kern_ntptime.c,v 1.59 2005/05/28 14:34:41 rwatson Exp $"); */
__KERNEL_RCSID(0, "$NetBSD: kern_ntptime.c,v 1.64 2022/10/26 23:23:52 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#endif
#include <sys/param.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/timex.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <compat/sys/timex.h>
/*
* Single-precision macros for 64-bit machines
*/
typedef int64_t l_fp;
#define L_ADD(v, u) ((v) += (u))
#define L_SUB(v, u) ((v) -= (u))
#define L_ADDHI(v, a) ((v) += (int64_t)(a) << 32)
#define L_NEG(v) ((v) = -(v))
#define L_RSHIFT(v, n) \
do { \
if ((v) < 0) \
(v) = -(-(v) >> (n)); \
else \
(v) = (v) >> (n); \
} while (0)
#define L_MPY(v, a) ((v) *= (a))
#define L_CLR(v) ((v) = 0)
#define L_ISNEG(v) ((v) < 0)
#define L_LINT(v, a) ((v) = (int64_t)((uint64_t)(a) << 32))
#define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32)
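/*
* Worked example of the fixed-point macros (illustrative): L_LINT(v, 3)
* stores 3 << 32, i.e. the integer 3 with a zero fraction, and L_GINT(v)
* recovers 3. After L_LINT(u, 1) and L_ADD(v, u), L_RSHIFT(v, 1) yields
* 2 (the average of 3 and 1), with the fraction bits retaining sub-unit
* precision across repeated operations.
*/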
#ifdef NTP
/*
* Generic NTP kernel interface
*
* These routines constitute the Network Time Protocol (NTP) interfaces
* for user and daemon application programs. The ntp_gettime() routine
* provides the time, maximum error (synch distance) and estimated error
* (dispersion) to client user application programs. The ntp_adjtime()
* routine is used by the NTP daemon to adjust the system clock to an
* externally derived time. The time offset and related variables set by
* this routine are used by other routines in this module to adjust the
* phase and frequency of the clock discipline loop which controls the
* system clock.
*
* When the kernel time is reckoned directly in nanoseconds (NTP_NANO
* defined), the time at each tick interrupt is derived directly from
* the kernel time variable. When the kernel time is reckoned in
* microseconds, (NTP_NANO undefined), the time is derived from the
* kernel time variable together with a variable representing the
* leftover nanoseconds at the last tick interrupt. In either case, the
* current nanosecond time is reckoned from these values plus an
* interpolated value derived by the clock routines in another
* architecture-specific module. The interpolation can use either a
* dedicated counter or a processor cycle counter (PCC) implemented in
* some architectures.
*
* Note that all routines must run at priority splclock or higher.
*/
/*
* Phase/frequency-lock loop (PLL/FLL) definitions
*
* The nanosecond clock discipline uses two variable types, time
* variables and frequency variables. Both types are represented as 64-
* bit fixed-point quantities with the decimal point between two 32-bit
* halves. On a 32-bit machine, each half is represented as a single
* word and mathematical operations are done using multiple-precision
* arithmetic. On a 64-bit machine, ordinary computer arithmetic is
* used.
*
* A time variable is a signed 64-bit fixed-point number in ns and
* fraction. It represents the remaining time offset to be amortized
* over succeeding tick interrupts. The maximum time offset is about
* 0.5 s and the resolution is about 2.3e-10 ns.
*
* 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |s s s| ns |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | fraction |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* A frequency variable is a signed 64-bit fixed-point number in ns/s
* and fraction. It represents the ns and fraction to be added to the
* kernel time variable at each second. The maximum frequency offset is
* about +-500000 ns/s and the resolution is about 2.3e-10 ns/s.
*
* 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |s s s s s s s s s s s s s| ns/s |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | fraction |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/
/*
* The following variables establish the state of the PLL/FLL and the
* residual time and frequency offset of the local clock.
*/
#define SHIFT_PLL 4 /* PLL loop gain (shift) */
#define SHIFT_FLL 2 /* FLL loop gain (shift) */
static int time_state = TIME_OK; /* clock state */
static int time_status = STA_UNSYNC; /* clock status bits */
static long time_tai; /* TAI offset (s) */
static long time_monitor; /* last time offset scaled (ns) */
static long time_constant; /* poll interval (shift) (s) */
static long time_precision = 1; /* clock precision (ns) */
static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */
static long time_esterror = MAXPHASE / 1000; /* estimated error (us) */
static time_t time_reftime; /* time at last adjustment (s) */
static l_fp time_offset; /* time offset (ns) */
static l_fp time_freq; /* frequency offset (ns/s) */
#endif /* NTP */
static l_fp time_adj; /* tick adjust (ns/s) */
int64_t time_adjtime; /* correction from adjtime(2) (usec) */
#ifdef NTP
#ifdef PPS_SYNC
/*
* The following variables are used when a pulse-per-second (PPS) signal
* is available and connected via a modem control lead. They establish
* the engineering parameters of the clock discipline loop when
* controlled by the PPS signal.
*/
#define PPS_FAVG 2 /* min freq avg interval (s) (shift) */
#define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */
#define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */
#define PPS_PAVG 4 /* phase avg interval (s) (shift) */
#define PPS_VALID 120 /* PPS signal watchdog max (s) */
#define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */
#define PPS_POPCORN 2 /* popcorn spike threshold (shift) */
static struct timespec pps_tf[3]; /* phase median filter */
static l_fp pps_freq; /* scaled frequency offset (ns/s) */
static long pps_fcount; /* frequency accumulator */
static long pps_jitter; /* nominal jitter (ns) */
static long pps_stabil; /* nominal stability (scaled ns/s) */
static long pps_lastsec; /* time at last calibration (s) */
static int pps_valid; /* signal watchdog counter */
static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */
static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */
static int pps_intcnt; /* wander counter */
/*
* PPS signal quality monitors
*/
static long pps_calcnt; /* calibration intervals */
static long pps_jitcnt; /* jitter limit exceeded */
static long pps_stbcnt; /* stability limit exceeded */
static long pps_errcnt; /* calibration errors */
#endif /* PPS_SYNC */
/*
* End of phase/frequency-lock loop (PLL/FLL) definitions
*/
static void hardupdate(long offset);
/*
* ntp_gettime() - NTP user application interface
*/
void
ntp_gettime(struct ntptimeval *ntv)
{
memset(ntv, 0, sizeof(*ntv));
mutex_spin_enter(&timecounter_lock);
nanotime(&ntv->time);
ntv->maxerror = time_maxerror;
ntv->esterror = time_esterror;
ntv->tai = time_tai;
ntv->time_state = time_state;
mutex_spin_exit(&timecounter_lock);
}
/* ARGSUSED */
/*
* ntp_adjtime() - NTP daemon application interface
*/
int
sys_ntp_adjtime(struct lwp *l, const struct sys_ntp_adjtime_args *uap, register_t *retval)
{
/* {
syscallarg(struct timex *) tp;
} */
struct timex ntv;
int error;
error = copyin((void *)SCARG(uap, tp), (void *)&ntv, sizeof(ntv));
if (error != 0)
return (error);
if (ntv.modes != 0 && (error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_NTPADJTIME, NULL,
NULL, NULL)) != 0)
return (error);
ntp_adjtime1(&ntv);
error = copyout((void *)&ntv, (void *)SCARG(uap, tp), sizeof(ntv));
if (!error)
*retval = ntp_timestatus();
return error;
}
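/*
* Illustrative userland invocation of this interface (a sketch; the
* offset value and the error handling are placeholders, not kernel
* code):
*
*	struct timex ntv = { .modes = MOD_STATUS | MOD_OFFSET,
*	    .status = STA_PLL, .offset = 1000 };
*	if (ntp_adjtime(&ntv) < 0)
*		err(1, "ntp_adjtime");
*
* The offset is interpreted in microseconds, or in nanoseconds once
* STA_NANO has been set via MOD_NANO.
*/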
void
ntp_adjtime1(struct timex *ntv)
{
long freq;
int modes;
/*
* Update selected clock variables - only the superuser can
* change anything. Note that there is no error checking here on
* the assumption that the superuser knows what it is doing.
* Note that either the time constant or the TAI offset is loaded
* from the ntv.constant member, depending on the mode bits. If
* the STA_PLL bit in the status word is cleared, the state and
* status words are reset to the initial values at boot.
*/
mutex_spin_enter(&timecounter_lock);
modes = ntv->modes;
if (modes != 0)
/* We need to save the system time during shutdown */
time_adjusted |= 2;
if (modes & MOD_MAXERROR)
time_maxerror = ntv->maxerror;
if (modes & MOD_ESTERROR)
time_esterror = ntv->esterror;
if (modes & MOD_STATUS) {
if (time_status & STA_PLL && !(ntv->status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
#ifdef PPS_SYNC
pps_shift = PPS_FAVG;
#endif /* PPS_SYNC */
}
time_status &= STA_RONLY;
time_status |= ntv->status & ~STA_RONLY;
}
if (modes & MOD_TIMECONST) {
if (ntv->constant < 0)
time_constant = 0;
else if (ntv->constant > MAXTC)
time_constant = MAXTC;
else
time_constant = ntv->constant;
}
if (modes & MOD_TAI) {
if (ntv->constant > 0) /* XXX zero & negative numbers ? */
time_tai = ntv->constant;
}
#ifdef PPS_SYNC
if (modes & MOD_PPSMAX) {
if (ntv->shift < PPS_FAVG)
pps_shiftmax = PPS_FAVG;
else if (ntv->shift > PPS_FAVGMAX)
pps_shiftmax = PPS_FAVGMAX;
else
pps_shiftmax = ntv->shift;
}
#endif /* PPS_SYNC */
if (modes & MOD_NANO)
time_status |= STA_NANO;
if (modes & MOD_MICRO)
time_status &= ~STA_NANO;
if (modes & MOD_CLKB)
time_status |= STA_CLK;
if (modes & MOD_CLKA)
time_status &= ~STA_CLK;
if (modes & MOD_FREQUENCY) {
freq = MIN(INT32_MAX, MAX(INT32_MIN, ntv->freq));
freq = (freq * (int64_t)1000) >> 16;
if (freq > MAXFREQ)
L_LINT(time_freq, MAXFREQ);
else if (freq < -MAXFREQ)
L_LINT(time_freq, -MAXFREQ);
else {
/*
* ntv.freq is [PPM * 2^16] = [us/s * 2^16]
* time_freq is [ns/s * 2^32]
*/
time_freq = ntv->freq * 1000LL * 65536LL;
}
#ifdef PPS_SYNC
pps_freq = time_freq;
#endif /* PPS_SYNC */
}
if (modes & MOD_OFFSET) {
if (time_status & STA_NANO) {
hardupdate(ntv->offset);
} else {
long offset = ntv->offset;
offset = MIN(offset, MAXPHASE/1000);
offset = MAX(offset, -MAXPHASE/1000);
hardupdate(offset * 1000);
}
}
/*
* Retrieve all clock variables. Note that the TAI offset is
* returned only by ntp_gettime().
*/
if (time_status & STA_NANO)
ntv->offset = L_GINT(time_offset);
else
ntv->offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */
if (time_freq < 0)
ntv->freq = L_GINT(-((-time_freq / 1000LL) << 16));
else
ntv->freq = L_GINT((time_freq / 1000LL) << 16);
ntv->maxerror = time_maxerror;
ntv->esterror = time_esterror;
ntv->status = time_status;
ntv->constant = time_constant;
if (time_status & STA_NANO)
ntv->precision = time_precision;
else
ntv->precision = time_precision / 1000;
ntv->tolerance = MAXFREQ * SCALE_PPM;
#ifdef PPS_SYNC
ntv->shift = pps_shift;
ntv->ppsfreq = L_GINT((pps_freq / 1000LL) << 16);
if (time_status & STA_NANO)
ntv->jitter = pps_jitter;
else
ntv->jitter = pps_jitter / 1000;
ntv->stabil = pps_stabil;
ntv->calcnt = pps_calcnt;
ntv->errcnt = pps_errcnt;
ntv->jitcnt = pps_jitcnt;
ntv->stbcnt = pps_stbcnt;
#endif /* PPS_SYNC */
mutex_spin_exit(&timecounter_lock);
}
#endif /* NTP */
/*
* second_overflow() - called after ntp_tick_adjust()
*
* This routine is ordinarily called immediately following the above
* routine ntp_tick_adjust(). While these two routines are normally
* combined, they are separated here only for the purposes of
* simulation.
*/
void
ntp_update_second(int64_t *adjustment, time_t *newsec)
{
int tickrate;
l_fp ftemp; /* 32/64-bit temporary */
KASSERT(mutex_owned(&timecounter_lock));
#ifdef NTP
/*
* On rollover of the second both the nanosecond and microsecond
* clocks are updated and the state machine cranked as
* necessary. The phase adjustment to be used for the next
* second is calculated and the maximum error is increased by
* the tolerance.
*/
time_maxerror += MAXFREQ / 1000;
/*
* Leap second processing. If in leap-insert state at
* the end of the day, the system clock is set back one
* second; if in leap-delete state, the system clock is
* set ahead one second. The nano_time() routine or
* external clock driver will ensure that reported time
* is always monotonic.
*/
switch (time_state) {
/*
* No warning.
*/
case TIME_OK:
if (time_status & STA_INS)
time_state = TIME_INS;
else if (time_status & STA_DEL)
time_state = TIME_DEL;
break;
/*
* Insert second 23:59:60 following second
* 23:59:59.
*/
case TIME_INS:
if (!(time_status & STA_INS))
time_state = TIME_OK;
else if ((*newsec) % 86400 == 0) {
(*newsec)--;
time_state = TIME_OOP;
time_tai++;
}
break;
/*
* Delete second 23:59:59.
*/
case TIME_DEL:
if (!(time_status & STA_DEL))
time_state = TIME_OK;
else if (((*newsec) + 1) % 86400 == 0) {
(*newsec)++;
time_tai--;
time_state = TIME_WAIT;
}
break;
/*
* Insert second in progress.
*/
case TIME_OOP:
time_state = TIME_WAIT;
break;
/*
* Wait for status bits to clear.
*/
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
}
/*
* Compute the total time adjustment for the next second
* in ns. The offset is reduced by a factor depending on
* whether the PPS signal is operating. Note that the
* value is in effect scaled by the clock frequency,
* since the adjustment is added at each tick interrupt.
*/
ftemp = time_offset;
#ifdef PPS_SYNC
/* XXX even if PPS signal dies we should finish adjustment ? */
if (time_status & STA_PPSTIME && time_status &
STA_PPSSIGNAL)
L_RSHIFT(ftemp, pps_shift);
else
L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
#else
L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
#endif /* PPS_SYNC */
time_adj = ftemp;
L_SUB(time_offset, ftemp);
L_ADD(time_adj, time_freq);
#ifdef PPS_SYNC
if (pps_valid > 0)
pps_valid--;
else
time_status &= ~STA_PPSSIGNAL;
#endif /* PPS_SYNC */
#else /* !NTP */
L_CLR(time_adj);
#endif /* !NTP */
/*
* Apply any correction from adjtime(2). If the clock is more than one
* second off, slew at a rate of 5ms/s (5000 PPM); otherwise slew at
* 500us/s (500 PPM) until the final < 500 usecs are applied in a
* single step.
*/
if (time_adjtime != 0) {
if (time_adjtime > 1000000)
tickrate = 5000;
else if (time_adjtime < -1000000)
tickrate = -5000;
else if (time_adjtime > 500)
tickrate = 500;
else if (time_adjtime < -500)
tickrate = -500;
else
tickrate = time_adjtime;
time_adjtime -= tickrate;
L_LINT(ftemp, tickrate * 1000);
L_ADD(time_adj, ftemp);
}
*adjustment = time_adj;
}
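/*
* Worked example of the slew above (illustrative): with time_adjtime
* set to 2000000 us (2 s), each call consumes 5000 us while more than
* 1000000 us remain, then 500 us per call, and the final residue below
* 500 us is applied in a single step.
*/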
/*
* ntp_init() - initialize variables and structures
*
* This routine must be called after the kernel variables hz and tick
* are set or changed and before the next tick interrupt. In this
* particular implementation, these values are assumed set elsewhere in
* the kernel. The design allows the clock frequency and tick interval
* to be changed while the system is running. So, this routine should
* probably be integrated with the code that does that.
*/
void
ntp_init(void)
{
/*
* The following variables are initialized only at startup. Only
* those structures not cleared by the compiler need to be
* initialized, and these only in the simulator. In the actual
* kernel, any nonzero values here will quickly evaporate.
*/
L_CLR(time_adj);
#ifdef NTP
L_CLR(time_offset);
L_CLR(time_freq);
#ifdef PPS_SYNC
pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0;
pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0;
pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0;
pps_fcount = 0;
L_CLR(pps_freq);
#endif /* PPS_SYNC */
#endif
}
#ifdef NTP
/*
* hardupdate() - local clock update
*
* This routine is called by ntp_adjtime() to update the local clock
* phase and frequency. The implementation is of an adaptive-parameter,
* hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
* time and frequency offset estimates for each call. If the kernel PPS
* discipline code is configured (PPS_SYNC), the PPS signal itself
* determines the new time offset, instead of the calling argument.
* Presumably, calls to ntp_adjtime() occur only when the caller
* believes the local clock is valid within some bound (+-128 ms with
* NTP). If the caller's time is far different than the PPS time, an
* argument will ensue, and it's not clear who will lose.
*
* For uncompensated quartz crystal oscillators and nominal update
* intervals less than 256 s, operation should be in phase-lock mode,
* where the loop is disciplined to phase. For update intervals greater
* than 1024 s, operation should be in frequency-lock mode, where the
* loop is disciplined to frequency. Between 256 s and 1024 s, the mode
* is selected by the STA_MODE status bit.
*
* Note: splclock() is in effect.
*/
void
hardupdate(long offset)
{
long mtemp;
l_fp ftemp;
KASSERT(mutex_owned(&timecounter_lock));
/*
* Select how the phase is to be controlled and from which
* source. If the PPS signal is present and enabled to
* discipline the time, the PPS offset is used; otherwise, the
* argument offset is used.
*/
if (!(time_status & STA_PLL))
return;
if (!(time_status & STA_PPSTIME && time_status &
STA_PPSSIGNAL)) {
if (offset > MAXPHASE)
time_monitor = MAXPHASE;
else if (offset < -MAXPHASE)
time_monitor = -MAXPHASE;
else
time_monitor = offset;
L_LINT(time_offset, time_monitor);
}
/*
* Select how the frequency is to be controlled and in which
* mode (PLL or FLL). If the PPS signal is present and enabled
* to discipline the frequency, the PPS frequency is used;
* otherwise, the argument offset is used to compute it.
*/
if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) {
time_reftime = time_second;
return;
}
if (time_status & STA_FREQHOLD || time_reftime == 0)
time_reftime = time_second;
mtemp = time_second - time_reftime;
L_LINT(ftemp, time_monitor);
L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1);
L_MPY(ftemp, mtemp);
L_ADD(time_freq, ftemp);
time_status &= ~STA_MODE;
if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp >
MAXSEC)) {
L_LINT(ftemp, (time_monitor << 4) / mtemp);
L_RSHIFT(ftemp, SHIFT_FLL + 4);
L_ADD(time_freq, ftemp);
time_status |= STA_MODE;
}
time_reftime = time_second;
if (L_GINT(time_freq) > MAXFREQ)
L_LINT(time_freq, MAXFREQ);
else if (L_GINT(time_freq) < -MAXFREQ)
L_LINT(time_freq, -MAXFREQ);
}
#ifdef PPS_SYNC
/*
* hardpps() - discipline CPU clock oscillator to external PPS signal
*
* This routine is called at each PPS interrupt in order to discipline
* the CPU clock oscillator to the PPS signal. It measures the PPS phase
* and leaves it in a handy spot for the hardclock() routine. It
* integrates successive PPS phase differences and calculates the
* frequency offset. This is used in hardclock() to discipline the CPU
* clock oscillator so that intrinsic frequency error is cancelled out.
* The code requires the caller to capture the time and hardware counter
* value at the on-time PPS signal transition.
*
* Note that, on some Unix systems, this routine runs at an interrupt
* priority level higher than the timer interrupt routine hardclock().
* Therefore, the variables used are distinct from the hardclock()
* variables, except for certain exceptions: The PPS frequency pps_freq
* and phase pps_offset variables are determined by this routine and
* updated atomically. The time_tolerance variable can be considered a
* constant, since it is infrequently changed, and then only when the
* PPS signal is disabled. The watchdog counter pps_valid is updated
* once per second by hardclock() and is atomically cleared in this
* routine.
*/
void
hardpps(struct timespec *tsp, /* time at PPS */
long nsec /* hardware counter at PPS */)
{
long u_sec, u_nsec, v_nsec; /* temps */
l_fp ftemp;
KASSERT(mutex_owned(&timecounter_lock));
/*
* The signal is first processed by a range gate and frequency
* discriminator. The range gate rejects noise spikes outside
* the range +-500 us. The frequency discriminator rejects input
* signals with apparent frequency outside the range 1 +-500
* PPM. If two hits occur in the same second, we ignore the
* later hit; if not and a hit occurs outside the range gate,
* keep the later hit for later comparison, but do not process
* it.
*/
time_status |= STA_PPSSIGNAL | STA_PPSJITTER;
time_status &= ~(STA_PPSWANDER | STA_PPSERROR);
pps_valid = PPS_VALID;
u_sec = tsp->tv_sec;
u_nsec = tsp->tv_nsec;
if (u_nsec >= (NANOSECOND >> 1)) {
u_nsec -= NANOSECOND;
u_sec++;
}
v_nsec = u_nsec - pps_tf[0].tv_nsec;
if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND -
MAXFREQ)
return;
pps_tf[2] = pps_tf[1];
pps_tf[1] = pps_tf[0];
pps_tf[0].tv_sec = u_sec;
pps_tf[0].tv_nsec = u_nsec;
/*
* Compute the difference between the current and previous
* counter values. If the difference exceeds 0.5 s, assume it
* has wrapped around, so correct 1.0 s. If the result exceeds
* the tick interval, the sample point has crossed a tick
* boundary during the last second, so correct the tick. Very
* intricate.
*/
u_nsec = nsec;
if (u_nsec > (NANOSECOND >> 1))
u_nsec -= NANOSECOND;
else if (u_nsec < -(NANOSECOND >> 1))
u_nsec += NANOSECOND;
pps_fcount += u_nsec;
if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ)
return;
time_status &= ~STA_PPSJITTER;
/*
* A three-stage median filter is used to help denoise the PPS
* time. The median sample becomes the time offset estimate; the
* difference between the other two samples becomes the time
* dispersion (jitter) estimate.
*/
if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) {
if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) {
v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */
u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec;
} else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) {
v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */
u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec;
} else {
v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */
u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec;
}
} else {
if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) {
v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */
u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec;
} else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) {
v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */
u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec;
} else {
v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */
u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec;
}
}
/*
* Nominal jitter is due to PPS signal noise and interrupt
* latency. If it exceeds the popcorn threshold, the sample is
* discarded; otherwise, if so enabled, the time offset is
* updated. We can tolerate a modest loss of data here without
* much degrading time accuracy.
*/
if (u_nsec > (pps_jitter << PPS_POPCORN)) {
time_status |= STA_PPSJITTER;
pps_jitcnt++;
} else if (time_status & STA_PPSTIME) {
time_monitor = -v_nsec;
L_LINT(time_offset, time_monitor);
}
pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG;
u_sec = pps_tf[0].tv_sec - pps_lastsec;
if (u_sec < (1 << pps_shift))
return;
/*
* At the end of the calibration interval the difference between
* the first and last counter values becomes the scaled
* frequency. It will later be divided by the length of the
* interval to determine the frequency update. If the frequency
* exceeds a sanity threshold, or if the actual calibration
* interval is not equal to the expected length, the data are
* discarded. We can tolerate a modest loss of data here without
* much degrading frequency accuracy.
*/
pps_calcnt++;
v_nsec = -pps_fcount;
pps_lastsec = pps_tf[0].tv_sec;
pps_fcount = 0;
u_nsec = MAXFREQ << pps_shift;
if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 <<
pps_shift)) {
time_status |= STA_PPSERROR;
pps_errcnt++;
return;
}
/*
* Here the raw frequency offset and wander (stability) is
* calculated. If the wander is less than the wander threshold
* for four consecutive averaging intervals, the interval is
* doubled; if it is greater than the threshold for four
* consecutive intervals, the interval is halved. The scaled
* frequency offset is converted to frequency offset. The
* stability metric is calculated as the average of recent
* frequency changes, but is used only for performance
* monitoring.
*/
L_LINT(ftemp, v_nsec);
L_RSHIFT(ftemp, pps_shift);
L_SUB(ftemp, pps_freq);
u_nsec = L_GINT(ftemp);
if (u_nsec > PPS_MAXWANDER) {
L_LINT(ftemp, PPS_MAXWANDER);
pps_intcnt--;
time_status |= STA_PPSWANDER;
pps_stbcnt++;
} else if (u_nsec < -PPS_MAXWANDER) {
L_LINT(ftemp, -PPS_MAXWANDER);
pps_intcnt--;
time_status |= STA_PPSWANDER;
pps_stbcnt++;
} else {
pps_intcnt++;
}
if (pps_intcnt >= 4) {
pps_intcnt = 4;
if (pps_shift < pps_shiftmax) {
pps_shift++;
pps_intcnt = 0;
}
} else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) {
pps_intcnt = -4;
if (pps_shift > PPS_FAVG) {
pps_shift--;
pps_intcnt = 0;
}
}
if (u_nsec < 0)
u_nsec = -u_nsec;
pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG;
/*
* The PPS frequency is recalculated and clamped to the maximum
* MAXFREQ. If enabled, the system clock frequency is updated as
* well.
*/
L_ADD(pps_freq, ftemp);
u_nsec = L_GINT(pps_freq);
if (u_nsec > MAXFREQ)
L_LINT(pps_freq, MAXFREQ);
else if (u_nsec < -MAXFREQ)
L_LINT(pps_freq, -MAXFREQ);
if (time_status & STA_PPSFREQ)
time_freq = pps_freq;
}
#endif /* PPS_SYNC */
#endif /* NTP */
#ifdef NTP
int
ntp_timestatus(void)
{
int rv;
/*
* Status word error decode. If any of these conditions
* occur, an error is returned, instead of the status
* word. Most applications will care only about the fact
* the system clock may not be trusted, not about the
* details.
*
* Hardware or software error
*/
mutex_spin_enter(&timecounter_lock);
if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
/*
* PPS signal lost when either time or frequency
* synchronization requested
*/
(time_status & (STA_PPSFREQ | STA_PPSTIME) &&
!(time_status & STA_PPSSIGNAL)) ||
/*
* PPS jitter exceeded when time synchronization
* requested
*/
(time_status & STA_PPSTIME &&
time_status & STA_PPSJITTER) ||
/*
* PPS wander exceeded or calibration error when
* frequency synchronization requested
*/
(time_status & STA_PPSFREQ &&
time_status & (STA_PPSWANDER | STA_PPSERROR)))
rv = TIME_ERROR;
else
rv = time_state;
mutex_spin_exit(&timecounter_lock);
return rv;
}
/*ARGSUSED*/
/*
* ntp_gettime() - NTP user application interface
*/
int
sys___ntp_gettime50(struct lwp *l, const struct sys___ntp_gettime50_args *uap, register_t *retval)
{
/* {
syscallarg(struct ntptimeval *) ntvp;
} */
struct ntptimeval ntv;
int error = 0;
if (SCARG(uap, ntvp)) {
ntp_gettime(&ntv);
error = copyout((void *)&ntv, (void *)SCARG(uap, ntvp),
sizeof(ntv));
}
if (!error) {
*retval = ntp_timestatus();
}
return(error);
}
/*
* return information about kernel precision timekeeping
*/
static int
sysctl_kern_ntptime(SYSCTLFN_ARGS)
{
struct sysctlnode node;
struct ntptimeval ntv;
ntp_gettime(&ntv);
node = *rnode;
node.sysctl_data = &ntv;
node.sysctl_size = sizeof(ntv);
return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
SYSCTL_SETUP(sysctl_kern_ntptime_setup, "sysctl kern.ntptime node setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "ntptime",
SYSCTL_DESCR("Kernel clock values for NTP"),
sysctl_kern_ntptime, 0, NULL,
sizeof(struct ntptimeval),
CTL_KERN, KERN_NTPTIME, CTL_EOL);
}
#endif /* NTP */
/* $NetBSD: sys_select.c,v 1.66 2023/10/15 10:29:34 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2010, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran and Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
*/
/*
* System calls of synchronous I/O multiplexing subsystem.
*
* Locking
*
* Two locks are used: <object-lock> and selcluster_t::sc_lock.
*
* The <object-lock> might be a device driver or another subsystem, e.g.
* socket or pipe. This lock is not exported, and thus invisible to this
* subsystem. Mainly, synchronisation between selrecord() and selnotify()
* routines depends on this lock, as it will be described in the comments.
*
* Lock order
*
* <object-lock> ->
* selcluster_t::sc_lock
*/
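/*
 * Illustrative sketch only (not part of this file): how a hypothetical
 * driver pairs its <object-lock> with selnotify().  The example_softc
 * structure, its lock and its "data ready" flag are assumptions made
 * for the example, not real interfaces; the block is kept under
 * "#if 0" so it is never compiled.
 */
#if 0
struct example_softc {
	kmutex_t	sc_objlock;	/* the <object-lock> */
	struct selinfo	sc_rsel;	/* readers in select()/poll() */
	bool		sc_ready;	/* data available to read */
};

static void
example_rx_complete(struct example_softc *sc)
{

	mutex_enter(&sc->sc_objlock);	/* same lock as the poll side */
	sc->sc_ready = true;
	/* knhint is passed to KNOTE(); 0 here, no knotes in the sketch */
	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
	mutex_exit(&sc->sc_objlock);
}
#endif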
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.66 2023/10/15 10:29:34 riastradh Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mount.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sleepq.h>
#include <sys/socketvar.h>
#include <sys/syncobj.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uio.h>
/* Flags for lwp::l_selflag. */
#define SEL_RESET 0 /* awoken, interrupted, or not yet polling */
#define SEL_SCANNING 1 /* polling descriptors */
#define SEL_BLOCKING 2 /* blocking and waiting for event */
#define SEL_EVENT 3 /* interrupted, events set directly */
/*
* Per-cluster state for select()/poll(). For a system with fewer
* than 64 CPUs, this gives us per-CPU clusters.
*/
#define SELCLUSTERS 64
#define SELCLUSTERMASK (SELCLUSTERS - 1)
typedef struct selcluster {
kmutex_t *sc_lock;
sleepq_t sc_sleepq;
uint64_t sc_mask;
int sc_ncoll;
} selcluster_t;
static inline int selscan(char *, const int, const size_t, register_t *);
static inline int pollscan(struct pollfd *, const int, register_t *);
static void selclear(void);
static const int sel_flag[] = {
POLLRDNORM | POLLHUP | POLLERR,
POLLWRNORM | POLLHUP | POLLERR,
POLLRDBAND
};
/*
* LWPs are woken using the sleep queue only due to a collision, the case
* with the maximum Suck Factor. Save the cost of sorting for named waiters
* by inserting in LIFO order. In the future it would be preferable to not
* enqueue LWPs at all, unless subject to a collision.
*/
syncobj_t select_sobj = {
.sobj_name = "select",
.sobj_flag = SOBJ_SLEEPQ_LIFO,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = sleepq_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
static selcluster_t *selcluster[SELCLUSTERS] __read_mostly;
static int direct_select __read_mostly = 0;
/* Operations: either select() or poll(). */
const char selop_select[] = "select";
const char selop_poll[] = "poll";
/*
* Select system call.
*/
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) nd;
syscallarg(fd_set *) in;
syscallarg(fd_set *) ou;
syscallarg(fd_set *) ex;
syscallarg(const struct timespec *) ts;
syscallarg(sigset_t *) mask;
} */
struct timespec ats, *ts = NULL;
sigset_t amask, *mask = NULL;
int error;
if (SCARG(uap, ts)) {
error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
if (error)
return error;
ts = &ats;
}
if (SCARG(uap, mask) != NULL) {
error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
if (error)
return error;
mask = &amask;
}
return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
SCARG(uap, ou), SCARG(uap, ex), ts, mask);
}
int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) nd;
syscallarg(fd_set *) in;
syscallarg(fd_set *) ou;
syscallarg(fd_set *) ex;
syscallarg(struct timeval *) tv;
} */
struct timeval atv;
struct timespec ats, *ts = NULL;
int error;
if (SCARG(uap, tv)) {
error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
if (error)
return error;
if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
return EINVAL;
TIMEVAL_TO_TIMESPEC(&atv, &ats);
ts = &ats;
}
return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
}
/*
* sel_do_scan: common code to perform the scan on descriptors.
*/
static int
sel_do_scan(const char *opname, void *fds, const int nf, const size_t ni,
struct timespec *ts, sigset_t *mask, register_t *retval)
{
lwp_t * const l = curlwp;
selcluster_t *sc;
kmutex_t *lock;
struct timespec sleepts;
int error, timo;
timo = 0;
if (ts && inittimeleft(ts, &sleepts) == -1) {
return EINVAL;
}
	if (__predict_false(mask))
		sigsuspendsetup(l, mask);
/*
* We may context switch during or at any time after picking a CPU
* and cluster to associate with, but it doesn't matter. In the
* unlikely event we migrate elsewhere all we risk is a little lock
* contention; correctness is not sacrificed.
*/
sc = curcpu()->ci_data.cpu_selcluster;
lock = sc->sc_lock;
l->l_selcluster = sc;
if (opname == selop_select) {
l->l_selbits = fds;
l->l_selni = ni;
} else {
l->l_selbits = NULL;
}
for (;;) {
int ncoll;
SLIST_INIT(&l->l_selwait);
l->l_selret = 0;
/*
* No need to lock. If this is overwritten by another value
* while scanning, we will retry below. We only need to see
* exact state from the descriptors that we are about to poll,
* and lock activity resulting from fo_poll is enough to
* provide an up to date value for new polling activity.
*/
if (ts && (ts->tv_sec | ts->tv_nsec | direct_select) == 0) {
/* Non-blocking: no need for selrecord()/selclear() */
l->l_selflag = SEL_RESET;
} else {
l->l_selflag = SEL_SCANNING;
}
ncoll = sc->sc_ncoll;
membar_release();
if (opname == selop_select) {
error = selscan((char *)fds, nf, ni, retval);
} else {
error = pollscan((struct pollfd *)fds, nf, retval);
}
if (error || *retval)
break;
if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
break;
/*
* Acquire the lock and perform the (re)checks. Note, if
* collision has occurred, then our state does not matter,
* as we must perform re-scan. Therefore, check it first.
*/
state_check:
mutex_spin_enter(lock);
if (__predict_false(sc->sc_ncoll != ncoll)) {
/* Collision: perform re-scan. */
mutex_spin_exit(lock);
selclear();
continue;
}
if (__predict_true(l->l_selflag == SEL_EVENT)) {
/* Events occurred, they are set directly. */
mutex_spin_exit(lock);
break;
}
if (__predict_true(l->l_selflag == SEL_RESET)) {
/* Events occurred, but re-scan is requested. */
mutex_spin_exit(lock);
selclear();
continue;
}
		/* Nothing happened, therefore sleep. */
l->l_selflag = SEL_BLOCKING;
KASSERT(l->l_blcnt == 0);
(void)sleepq_enter(&sc->sc_sleepq, l, lock);
sleepq_enqueue(&sc->sc_sleepq, sc, opname, &select_sobj, true);
error = sleepq_block(timo, true, &select_sobj, 0);
if (error != 0) {
break;
}
/* Awoken: need to check the state. */
goto state_check;
}
selclear();
/* Add direct events if any. */
	if (l->l_selflag == SEL_EVENT) {
		KASSERT(l->l_selret != 0);
		*retval += l->l_selret;
	}
	if (__predict_false(mask))
		sigsuspendteardown(l);
/* select and poll are not restarted after signals... */
if (error == ERESTART)
return EINTR;
if (error == EWOULDBLOCK)
		return 0;
	return error;
}
int
selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou,
fd_set *u_ex, struct timespec *ts, sigset_t *mask)
{
char smallbits[howmany(FD_SETSIZE, NFDBITS) *
sizeof(fd_mask) * 6];
char *bits;
int error, nf;
size_t ni;
if (nd < 0)
return (EINVAL);
nf = atomic_load_consume(&curlwp->l_fd->fd_dt)->dt_nfiles;
if (nd > nf) {
/* forgiving; slightly wrong */
nd = nf;
}
ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = kmem_alloc(ni * 6, KM_SLEEP);
else
bits = smallbits;
#define getbits(name, x) \
if (u_ ## name) { \
error = copyin(u_ ## name, bits + ni * x, ni); \
if (error) \
goto fail; \
} else \
memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef getbits
error = sel_do_scan(selop_select, bits, nd, ni, ts, mask, retval);
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
fail:
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
return (error);
}
static inline int
selscan(char *bits, const int nfd, const size_t ni, register_t *retval)
{
fd_mask *ibitp, *obitp;
int msk, i, j, fd, n;
file_t *fp;
lwp_t *l;
ibitp = (fd_mask *)(bits + ni * 0);
obitp = (fd_mask *)(bits + ni * 3);
n = 0;
l = curlwp;
memset(obitp, 0, ni * 3);
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
fd_mask ibits, obits;
ibits = *ibitp;
obits = 0;
while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
ibits &= ~(1U << j);
if ((fp = fd_getfile(fd)) == NULL)
return (EBADF);
/*
* Setup an argument to selrecord(), which is
* a file descriptor number.
*/
l->l_selrec = fd;
				if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) {
					if (!direct_select) {
/*
* Have events: do nothing in
* selrecord().
*/
l->l_selflag = SEL_RESET;
}
obits |= (1U << j);
n++;
}
fd_putfile(fd);
}
if (obits != 0) {
if (direct_select) {
kmutex_t *lock;
lock = l->l_selcluster->sc_lock;
mutex_spin_enter(lock);
*obitp |= obits;
mutex_spin_exit(lock);
} else {
*obitp |= obits;
}
}
ibitp++;
obitp++;
}
}
*retval = n;
return (0);
}
/*
* Poll system call.
*/
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
/* {
syscallarg(struct pollfd *) fds;
syscallarg(u_int) nfds;
syscallarg(int) timeout;
} */
struct timespec ats, *ts = NULL;
	if (SCARG(uap, timeout) != INFTIM) {
		ats.tv_sec = SCARG(uap, timeout) / 1000;
ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
ts = &ats;
}
return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL);
}
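/*
 * Userland view of the conversion above, for illustration only: poll(2)
 * takes its timeout in milliseconds (or INFTIM to block indefinitely),
 * which sys_poll() turns into a timespec before calling pollcommon().
 * The snippet below is ordinary userland C, not kernel code.
 */
#if 0
#include <poll.h>
#include <stdio.h>

static int
example_wait_readable(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int n;

	/* 1500 ms becomes ts = { .tv_sec = 1, .tv_nsec = 500000000 }. */
	n = poll(&pfd, 1, 1500);
	if (n > 0 && (pfd.revents & POLLIN))
		printf("fd %d is readable\n", fd);
	return n;
}
#endif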
/*
 * Poll system call with a timespec timeout and signal mask (pollts).
 */
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
register_t *retval)
{
/* {
syscallarg(struct pollfd *) fds;
syscallarg(u_int) nfds;
syscallarg(const struct timespec *) ts;
syscallarg(const sigset_t *) mask;
} */
struct timespec ats, *ts = NULL;
sigset_t amask, *mask = NULL;
int error;
if (SCARG(uap, ts)) {
error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
if (error)
return error;
ts = &ats;
}
if (SCARG(uap, mask)) {
error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
if (error)
return error;
mask = &amask;
}
return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask);
}
int
pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds,
struct timespec *ts, sigset_t *mask)
{
struct pollfd smallfds[32];
struct pollfd *fds;
int error;
size_t ni;
if (nfds > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + 1000) {
/*
* Prevent userland from causing over-allocation.
		 * Raising the default limit too high can still cause
		 * a lot of memory to be allocated here, but in that case
		 * the file descriptor array itself will already be large.
*
* To reduce the memory requirements here, we could
* process the 'fds' array in chunks, but that
* is a lot of code that isn't normally useful.
* (Or just move the copyin/out into pollscan().)
*
* Historically the code silently truncated 'fds' to
* dt_nfiles entries - but that does cause issues.
*
* Using the max limit equivalent to sysctl
* kern.maxfiles is the moral equivalent of OPEN_MAX
* as specified by POSIX.
*
* We add a slop of 1000 in case the resource limit was
* changed after opening descriptors or the same descriptor
* was specified more than once.
*/
return EINVAL;
}
ni = nfds * sizeof(struct pollfd);
if (ni > sizeof(smallfds))
fds = kmem_alloc(ni, KM_SLEEP);
else
fds = smallfds;
error = copyin(u_fds, fds, ni);
if (error)
goto fail;
error = sel_do_scan(selop_poll, fds, nfds, ni, ts, mask, retval);
	if (error == 0)
		error = copyout(fds, u_fds, ni);
fail:
	if (fds != smallfds)
		kmem_free(fds, ni);
return (error);
}
static inline int
pollscan(struct pollfd *fds, const int nfd, register_t *retval)
{
file_t *fp;
int i, n = 0, revents;
for (i = 0; i < nfd; i++, fds++) {
fds->revents = 0;
if (fds->fd < 0) {
revents = 0;
} else if ((fp = fd_getfile(fds->fd)) == NULL) {
revents = POLLNVAL;
} else {
/*
* Perform poll: registers select request or returns
* the events which are set. Setup an argument for
* selrecord(), which is a pointer to struct pollfd.
*/
curlwp->l_selrec = (uintptr_t)fds;
revents = (*fp->f_ops->fo_poll)(fp,
fds->events | POLLERR | POLLHUP);
fd_putfile(fds->fd);
}
		if (revents) {
			if (!direct_select) {
/* Have events: do nothing in selrecord(). */
curlwp->l_selflag = SEL_RESET;
}
fds->revents = revents;
n++;
}
}
*retval = n;
return (0);
}
int
seltrue(dev_t dev, int events, lwp_t *l)
{
return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
/*
* Record a select request. Concurrency issues:
*
* The caller holds the same lock across calls to selrecord() and
* selnotify(), so we don't need to consider a concurrent wakeup
* while in this routine.
*
* The only activity we need to guard against is selclear(), called by
* another thread that is exiting sel_do_scan().
* `sel_lwp' can only become non-NULL while the caller's lock is held,
* so it cannot become non-NULL due to a change made by another thread
* while we are in this routine. It can only become _NULL_ due to a
* call to selclear().
*
* If it is non-NULL and != selector there is the potential for
* selclear() to be called by another thread. If either of those
* conditions are true, we're not interested in touching the `named
* waiter' part of the selinfo record because we need to record a
* collision. Hence there is no need for additional locking in this
* routine.
*/
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
selcluster_t *sc;
lwp_t *other;
KASSERT(selector == curlwp);
sc = selector->l_selcluster;
other = sip->sel_lwp;
if (selector->l_selflag == SEL_RESET) {
/* 0. We're not going to block - will poll again if needed. */
} else if (other == selector) {
/* 1. We (selector) already claimed to be the first LWP. */
KASSERT(sip->sel_cluster == sc);
} else if (other == NULL) {
/*
* 2. No first LWP, therefore we (selector) are the first.
*
* There may be unnamed waiters (collisions). Issue a memory
* barrier to ensure that we access sel_lwp (above) before
* other fields - this guards against a call to selclear().
*/
membar_acquire();
sip->sel_lwp = selector;
SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
/* Copy the argument, which is for selnotify(). */
sip->sel_fdinfo = selector->l_selrec;
/* Replace selinfo's lock with the chosen cluster's lock. */
sip->sel_cluster = sc;
} else {
/* 3. Multiple waiters: record a collision. */
sip->sel_collision |= sc->sc_mask;
KASSERT(sip->sel_cluster != NULL);
}
}
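/*
 * Illustrative sketch only: the fo_poll-side pattern that feeds
 * selrecord().  The example_softc members are assumptions for the
 * example (see the sketch near the top of this file); the object lock
 * held here is the same one the driver holds around selnotify().
 */
#if 0
static int
example_poll(struct example_softc *sc, int events, struct lwp *l)
{
	int revents = 0;

	mutex_enter(&sc->sc_objlock);
	if (events & (POLLIN | POLLRDNORM)) {
		if (sc->sc_ready)
			revents |= events & (POLLIN | POLLRDNORM);
		else
			selrecord(l, &sc->sc_rsel);
	}
	mutex_exit(&sc->sc_objlock);
	return revents;
}
#endif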
/*
* Record a knote.
*
* The caller holds the same lock as for selrecord().
*/
void
selrecord_knote(struct selinfo *sip, struct knote *kn)
{
klist_insert(&sip->sel_klist, kn);
}
/*
* Remove a knote.
*
* The caller holds the same lock as for selrecord().
*
* Returns true if the last knote was removed and the list
* is now empty.
*/
bool
selremove_knote(struct selinfo *sip, struct knote *kn)
{
return klist_remove(&sip->sel_klist, kn);
}
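/*
 * Illustrative sketch only: how a hypothetical kqueue filter would use
 * the two helpers above, attaching and detaching knotes under the same
 * object lock that guards selrecord()/selnotify().  example_softc and
 * its members are assumptions for the example.
 */
#if 0
static int
example_kqfilter_attach(struct example_softc *sc, struct knote *kn)
{

	kn->kn_hook = sc;
	mutex_enter(&sc->sc_objlock);
	selrecord_knote(&sc->sc_rsel, kn);
	mutex_exit(&sc->sc_objlock);
	return 0;
}

static void
example_kqfilter_detach(struct example_softc *sc, struct knote *kn)
{

	mutex_enter(&sc->sc_objlock);
	(void)selremove_knote(&sc->sc_rsel, kn);
	mutex_exit(&sc->sc_objlock);
}
#endif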
/*
* sel_setevents: a helper function for selnotify(), to set the events
* for LWP sleeping in selcommon() or pollcommon().
*/
static inline bool
sel_setevents(lwp_t *l, struct selinfo *sip, const int events)
{
const int oflag = l->l_selflag;
int ret = 0;
/*
* If we require re-scan or it was required by somebody else,
* then just (re)set SEL_RESET and return.
*/
if (__predict_false(events == 0 || oflag == SEL_RESET)) {
l->l_selflag = SEL_RESET;
return true;
}
/*
* Direct set. Note: select state of LWP is locked. First,
* determine whether it is selcommon() or pollcommon().
*/
if (l->l_selbits != NULL) {
const size_t ni = l->l_selni;
fd_mask *fds = (fd_mask *)l->l_selbits;
fd_mask *ofds = (fd_mask *)((char *)fds + ni * 3);
const int fd = sip->sel_fdinfo, fbit = 1 << (fd & __NFDMASK);
const int idx = fd >> __NFDSHIFT;
int n;
for (n = 0; n < 3; n++) {
if ((fds[idx] & fbit) != 0 && (ofds[idx] & fbit) == 0 &&
(sel_flag[n] & events)) {
ofds[idx] |= fbit;
ret++;
}
fds = (fd_mask *)((char *)fds + ni);
ofds = (fd_mask *)((char *)ofds + ni);
}
} else {
struct pollfd *pfd = (void *)sip->sel_fdinfo;
int revents = events & (pfd->events | POLLERR | POLLHUP);
if (revents) {
if (pfd->revents == 0)
ret = 1;
pfd->revents |= revents;
}
}
/* Check whether there are any events to return. */
if (!ret) {
return false;
}
/* Indicate direct set and note the event (cluster lock is held). */
l->l_selflag = SEL_EVENT;
l->l_selret += ret;
return true;
}
/*
* Do a wakeup when a selectable event occurs. Concurrency issues:
*
* As per selrecord(), the caller's object lock is held. If there
* is a named waiter, we must acquire the associated selcluster's lock
* in order to synchronize with selclear() and pollers going to sleep
* in sel_do_scan().
*
 *	sip->sel_cluster cannot change at this point, as it is only changed
* in selrecord(), and concurrent calls to selrecord() are locked
* out by the caller.
*/
void
selnotify(struct selinfo *sip, int events, long knhint)
{
selcluster_t *sc;
uint64_t mask;
int index, oflag;
lwp_t *l;
kmutex_t *lock;
	KNOTE(&sip->sel_klist, knhint);
	if (sip->sel_lwp != NULL) {
/* One named LWP is waiting. */
sc = sip->sel_cluster;
lock = sc->sc_lock;
mutex_spin_enter(lock);
/* Still there? */
if (sip->sel_lwp != NULL) {
/*
* Set the events for our LWP and indicate that.
* Otherwise, request for a full re-scan.
*/
l = sip->sel_lwp;
oflag = l->l_selflag;
if (!direct_select) {
				l->l_selflag = SEL_RESET;
			} else if (!sel_setevents(l, sip, events)) {
/* No events to return. */
mutex_spin_exit(lock);
return;
}
/*
* If thread is sleeping, wake it up. If it's not
* yet asleep, it will notice the change in state
* and will re-poll the descriptors.
*/
			if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
				KASSERT(l->l_wchan == sc);
sleepq_remove(l->l_sleepq, l, true);
}
}
mutex_spin_exit(lock);
}
if ((mask = sip->sel_collision) != 0) {
/*
* There was a collision (multiple waiters): we must
* inform all potentially interested waiters.
*/
sip->sel_collision = 0;
do {
index = ffs64(mask) - 1;
mask ^= __BIT(index);
sc = selcluster[index];
lock = sc->sc_lock;
mutex_spin_enter(lock);
sc->sc_ncoll++;
sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
} while (__predict_false(mask != 0));
}
}
/*
* Remove an LWP from all objects that it is waiting for. Concurrency
* issues:
*
* The object owner's (e.g. device driver) lock is not held here. Calls
* can be made to selrecord() and we do not synchronize against those
* directly using locks. However, we use `sel_lwp' to lock out changes.
* Before clearing it we must use memory barriers to ensure that we can
* safely traverse the list of selinfo records.
*/
static void
selclear(void)
{
struct selinfo *sip, *next;
selcluster_t *sc;
lwp_t *l;
kmutex_t *lock;
l = curlwp;
sc = l->l_selcluster;
lock = sc->sc_lock;
/*
* If the request was non-blocking, or we found events on the first
* descriptor, there will be no need to clear anything - avoid
* taking the lock.
*/
if (SLIST_EMPTY(&l->l_selwait)) {
return;
}
mutex_spin_enter(lock);
	for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
		KASSERT(sip->sel_lwp == l);
		KASSERT(sip->sel_cluster == l->l_selcluster);
/*
* Read link to next selinfo record, if any.
* It's no longer safe to touch `sip' after clearing
* `sel_lwp', so ensure that the read of `sel_chain'
* completes before the clearing of sel_lwp becomes
* globally visible.
*/
next = SLIST_NEXT(sip, sel_chain);
/* Release the record for another named waiter to use. */
atomic_store_release(&sip->sel_lwp, NULL);
}
mutex_spin_exit(lock);
}
/*
* Initialize the select/poll system calls. Called once for each
* CPU in the system, as they are attached.
*/
void
selsysinit(struct cpu_info *ci)
{
selcluster_t *sc;
u_int index;
/* If already a cluster in place for this bit, re-use. */
index = cpu_index(ci) & SELCLUSTERMASK;
sc = selcluster[index];
if (sc == NULL) {
sc = kmem_alloc(roundup2(sizeof(selcluster_t),
coherency_unit) + coherency_unit, KM_SLEEP);
sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
sleepq_init(&sc->sc_sleepq);
sc->sc_ncoll = 0;
sc->sc_mask = __BIT(index);
selcluster[index] = sc;
}
ci->ci_data.cpu_selcluster = sc;
}
/*
* Initialize a selinfo record.
*/
void
selinit(struct selinfo *sip)
{
memset(sip, 0, sizeof(*sip));
klist_init(&sip->sel_klist);
}
/*
* Destroy a selinfo record. The owning object must not gain new
* references while this is in progress: all activity on the record
* must be stopped.
*
* Concurrency issues: we only need guard against a call to selclear()
* by a thread exiting sel_do_scan(). The caller has prevented further
* references being made to the selinfo record via selrecord(), and it
* will not call selnotify() again.
*/
void
seldestroy(struct selinfo *sip)
{
selcluster_t *sc;
kmutex_t *lock;
lwp_t *l;
klist_fini(&sip->sel_klist);
if (sip->sel_lwp == NULL)
return;
/*
* Lock out selclear(). The selcluster pointer can't change while
* we are here since it is only ever changed in selrecord(),
* and that will not be entered again for this record because
* it is dying.
*/
KASSERT(sip->sel_cluster != NULL);
sc = sip->sel_cluster;
lock = sc->sc_lock;
mutex_spin_enter(lock);
if ((l = sip->sel_lwp) != NULL) {
/*
* This should rarely happen, so although SLIST_REMOVE()
* is slow, using it here is not a problem.
*/
		KASSERT(l->l_selcluster == sc);
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
sip->sel_lwp = NULL;
}
mutex_spin_exit(lock);
}
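/*
 * Illustrative lifecycle sketch only: selinit() at attach time,
 * seldestroy() at detach time once no further selrecord()/selnotify()
 * calls can reach the record.  example_softc and IPL_SOFTNET are
 * assumptions for the example.
 */
#if 0
static void
example_attach(struct example_softc *sc)
{

	mutex_init(&sc->sc_objlock, MUTEX_DEFAULT, IPL_SOFTNET);
	selinit(&sc->sc_rsel);
}

static void
example_detach(struct example_softc *sc)
{

	/* All readers and notifiers must be stopped by this point. */
	seldestroy(&sc->sc_rsel);
	mutex_destroy(&sc->sc_objlock);
}
#endif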
/*
* System control nodes.
*/
SYSCTL_SETUP(sysctl_select_setup, "sysctl select setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "direct_select",
SYSCTL_DESCR("Enable/disable direct select (for testing)"),
NULL, 0, &direct_select, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
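/*
 * The node above is created under CTL_KERN, so it should surface as
 * kern.direct_select.  Assuming that name, it can be toggled with
 * "sysctl -w kern.direct_select=1", or programmatically from userland
 * as sketched below (illustration only).
 */
#if 0
#include <sys/sysctl.h>

static int
example_enable_direct_select(void)
{
	int one = 1;

	return sysctlbyname("kern.direct_select", NULL, NULL,
	    &one, sizeof(one));
}
#endif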
/* $NetBSD: mount.h,v 1.16 2024/01/19 18:39:15 christos Exp $ */
/*
* Copyright (c) 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)mount.h 8.21 (Berkeley) 5/20/95
*/
#ifndef _COMPAT_SYS_MOUNT_H_
#define _COMPAT_SYS_MOUNT_H_
#ifdef _KERNEL_OPT
#include "opt_compat_43.h"
#endif
#define MFSNAMELEN 16
struct statfs12 {
short f_type; /* type of file system */
u_short f_oflags; /* deprecated copy of mount flags */
long f_bsize; /* fundamental file system block size */
long f_iosize; /* optimal transfer block size */
long f_blocks; /* total data blocks in file system */
long f_bfree; /* free blocks in fs */
long f_bavail; /* free blocks avail to non-superuser */
long f_files; /* total file nodes in file system */
long f_ffree; /* free file nodes in fs */
fsid_t f_fsid; /* file system id */
uid_t f_owner; /* user that mounted the file system */
long f_flags; /* copy of mount flags */
long f_syncwrites; /* count of sync writes since mount */
long f_asyncwrites; /* count of async writes since mount */
long f_spare[1]; /* spare for later */
char f_fstypename[MFSNAMELEN]; /* fs type name */
char f_mntonname[MNAMELEN]; /* directory on which mounted */
char f_mntfromname[MNAMELEN]; /* mounted file system */
};
#ifndef _KERNEL
#include <string.h>
#endif
/*
* Operations supported on mounted file system.
*/
/*
* Convert from a new statvfs to an old statfs structure.
*/
#define MOUNTNO_NONE 0
#define MOUNTNO_UFS 1 /* UNIX "Fast" Filesystem */
#define MOUNTNO_NFS 2 /* Network Filesystem */
#define MOUNTNO_MFS 3 /* Memory Filesystem */
#define MOUNTNO_MSDOS 4 /* MSDOS Filesystem */
#define MOUNTNO_CD9660 5 /* iso9660 cdrom */
#define MOUNTNO_FDESC 6 /* /dev/fd filesystem */
#define MOUNTNO_KERNFS 7 /* kernel variable filesystem */
#define MOUNTNO_DEVFS 8 /* device node filesystem */
#define MOUNTNO_AFS 9 /* AFS 3.x */
static const struct {
const char *name;
const int value;
} __nv[] = {
{ MOUNT_UFS, MOUNTNO_UFS },
{ MOUNT_NFS, MOUNTNO_NFS },
{ MOUNT_MFS, MOUNTNO_MFS },
{ MOUNT_MSDOS, MOUNTNO_MSDOS },
{ MOUNT_CD9660, MOUNTNO_CD9660 },
{ MOUNT_FDESC, MOUNTNO_FDESC },
{ MOUNT_KERNFS, MOUNTNO_KERNFS },
{ MOUNT_AFS, MOUNTNO_AFS },
};
static __inline void
statvfs_to_statfs12(const struct statvfs *fs, struct statfs12 *s12)
{
size_t i = 0;
memset(s12, 0, sizeof(*s12));
s12->f_type = 0;
s12->f_oflags = (short)fs->f_flag;
for (i = 0; i < sizeof(__nv) / sizeof(__nv[0]); i++) {
if (strcmp(__nv[i].name, fs->f_fstypename) == 0) {
s12->f_type = __nv[i].value;
break;
}
}
#define __STATFSCLAMP(a) (long)(((a) & ~LONG_MAX) ? LONG_MAX : (a))
s12->f_bsize = __STATFSCLAMP(fs->f_frsize);
s12->f_iosize = __STATFSCLAMP(fs->f_iosize);
s12->f_blocks = __STATFSCLAMP(fs->f_blocks);
s12->f_bfree = __STATFSCLAMP(fs->f_bfree);
if (fs->f_bfree > fs->f_bresvd)
s12->f_bavail = __STATFSCLAMP(fs->f_bfree - fs->f_bresvd);
else
s12->f_bavail = -__STATFSCLAMP(fs->f_bresvd - fs->f_bfree);
s12->f_files = __STATFSCLAMP(fs->f_files);
s12->f_ffree = __STATFSCLAMP(fs->f_ffree);
s12->f_fsid = fs->f_fsidx;
s12->f_owner = fs->f_owner;
s12->f_flags = (long)fs->f_flag;
s12->f_syncwrites = __STATFSCLAMP(fs->f_syncwrites);
s12->f_asyncwrites = __STATFSCLAMP(fs->f_asyncwrites);
memcpy(s12->f_fstypename, fs->f_fstypename, sizeof(s12->f_fstypename));
memcpy(s12->f_mntonname, fs->f_mntonname, sizeof(s12->f_mntonname));
memcpy(s12->f_mntfromname, fs->f_mntfromname,
sizeof(s12->f_mntfromname));
}
#ifdef _KERNEL
static __inline int
statvfs_to_statfs12_copy(const void *vs, void *vs12, size_t l)
{
struct statfs12 *s12 = kmem_zalloc(sizeof(*s12), KM_SLEEP);
int error;
statvfs_to_statfs12(vs, s12);
error = copyout(s12, vs12, sizeof(*s12));
kmem_free(s12, sizeof(*s12));
return error;
}
/*
* Filesystem configuration information. Not used by NetBSD, but
* defined here to provide a compatible sysctl interface to Lite2.
*/
struct vfsconf {
struct vfsops *vfc_vfsops; /* filesystem operations vector */
char vfc_name[MFSNAMELEN]; /* filesystem type name */
int vfc_typenum; /* historic filesystem type number */
int vfc_refcount; /* number mounted of this type */
int vfc_flags; /* permanent flags */
int (*vfc_mountroot)(void); /* if != NULL, routine to mount root */
struct vfsconf *vfc_next; /* next in list */
};
/* Old, fixed-size filehandle structures (used up to and including 3.x) */
struct compat_30_fid {
unsigned short fid_len;
unsigned short fid_reserved;
char fid_data[16];
};
struct compat_30_fhandle {
fsid_t fh_fsid;
struct compat_30_fid fh_fid;
};
#else
__BEGIN_DECLS
int __compat_fstatfs(int, struct statfs12 *) __dso_hidden;
int __compat_getfsstat(struct statfs12 *, long, int) __dso_hidden;
int __compat_statfs(const char *, struct statfs12 *) __dso_hidden;
int __compat_getmntinfo(struct statfs12 **, int) __dso_hidden;
#if defined(_NETBSD_SOURCE)
struct compat_30_fhandle;
int __compat_fhstatfs(const struct compat_30_fhandle *, struct statfs12 *)
__dso_hidden;
struct stat13;
int __compat_fhstat(const struct compat_30_fhandle *, struct stat13 *)
__dso_hidden;
struct stat30;
int __compat___fhstat30(const struct compat_30_fhandle *, struct stat30 *)
__dso_hidden;
int __compat___fhstat40(const void *, size_t, struct stat30 *) __dso_hidden;
struct stat;
int __fhstat50(const void *, size_t, struct stat *);
int __fhopen40(const void *, size_t, int);
int fhopen(const struct compat_30_fhandle *, int);
int __getfh30(const char *, void*, size_t *);
int getfh(const char *path, struct compat_30_fhandle *fhp);
int mount(const char *, const char *, int, void *);
int __mount50(const char *, const char *, int, void *, size_t);
#endif /* _NETBSD_SOURCE */
__END_DECLS
#endif /* _KERNEL */
#endif /* !_COMPAT_SYS_MOUNT_H_ */
/* $NetBSD: vfs_syscalls_90.c,v 1.1 2019/09/22 22:59:38 christos Exp $ */
/*-
* Copyright (c) 2005, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_90.c,v 1.1 2019/09/22 22:59:38 christos Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/socketvar.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/dirent.h>
#include <sys/malloc.h>
#include <sys/kauth.h>
#include <sys/vfs_syscalls.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <compat/common/compat_mod.h>
#include <compat/common/compat_util.h>
#include <compat/sys/statvfs.h>
static const struct syscall_package vfs_syscalls_90_syscalls[] = {
{ SYS_compat_90_getvfsstat, 0, (sy_call_t *)compat_90_sys_getvfsstat },
{ SYS_compat_90_statvfs1, 0, (sy_call_t *)compat_90_sys_statvfs1 },
{ SYS_compat_90_fstatvfs1, 0, (sy_call_t *)compat_90_sys_fstatvfs1 },
{ SYS_compat_90_fhstatvfs1, 0, (sy_call_t *)compat_90_sys_fhstatvfs1 },
{ 0,0, NULL }
};
int
compat_90_sys_getvfsstat(struct lwp *l,
const struct compat_90_sys_getvfsstat_args *uap, register_t *retval)
{
/* {
syscallarg(struct statvfs90 *) buf;
syscallarg(size_t) bufsize;
syscallarg(int) flags;
} */
return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
SCARG(uap, flags), statvfs_to_statvfs90_copy,
sizeof(struct statvfs90), retval);
}
int
compat_90_sys_statvfs1(struct lwp *l,
const struct compat_90_sys_statvfs1_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
syscallarg(struct statvfs90 *) buf;
syscallarg(int) flags;
} */
struct statvfs *sb = STATVFSBUF_GET();
int error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
if (!error)
error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf),
sizeof(struct statvfs90));
STATVFSBUF_PUT(sb);
return error;
}
int
compat_90_sys_fstatvfs1(struct lwp *l,
const struct compat_90_sys_fstatvfs1_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(struct statvfs90 *) buf;
syscallarg(int) flags;
} */
struct statvfs *sb = STATVFSBUF_GET();
int error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
if (!error)
error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf),
sizeof(struct statvfs90));
STATVFSBUF_PUT(sb);
return error;
}
int
compat_90_sys_fhstatvfs1(struct lwp *l,
const struct compat_90_sys_fhstatvfs1_args *uap, register_t *retval)
{
/* {
syscallarg(const void *) fhp;
syscallarg(size_t) fh_size;
syscallarg(struct statvfs90 *) buf;
syscallarg(int) flags;
} */
struct statvfs *sb = STATVFSBUF_GET();
int error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size),
sb, SCARG(uap, flags));
	if (!error)
		error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf),
sizeof(struct statvfs90));
STATVFSBUF_PUT(sb);
return error;
}
int
vfs_syscalls_90_init(void)
{
return syscall_establish(NULL, vfs_syscalls_90_syscalls);
}
int
vfs_syscalls_90_fini(void)
{
return syscall_disestablish(NULL, vfs_syscalls_90_syscalls);
}
/* $NetBSD: entpool.c,v 1.1 2020/04/30 03:28:19 riastradh Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Entropy pool (`reseedable pseudorandom number generator') based on a
* sponge duplex, following the design described and analyzed in
*
* Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van
* Assche, `Sponge-Based Pseudo-Random Number Generators', in
* Stefan Mangard and François-Xavier Standaert, eds.,
* Cryptographic Hardware and Embedded Systems—CHES 2010, Springer
* LNCS 6225, pp. 33–47.
* https://link.springer.com/chapter/10.1007/978-3-642-15031-9_3
* https://keccak.team/files/SpongePRNG.pdf
*
* Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van
* Assche, `Duplexing the Sponge: Single-Pass Authenticated
* Encryption and Other Applications', in Ali Miri and Serge
* Vaudenay, eds., Selected Areas in Cryptography—SAC 2011,
* Springer LNCS 7118, pp. 320–337.
* https://link.springer.com/chapter/10.1007/978-3-642-28496-0_19
* https://keccak.team/files/SpongeDuplex.pdf
*
* We make the following tweaks that don't affect security:
*
 * - Samples are prefixed with their length, encoded in a 7-bit
 *   variable-length form.
* The encoding is still injective, so the security theorems
* continue to apply.
*
* - Output is not buffered -- callers should draw 32 bytes and
* expand with a stream cipher. In effect, every output draws
* the full rate, and we just discard whatever the caller didn't
* ask for; the impact is only on performance, not security.
*
* On top of the underlying sponge state, an entropy pool maintains an
* integer i in [0, RATE-1] indicating where to write the next byte in
* the input buffer. Zeroing an entropy pool initializes it.
*/
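/*
 * Illustrative usage sketch only (not part of this file): zero a pool
 * to initialize it, feed samples in, then draw a 32-byte seed and
 * expand it with a stream cipher chosen by the caller.  The function
 * below is an assumption made for the example.
 */
#if 0
static void
example_entpool_use(const void *sample, size_t samplelen)
{
	struct entpool P;
	uint8_t seed[32];

	memset(&P, 0, sizeof P);		/* zeroing initializes */
	entpool_enter(&P, sample, samplelen);	/* absorb a sample */
	entpool_extract(&P, seed, sizeof seed);	/* draw a seed */
	/* ... key a stream cipher with seed to generate output ... */
	explicit_memset(seed, 0, sizeof seed);	/* do not leak the seed */
}
#endif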
#if defined(_KERNEL) || defined(_STANDALONE)
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: entpool.c,v 1.1 2020/04/30 03:28:19 riastradh Exp $");
#endif
#include "entpool.h"
#include ENTPOOL_HEADER
#if defined(_KERNEL) || defined(_STANDALONE)
#include <sys/types.h>
#include <lib/libkern/libkern.h>
#define ASSERT KASSERT
#else
#include <sys/cdefs.h>
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#define ASSERT assert
#define CTASSERT __CTASSERT
#endif
#define secret /* must not use in variable-time operations; should zero */
#define arraycount(A) (sizeof(A)/sizeof((A)[0]))
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define RATE ENTPOOL_RATE
/*
* stir(P)
*
* Internal subroutine to apply the sponge permutation to the
* state in P. Resets P->i to 0 to indicate that the input buffer
* is empty.
*/
static void
stir(struct entpool *P)
{
size_t i;
/*
* Switch to the permutation's byte order, if necessary, apply
	 * permutation, and then switch back.  This way we can move data
	 * in and out byte by byte, but get the same answers out of test
* vectors.
*/
for (i = 0; i < arraycount(P->s.w); i++)
P->s.w[i] = ENTPOOL_WTOH(P->s.w[i]);
ENTPOOL_PERMUTE(P->s.w);
for (i = 0; i < arraycount(P->s.w); i++)
P->s.w[i] = ENTPOOL_HTOW(P->s.w[i]);
/* Reset the input buffer. */
P->i = 0;
}
/*
* entpool_enter(P, buf, len)
*
* Enter len bytes from buf into the entropy pool P, stirring as
* needed. Corresponds to P.feed in the paper.
*/
void
entpool_enter(struct entpool *P, const void *buf, size_t len)
{
const uint8_t *p = buf;
size_t n = len, n1 = n;
/* Sanity-check P->i. */
ASSERT(P->i <= RATE-1);
/* Encode the length, stirring as needed. */
while (n1) {
if (P->i == RATE-1)
			stir(P);
		ASSERT(P->i < RATE-1);
P->s.u8[P->i++] ^= (n1 >= 0x80 ? 0x80 : 0) | (n1 & 0x7f);
n1 >>= 7;
}
/* Enter the sample, stirring as needed. */
while (n --> 0) {
if (P->i == RATE-1)
			stir(P);
		ASSERT(P->i < RATE-1);
P->s.u8[P->i++] ^= *p++;
}
/* If we filled the input buffer exactly, stir once more. */
if (P->i == RATE-1)
		stir(P);
	ASSERT(P->i < RATE-1);
}
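/*
 * Worked example of the length encoding used above: the length is
 * absorbed as a little-endian base-128 byte stream, with the top bit
 * of each byte flagging that more bytes follow.  A 1-byte sample
 * encodes its length as 0x01; a 300-byte sample encodes it as
 * 0xac 0x02 (300 = 0x2c + 2*128).  The standalone helper below mirrors
 * the loop above and exists only for illustration.
 */
#if 0
static size_t
example_encode_len(uint8_t *out, size_t n1)
{
	size_t k = 0;

	while (n1) {
		out[k++] = (n1 >= 0x80 ? 0x80 : 0) | (n1 & 0x7f);
		n1 >>= 7;
	}
	return k;
}
#endif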
/*
* entpool_enter_nostir(P, buf, len)
*
* Enter as many bytes as possible, up to len, from buf into the
* entropy pool P. Roughly corresponds to P.feed in the paper,
* but we stop if we would have run the permutation.
*
 *	Return true if the sample was consumed in its entirety, or false
 *	if the sample was truncated, in which case the caller should
 *	arrange to call entpool_stir when it is next convenient to do so.
*
* This function is cheap -- it only xors the input into the
* state, and never calls the underlying permutation, but it may
* truncate samples.
*/
bool
entpool_enter_nostir(struct entpool *P, const void *buf, size_t len)
{
const uint8_t *p = buf;
size_t n0, n;
/* Sanity-check P->i. */
ASSERT(P->i <= RATE-1);
/* If the input buffer is full, fail. */
if (P->i == RATE-1)
return false;
ASSERT(P->i < RATE-1);
/*
* Truncate the sample and enter it with 1-byte length encoding
* -- don't bother with variable-length encoding, not worth the
* trouble.
*/
n = n0 = MIN(127, MIN(len, RATE-1 - P->i - 1));
P->s.u8[P->i++] ^= n;
while (n --> 0)
P->s.u8[P->i++] ^= *p++;
/* Can't guarantee anything better than 0 <= i <= RATE-1. */
ASSERT(P->i <= RATE-1);
/* Return true if all done, false if truncated and in need of stir. */
return (n0 == len);
}
/*
* entpool_stir(P)
*
 *	Stir the entropy pool after entpool_enter_nostir fails.  If it
 *	has already been stirred, this has no effect.
*/
void
entpool_stir(struct entpool *P)
{
/* Sanity-check P->i. */
ASSERT(P->i <= RATE-1);
/* If the input buffer is full, stir. */
if (P->i == RATE-1)
stir(P);
ASSERT(P->i < RATE-1);
}
/*
* entpool_extract(P, buf, len)
*
* Extract len bytes from the entropy pool P into buf.
* Corresponds to iterating P.fetch/P.forget in the paper.
* (Feeding the output back in -- as P.forget does -- is the same
* as zeroing what we just read out.)
*/
void
entpool_extract(struct entpool *P, secret void *buf, size_t len)
{
uint8_t *p = buf;
size_t n = len;
/* Sanity-check P->i. */
ASSERT(P->i <= RATE-1);
/* If input buffer is not empty, stir. */
if (P->i != 0)
stir(P);
ASSERT(P->i == 0);
/*
* Copy out and zero (RATE-1)-sized chunks at a time, stirring
* with a bit set to distinguish this from inputs.
*/
while (n >= RATE-1) {
memcpy(p, P->s.u8, RATE-1);
memset(P->s.u8, 0, RATE-1);
P->s.u8[RATE-1] ^= 0x80;
stir(P);
p += RATE-1;
n -= RATE-1;
}
/*
* If there's anything left, copy out a partial rate's worth
* and zero the entire rate's worth, stirring with a bit set to
* distinguish this from inputs.
*/
if (n) {
ASSERT(n < RATE-1);
memcpy(p, P->s.u8, n); /* Copy part of it. */
memset(P->s.u8, 0, RATE-1); /* Zero all of it. */
P->s.u8[RATE-1] ^= 0x80;
stir(P);
}
}
/*
* Known-answer tests
*/
#if ENTPOOL_SMALL
#define KATLEN 15
/* Gimli */
static const uint8_t known_answers[][KATLEN] = {
[0] = {
0x69,0xb8,0x49,0x0d,0x39,0xfb,0x42,0x61,
0xf7,0x66,0xdf,0x04,0xb6,0xed,0x11,
},
[1] = {
0x74,0x15,0x16,0x49,0x31,0x07,0x77,0xa1,
0x3b,0x4d,0x78,0xc6,0x5d,0xef,0x87,
},
[2] = {
0xae,0xfd,0x7d,0xc4,0x3b,0xce,0x09,0x25,
0xbf,0x60,0x21,0x6e,0x3c,0x3a,0x84,
},
[3] = {
0xae,0xfd,0x7d,0xc4,0x3b,0xce,0x09,0x25,
0xbf,0x60,0x21,0x6e,0x3c,0x3a,0x84,
},
[4] = {
0x69,0xb8,0x49,0x0d,0x39,0xfb,0x42,0x61,
0xf7,0x66,0xdf,0x04,0xb6,0xed,0x11,
},
[5] = {
0xa9,0x3c,0x3c,0xac,0x5f,0x6d,0x80,0xdc,
0x33,0x0c,0xb2,0xe3,0xdd,0x55,0x31,
},
[6] = {
0x2e,0x69,0x1a,0x2a,0x2d,0x09,0xd4,0x5e,
0x49,0xcc,0x8c,0xb2,0x0b,0xcc,0x42,
},
[7] = {
0xae,0xfd,0x7d,0xc4,0x3b,0xce,0x09,0x25,
0xbf,0x60,0x21,0x6e,0x3c,0x3a,0x84,
},
[8] = {
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,
},
[9] = {
0x69,0xb8,0x49,0x0d,0x39,0xfb,0x42,0x61,
0xf7,0x66,0xdf,0x04,0xb6,0xed,0x11,
},
[10] = {
0x2e,0x69,0x1a,0x2a,0x2d,0x09,0xd4,0x5e,
0x49,0xcc,0x8c,0xb2,0x0b,0xcc,0x42,
},
[11] = {
0x6f,0xfd,0xd2,0x29,0x78,0x46,0xc0,0x7d,
0xc7,0xf2,0x0a,0x2b,0x72,0xd6,0xc6,
},
[12] = {
0x86,0xf0,0xc1,0xf9,0x95,0x0f,0xc9,0x12,
0xde,0x38,0x39,0x10,0x1f,0x8c,0xc4,
},
};
#else /* !ENTPOOL_SMALL */
#define KATLEN 16
/* Keccak-p[1600, 24] */
static const uint8_t known_answers[][KATLEN] = {
[0] = {
0x3b,0x20,0xf0,0xe9,0xce,0x94,0x48,0x07,
0x97,0xb6,0x16,0xb5,0xb5,0x05,0x1a,0xce,
},
[1] = {
0x57,0x49,0x6e,0x28,0x7f,0xaa,0xee,0x6c,
0xa8,0xb0,0xf5,0x0b,0x87,0xae,0xd6,0xd6,
},
[2] = {
0x51,0x72,0x0f,0x59,0x54,0xe1,0xaf,0xa8,
0x16,0x67,0xfa,0x3f,0x8a,0x19,0x52,0x50,
},
[3] = {
0x51,0x72,0x0f,0x59,0x54,0xe1,0xaf,0xa8,
0x16,0x67,0xfa,0x3f,0x8a,0x19,0x52,0x50,
},
[4] = {
0x3b,0x20,0xf0,0xe9,0xce,0x94,0x48,0x07,
0x97,0xb6,0x16,0xb5,0xb5,0x05,0x1a,0xce,
},
[5] = {
0x95,0x23,0x77,0xe4,0x84,0xeb,0xaa,0x2e,
0x6a,0x99,0xc2,0x52,0x06,0x6d,0xdf,0xea,
},
[6] = {
0x8c,0xdd,0x1b,0xaf,0x0e,0xf6,0xe9,0x1d,
0x51,0x33,0x68,0x38,0x8d,0xad,0x55,0x84,
},
[7] = {
0x51,0x72,0x0f,0x59,0x54,0xe1,0xaf,0xa8,
0x16,0x67,0xfa,0x3f,0x8a,0x19,0x52,0x50,
},
[8] = {
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
},
[9] = {
0x3b,0x20,0xf0,0xe9,0xce,0x94,0x48,0x07,
0x97,0xb6,0x16,0xb5,0xb5,0x05,0x1a,0xce,
},
[10] = {
0x8c,0xdd,0x1b,0xaf,0x0e,0xf6,0xe9,0x1d,
0x51,0x33,0x68,0x38,0x8d,0xad,0x55,0x84,
},
[11] = {
0xf6,0xc1,0x14,0xbb,0x13,0x0a,0xaf,0xed,
0xca,0x0b,0x35,0x2c,0xf1,0x2b,0x1a,0x85,
},
[12] = {
0xf9,0x4b,0x05,0xd1,0x8b,0xcd,0xb3,0xd0,
0x77,0x27,0xfe,0x46,0xf9,0x33,0xb2,0xa2,
},
};
#endif
#define KAT_BEGIN(P, n) memset(P, 0, sizeof(*(P)))
#define KAT_ERROR() return -1
#define KAT_END(P, n) do \
{ \
uint8_t KAT_ACTUAL[KATLEN]; \
entpool_extract(P, KAT_ACTUAL, KATLEN); \
if (memcmp(KAT_ACTUAL, known_answers[n], KATLEN)) \
return -1; \
} while (0)
int
entpool_selftest(void)
{
struct entpool pool, *P = &pool;
uint8_t sample[1] = {0xff};
uint8_t scratch[RATE];
const uint8_t zero[RATE] = {0};
/* Test entpool_enter with empty buffer. */
KAT_BEGIN(P, 0);
entpool_stir(P); /* noop */
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 0);
/* Test entpool_enter with partial buffer. */
KAT_BEGIN(P, 1);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
entpool_enter(P, zero, RATE-3);
#else
entpool_enter(P, zero, RATE-4);
#endif
entpool_stir(P); /* noop */
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 1);
/* Test entpool_enter with full buffer. */
KAT_BEGIN(P, 2);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 2);
/* Test entpool_enter with full buffer after stir. */
KAT_BEGIN(P, 3);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
CTASSERT(127 <= RATE-2);
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
entpool_stir(P);
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 3);
/* Test entpool_enter_nostir with empty buffer. */
KAT_BEGIN(P, 4);
entpool_stir(P); /* noop */
if (!entpool_enter_nostir(P, sample, 1))
KAT_ERROR();
entpool_stir(P); /* noop */
KAT_END(P, 4);
/* Test entpool_enter_nostir with partial buffer. */
KAT_BEGIN(P, 5);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
entpool_enter(P, zero, RATE-3);
#else
entpool_enter(P, zero, RATE-4);
#endif
entpool_stir(P); /* noop */
if (entpool_enter_nostir(P, sample, 1))
KAT_ERROR();
entpool_stir(P);
KAT_END(P, 5);
/* Test entpool_enter_nostir with full buffer. */
KAT_BEGIN(P, 6);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
CTASSERT(127 <= RATE-2);
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
if (entpool_enter_nostir(P, sample, 1))
KAT_ERROR();
entpool_stir(P);
KAT_END(P, 6);
/* Test entpool_enter_nostir with full buffer after stir. */
KAT_BEGIN(P, 7);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
CTASSERT(127 <= RATE-2);
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
entpool_stir(P);
if (!entpool_enter_nostir(P, sample, 1))
KAT_ERROR();
entpool_stir(P); /* noop */
KAT_END(P, 7);
/* Test entpool_extract with empty input buffer. */
KAT_BEGIN(P, 8);
entpool_stir(P); /* noop */
KAT_END(P, 8);
/* Test entpool_extract with nonempty input buffer. */
KAT_BEGIN(P, 9);
entpool_stir(P); /* noop */
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 9);
/* Test entpool_extract with full input buffer. */
KAT_BEGIN(P, 10);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
CTASSERT(127 <= RATE-2);
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
KAT_END(P, 10);
/* Test entpool_extract with iterated output. */
KAT_BEGIN(P, 11);
entpool_stir(P); /* noop */
entpool_extract(P, scratch, RATE-1 + 1);
entpool_stir(P); /* noop */
KAT_END(P, 11);
/* Test extract, enter, extract. */
KAT_BEGIN(P, 12);
entpool_stir(P); /* noop */
entpool_extract(P, scratch, 1);
entpool_stir(P); /* noop */
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 12);
return 0;
}
#if ENTPOOL_TEST
int
main(void)
{
return entpool_selftest();
}
#endif
/*
* Known-answer test generation
*
* This generates the known-answer test vectors from explicitly
* specified duplex inputs that correspond to what entpool_enter
* &c. induce, to confirm the encoding of inputs works as
* intended.
*/
#if ENTPOOL_GENKAT
#include <stdio.h>
struct event {
enum { IN, OUT, STOP } t;
uint8_t b[RATE-1];
};
/* Cases correspond to entpool_selftest above. */
static const struct event *const cases[] = {
[0] = (const struct event[]) {
{IN, {1, 0xff}},
{STOP, {0}},
},
[1] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-3, [RATE-2] = 1}},
#else
{IN, {0x80|((RATE-4)&0x7f), (RATE-4)>>7, [RATE-2] = 1}},
#endif
{IN, {0xff}},
{STOP, {0}},
},
[2] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{IN, {1, 0xff}},
{STOP, {0}},
},
[3] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{IN, {1, 0xff}},
{STOP, {0}},
},
[4] = (const struct event[]) {
{IN, {1, 0xff}},
{STOP, {0}},
},
[5] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-3, [RATE-2] = 0 /* truncated length */}},
#else
{IN, {0x80|((RATE-4)&0x7f), (RATE-4)>>7,
[RATE-2] = 0 /* truncated length */}},
#endif
{STOP, {0}},
},
[6] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{STOP, {0}},
},
[7] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{IN, {1, 0xff}},
{STOP, {0}},
},
[8] = (const struct event[]) {
{STOP, {0}},
},
[9] = (const struct event[]) {
{IN, {1, 0xff}},
{STOP, {0}},
},
[10] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{STOP, {0}},
},
[11] = (const struct event[]) {
{OUT, {0}},
{OUT, {0}},
{STOP, {0}},
},
[12] = (const struct event[]) {
{OUT, {0}},
{IN, {1, 0xff}},
{STOP, {0}},
},
};
static void
compute(uint8_t output[KATLEN], const struct event *events)
{
union {
uint8_t b[ENTPOOL_SIZE];
ENTPOOL_WORD w[ENTPOOL_SIZE/sizeof(ENTPOOL_WORD)];
} u;
unsigned i, j, k;
memset(&u.b, 0, sizeof u.b);
for (i = 0;; i++) {
if (events[i].t == STOP)
break;
for (j = 0; j < sizeof(events[i].b); j++)
u.b[j] ^= events[i].b[j];
if (events[i].t == OUT) {
memset(u.b, 0, RATE-1);
u.b[RATE-1] ^= 0x80;
}
for (k = 0; k < arraycount(u.w); k++)
u.w[k] = ENTPOOL_WTOH(u.w[k]);
ENTPOOL_PERMUTE(u.w);
for (k = 0; k < arraycount(u.w); k++)
u.w[k] = ENTPOOL_HTOW(u.w[k]);
}
for (j = 0; j < KATLEN; j++)
output[j] = u.b[j];
}
int
main(void)
{
uint8_t output[KATLEN];
unsigned i, j;
printf("static const uint8_t known_answers[][KATLEN] = {\n");
for (i = 0; i < arraycount(cases); i++) {
printf("\t[%u] = {\n", i);
compute(output, cases[i]);
for (j = 0; j < KATLEN; j++) {
if (j % 8 == 0)
printf("\t\t");
printf("0x%02hhx,", output[j]);
if (j % 8 == 7)
printf("\n");
}
if ((KATLEN % 8) != 0)
printf("\n");
printf("\t},\n");
}
printf("};\n");
fflush(stdout);
return ferror(stdout);
}
#endif
/* $NetBSD: if.c,v 1.529 2023/02/24 11:02:45 riastradh Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by William Studenmund and Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if.c 8.5 (Berkeley) 1/9/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if.c,v 1.529 2023/02/24 11:02:45 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_atalk.h"
#include "opt_wlan.h"
#include "opt_net_mpsafe.h"
#include "opt_mrouting.h"
#endif
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/ioctl.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/xcall.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/module_hook.h>
#include <sys/compat_stub.h>
#include <sys/msan.h>
#include <sys/hook.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_ether.h>
#include <net/if_media.h>
#include <net80211/ieee80211.h>
#include <net80211/ieee80211_ioctl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <sys/module.h>
#ifdef NETATALK
#include <netatalk/at_extern.h>
#include <netatalk/at.h>
#endif
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip_encap.h>
#include <net/bpf.h>
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#endif
#include "ether.h"
#include "bridge.h"
#if NBRIDGE > 0
#include <net/if_bridgevar.h>
#endif
#include "carp.h"
#if NCARP > 0
#include <netinet/ip_carp.h>
#endif
#include <compat/sys/sockio.h>
MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
/*
 * XXX reusing (ifp)->if_snd.ifq_lock rather than having another spin mutex
* for each ifnet. It doesn't matter because:
* - if IFEF_MPSAFE is enabled, if_snd isn't used and lock contentions on
* ifq_lock don't happen
* - if IFEF_MPSAFE is disabled, there is no lock contention on ifq_lock
* because if_snd, if_link_state_change and if_link_state_change_process
* are all called with KERNEL_LOCK
*/
#define IF_LINK_STATE_CHANGE_LOCK(ifp) \
mutex_enter((ifp)->if_snd.ifq_lock)
#define IF_LINK_STATE_CHANGE_UNLOCK(ifp) \
mutex_exit((ifp)->if_snd.ifq_lock)
/*
* Global list of interfaces.
*/
/* DEPRECATED. Remove it once kvm(3) users have disappeared. */
struct ifnet_head ifnet_list;
struct pslist_head ifnet_pslist;
static ifnet_t ** ifindex2ifnet = NULL;
static u_int if_index = 1;
static size_t if_indexlim = 0;
static uint64_t index_gen;
/* Mutex to protect the above objects. */
kmutex_t ifnet_mtx __cacheline_aligned;
static struct psref_class *ifnet_psref_class __read_mostly;
static pserialize_t ifnet_psz;
static struct workqueue *ifnet_link_state_wq __read_mostly;
static struct workqueue *if_slowtimo_wq __read_mostly;
static kmutex_t if_clone_mtx;
struct ifnet *lo0ifp;
int ifqmaxlen = IFQ_MAXLEN;
struct psref_class *ifa_psref_class __read_mostly;
static int if_delroute_matcher(struct rtentry *, void *);
static bool if_is_unit(const char *);
static struct if_clone *if_clone_lookup(const char *, int *);
static LIST_HEAD(, if_clone) if_cloners = LIST_HEAD_INITIALIZER(if_cloners);
static int if_cloners_count;
/* Packet filtering hook for interfaces. */
pfil_head_t * if_pfil __read_mostly;
static kauth_listener_t if_listener;
static int doifioctl(struct socket *, u_long, void *, struct lwp *);
static void sysctl_sndq_setup(struct sysctllog **, const char *,
struct ifaltq *);
static void if_slowtimo_intr(void *);
static void if_slowtimo_work(struct work *, void *);
static int sysctl_if_watchdog(SYSCTLFN_PROTO);
static void sysctl_watchdog_setup(struct ifnet *);
static void if_attachdomain1(struct ifnet *);
static int ifconf(u_long, void *);
static int if_transmit(struct ifnet *, struct mbuf *);
static int if_clone_create(const char *);
static int if_clone_destroy(const char *);
static void if_link_state_change_work(struct work *, void *);
static void if_up_locked(struct ifnet *);
static void _if_down(struct ifnet *);
static void if_down_deactivated(struct ifnet *);
struct if_percpuq {
struct ifnet *ipq_ifp;
void *ipq_si;
struct percpu *ipq_ifqs; /* struct ifqueue */
};
static struct mbuf *if_percpuq_dequeue(struct if_percpuq *);
static void if_percpuq_drops(void *, void *, struct cpu_info *);
static int sysctl_percpuq_drops_handler(SYSCTLFN_PROTO);
static void sysctl_percpuq_setup(struct sysctllog **, const char *,
struct if_percpuq *);
struct if_deferred_start {
struct ifnet *ids_ifp;
void (*ids_if_start)(struct ifnet *);
void *ids_si;
};
static void if_deferred_start_softint(void *);
static void if_deferred_start_common(struct ifnet *);
static void if_deferred_start_destroy(struct ifnet *);
struct if_slowtimo_data {
kmutex_t isd_lock;
struct callout isd_ch;
struct work isd_work;
struct ifnet *isd_ifp;
bool isd_queued;
bool isd_dying;
bool isd_trigger;
};
/*
* Hook for if_vlan - needed by if_agr
*/
struct if_vlan_vlan_input_hook_t if_vlan_vlan_input_hook;
static void if_sysctl_setup(struct sysctllog **);
static int
if_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
enum kauth_network_req req;
result = KAUTH_RESULT_DEFER;
req = (enum kauth_network_req)(uintptr_t)arg1;
if (action != KAUTH_NETWORK_INTERFACE)
return result;
if ((req == KAUTH_REQ_NETWORK_INTERFACE_GET) ||
(req == KAUTH_REQ_NETWORK_INTERFACE_SET))
result = KAUTH_RESULT_ALLOW;
return result;
}
/*
* Network interface utility routines.
*
* Routines with ifa_ifwith* names take sockaddr *'s as
* parameters.
*/
void
ifinit(void)
{
#if (defined(INET) || defined(INET6))
encapinit();
#endif
if_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
if_listener_cb, NULL);
/* interfaces are available, inform socket code */
ifioctl = doifioctl;
}
/*
* XXX Initialization before configure().
* XXX hack to get pfil_add_hook working in autoconf.
*/
void
ifinit1(void)
{
int error __diagused;
#ifdef NET_MPSAFE
printf("NET_MPSAFE enabled\n");
#endif
mutex_init(&if_clone_mtx, MUTEX_DEFAULT, IPL_NONE);
TAILQ_INIT(&ifnet_list);
mutex_init(&ifnet_mtx, MUTEX_DEFAULT, IPL_NONE);
ifnet_psz = pserialize_create();
ifnet_psref_class = psref_class_create("ifnet", IPL_SOFTNET);
ifa_psref_class = psref_class_create("ifa", IPL_SOFTNET);
error = workqueue_create(&ifnet_link_state_wq, "iflnkst",
if_link_state_change_work, NULL, PRI_SOFTNET, IPL_NET,
WQ_MPSAFE);
KASSERT(error == 0);
PSLIST_INIT(&ifnet_pslist);
error = workqueue_create(&if_slowtimo_wq, "ifwdog",
if_slowtimo_work, NULL, PRI_SOFTNET, IPL_SOFTCLOCK, WQ_MPSAFE);
KASSERTMSG(error == 0, "error=%d", error);
if_indexlim = 8;
if_pfil = pfil_head_create(PFIL_TYPE_IFNET, NULL);
KASSERT(if_pfil != NULL);
#if NETHER > 0 || defined(NETATALK) || defined(WLAN)
etherinit();
#endif
}
/* XXX must be after domaininit() */
void
ifinit_post(void)
{
if_sysctl_setup(NULL);
}
ifnet_t *
if_alloc(u_char type)
{
return kmem_zalloc(sizeof(ifnet_t), KM_SLEEP);
}
void
if_free(ifnet_t *ifp)
{
kmem_free(ifp, sizeof(ifnet_t));
}
void
if_initname(struct ifnet *ifp, const char *name, int unit)
{
(void)snprintf(ifp->if_xname, sizeof(ifp->if_xname),
"%s%d", name, unit);
}
/*
* Null routines used while an interface is going away. These routines
* just return an error.
*/
int
if_nulloutput(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *so, const struct rtentry *rt)
{
return ENXIO;
}
void
if_nullinput(struct ifnet *ifp, struct mbuf *m)
{
/* Nothing. */
}
void
if_nullstart(struct ifnet *ifp)
{
/* Nothing. */
}
int
if_nulltransmit(struct ifnet *ifp, struct mbuf *m)
{
m_freem(m);
return ENXIO;
}
int
if_nullioctl(struct ifnet *ifp, u_long cmd, void *data)
{
return ENXIO;
}
int
if_nullinit(struct ifnet *ifp)
{
return ENXIO;
}
void
if_nullstop(struct ifnet *ifp, int disable)
{
/* Nothing. */
}
void
if_nullslowtimo(struct ifnet *ifp)
{
/* Nothing. */
}
void
if_nulldrain(struct ifnet *ifp)
{
/* Nothing. */
}
void
if_set_sadl(struct ifnet *ifp, const void *lla, u_char addrlen, bool factory)
{
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
ifp->if_addrlen = addrlen;
if_alloc_sadl(ifp);
ifa = ifp->if_dl;
sdl = satosdl(ifa->ifa_addr);
(void)sockaddr_dl_setaddr(sdl, sdl->sdl_len, lla, ifp->if_addrlen);
if (factory) {
KASSERT(ifp->if_hwdl == NULL);
ifp->if_hwdl = ifp->if_dl;
ifaref(ifp->if_hwdl);
}
/* TBD routing socket */
}
struct ifaddr *
if_dl_create(const struct ifnet *ifp, const struct sockaddr_dl **sdlp)
{
unsigned socksize, ifasize;
int addrlen, namelen;
struct sockaddr_dl *mask, *sdl;
struct ifaddr *ifa;
namelen = strlen(ifp->if_xname);
addrlen = ifp->if_addrlen;
socksize = roundup(sockaddr_dl_measure(namelen, addrlen),
sizeof(long));
ifasize = sizeof(*ifa) + 2 * socksize;
ifa = malloc(ifasize, M_IFADDR, M_WAITOK | M_ZERO);
sdl = (struct sockaddr_dl *)(ifa + 1);
mask = (struct sockaddr_dl *)(socksize + (char *)sdl);
sockaddr_dl_init(sdl, socksize, ifp->if_index, ifp->if_type,
ifp->if_xname, namelen, NULL, addrlen);
mask->sdl_family = AF_LINK;
mask->sdl_len = sockaddr_dl_measure(namelen, 0);
memset(&mask->sdl_data[0], 0xff, namelen);
ifa->ifa_rtrequest = link_rtrequest;
ifa->ifa_addr = (struct sockaddr *)sdl;
ifa->ifa_netmask = (struct sockaddr *)mask;
ifa_psref_init(ifa);
*sdlp = sdl;
return ifa;
}
static void
if_sadl_setrefs(struct ifnet *ifp, struct ifaddr *ifa)
{
const struct sockaddr_dl *sdl;
ifp->if_dl = ifa;
ifaref(ifa);
sdl = satosdl(ifa->ifa_addr);
ifp->if_sadl = sdl;
}
/*
* Allocate the link level name for the specified interface. This
* is an attachment helper. It must be called after ifp->if_addrlen
* is initialized, which may not be the case when if_attach() is
* called.
*/
void
if_alloc_sadl(struct ifnet *ifp)
{
struct ifaddr *ifa;
const struct sockaddr_dl *sdl;
/*
* If the interface already has a link name, release it
* now. This is useful for interfaces that can change
* link types, and thus switch link names often.
*/
if (ifp->if_sadl != NULL)
if_free_sadl(ifp, 0);
ifa = if_dl_create(ifp, &sdl);
ifa_insert(ifp, ifa);
if_sadl_setrefs(ifp, ifa);
}
static void
if_deactivate_sadl(struct ifnet *ifp)
{
struct ifaddr *ifa;
KASSERT(ifp->if_dl != NULL);
ifa = ifp->if_dl;
ifp->if_sadl = NULL;
ifp->if_dl = NULL;
ifafree(ifa);
}
static void
if_replace_sadl(struct ifnet *ifp, struct ifaddr *ifa)
{
struct ifaddr *old;
KASSERT(ifp->if_dl != NULL);
old = ifp->if_dl;
ifaref(ifa);
/* XXX Update if_dl and if_sadl atomically */
ifp->if_dl = ifa;
ifp->if_sadl = satosdl(ifa->ifa_addr);
ifafree(old);
}
void
if_activate_sadl(struct ifnet *ifp, struct ifaddr *ifa0,
const struct sockaddr_dl *sdl)
{
struct ifaddr *ifa;
const int bound = curlwp_bind();
KASSERT(ifa_held(ifa0));
const int s = splsoftnet();
if_replace_sadl(ifp, ifa0);
int ss = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct psref psref;
ifa_acquire(ifa, &psref);
pserialize_read_exit(ss);
rtinit(ifa, RTM_LLINFO_UPD, 0);
ss = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(ss);
splx(s);
curlwp_bindx(bound);
}
/*
* Free the link level name for the specified interface. This is
* a detach helper. This is called from if_detach().
*/
void
if_free_sadl(struct ifnet *ifp, int factory)
{
struct ifaddr *ifa;
if (factory && ifp->if_hwdl != NULL) {
ifa = ifp->if_hwdl;
ifp->if_hwdl = NULL;
ifafree(ifa);
}
ifa = ifp->if_dl;
if (ifa == NULL) {
KASSERT(ifp->if_sadl == NULL);
return;
}
KASSERT(ifp->if_sadl != NULL);
const int s = splsoftnet();
KASSERT(ifa->ifa_addr->sa_family == AF_LINK);
ifa_remove(ifp, ifa);
if_deactivate_sadl(ifp);
splx(s);
}
static void
if_getindex(ifnet_t *ifp)
{
bool hitlimit = false;
char xnamebuf[HOOKNAMSIZ];
ifp->if_index_gen = index_gen++;
snprintf(xnamebuf, sizeof(xnamebuf), "%s-lshk", ifp->if_xname);
ifp->if_linkstate_hooks = simplehook_create(IPL_NET,
xnamebuf);
ifp->if_index = if_index;
if (ifindex2ifnet == NULL) {
if_index++;
goto skip;
}
while (if_byindex(ifp->if_index)) {
/*
* If we hit USHRT_MAX, we skip back to 0 since
* there are a number of places where the value
* of if_index or if_index itself is compared
* to or stored in an unsigned short. By
* jumping back, we won't botch those assignments
* or comparisons.
*/
if (++if_index == 0) {
if_index = 1;
} else if (if_index == USHRT_MAX) {
/*
			 * However, if we have to jump back to
			 * zero *twice* without finding an empty
			 * slot in ifindex2ifnet[], then there
			 * are too many (>65535) interfaces.
*/
if (hitlimit)
panic("too many interfaces");
hitlimit = true;
if_index = 1;
}
ifp->if_index = if_index;
}
skip:
/*
	 * ifindex2ifnet is indexed by if_index.  Since if_index can
	 * grow dynamically, the array must grow with it.
*/
if (ifindex2ifnet == NULL || ifp->if_index >= if_indexlim) {
size_t m, n, oldlim;
void *q;
oldlim = if_indexlim;
while (ifp->if_index >= if_indexlim)
if_indexlim <<= 1;
/* grow ifindex2ifnet */
m = oldlim * sizeof(struct ifnet *);
n = if_indexlim * sizeof(struct ifnet *);
q = malloc(n, M_IFADDR, M_WAITOK | M_ZERO);
if (ifindex2ifnet != NULL) {
memcpy(q, ifindex2ifnet, m);
free(ifindex2ifnet, M_IFADDR);
}
ifindex2ifnet = (struct ifnet **)q;
}
ifindex2ifnet[ifp->if_index] = ifp;
}
/*
* Initialize an interface and assign an index for it.
*
* It must be called prior to a device specific attach routine
* (e.g., ether_ifattach and ieee80211_ifattach) or if_alloc_sadl,
* and be followed by if_register:
*
* if_initialize(ifp);
* ether_ifattach(ifp, enaddr);
* if_register(ifp);
*/
void
if_initialize(ifnet_t *ifp)
{
KASSERT(if_indexlim > 0);
TAILQ_INIT(&ifp->if_addrlist);
/*
* Link level name is allocated later by a separate call to
* if_alloc_sadl().
*/
if (ifp->if_snd.ifq_maxlen == 0)
ifp->if_snd.ifq_maxlen = ifqmaxlen;
ifp->if_broadcastaddr = 0; /* reliably crash if used uninitialized */
ifp->if_link_state = LINK_STATE_UNKNOWN;
ifp->if_link_queue = -1; /* all bits set, see link_state_change() */
ifp->if_link_scheduled = false;
ifp->if_capenable = 0;
ifp->if_csum_flags_tx = 0;
ifp->if_csum_flags_rx = 0;
#ifdef ALTQ
ifp->if_snd.altq_type = 0;
ifp->if_snd.altq_disc = NULL;
ifp->if_snd.altq_flags &= ALTQF_CANTCHANGE;
ifp->if_snd.altq_tbr = NULL;
ifp->if_snd.altq_ifp = ifp;
#endif
IFQ_LOCK_INIT(&ifp->if_snd);
ifp->if_pfil = pfil_head_create(PFIL_TYPE_IFNET, ifp);
pfil_run_ifhooks(if_pfil, PFIL_IFNET_ATTACH, ifp);
IF_AFDATA_LOCK_INIT(ifp);
PSLIST_ENTRY_INIT(ifp, if_pslist_entry);
PSLIST_INIT(&ifp->if_addr_pslist);
psref_target_init(&ifp->if_psref, ifnet_psref_class);
ifp->if_ioctl_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&ifp->if_multiaddrs);
if_stats_init(ifp);
IFNET_GLOBAL_LOCK();
if_getindex(ifp);
IFNET_GLOBAL_UNLOCK();
}
/*
* Register an interface to the list of "active" interfaces.
*/
void
if_register(ifnet_t *ifp)
{
/*
* If the driver has not supplied its own if_ioctl or if_stop,
* then supply the default.
*/
if (ifp->if_ioctl == NULL)
ifp->if_ioctl = ifioctl_common;
if (ifp->if_stop == NULL)
ifp->if_stop = if_nullstop;
sysctl_sndq_setup(&ifp->if_sysctl_log, ifp->if_xname, &ifp->if_snd);
if (!STAILQ_EMPTY(&domains))
if_attachdomain1(ifp);
/* Announce the interface. */
rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
if (ifp->if_slowtimo != NULL) {
struct if_slowtimo_data *isd;
isd = kmem_zalloc(sizeof(*isd), KM_SLEEP);
mutex_init(&isd->isd_lock, MUTEX_DEFAULT, IPL_SOFTCLOCK);
callout_init(&isd->isd_ch, CALLOUT_MPSAFE);
callout_setfunc(&isd->isd_ch, if_slowtimo_intr, ifp);
isd->isd_ifp = ifp;
ifp->if_slowtimo_data = isd;
if_slowtimo_intr(ifp);
sysctl_watchdog_setup(ifp);
}
if (ifp->if_transmit == NULL || ifp->if_transmit == if_nulltransmit)
ifp->if_transmit = if_transmit;
IFNET_GLOBAL_LOCK();
TAILQ_INSERT_TAIL(&ifnet_list, ifp, if_list);
IFNET_WRITER_INSERT_TAIL(ifp);
IFNET_GLOBAL_UNLOCK();
}
/*
* The if_percpuq framework
*
* It allows network device drivers to execute the network stack
* in softint (so called softint-based if_input). It utilizes
* softint and percpu ifqueue. It doesn't distribute any packets
* between CPUs, unlike pktqueue(9).
*
 * Currently we support two options for device drivers to apply the framework:
 * - Use it implicitly with fewer changes
 *   - If you use if_attach in the driver's _attach function and if_input in
 *     the driver's Rx interrupt handler, a packet is queued and a softint
 *     handles the packet implicitly
 * - Use it explicitly in each driver (recommended)
 *   - You can use if_percpuq_* directly in your driver
 *   - In this case, you need to allocate struct if_percpuq in the driver's
 *     softc
 *   - See wm(4) as a reference implementation
*/
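/*
 * A minimal usage sketch of the explicit option, assuming a hypothetical
 * driver softc field sc_ipq (not taken from any real driver):
 *
 *	sc->sc_ipq = if_percpuq_create(ifp);	(in the attach routine,
 *						 after if_initialize)
 *	if_percpuq_enqueue(sc->sc_ipq, m);	(in the Rx interrupt handler,
 *						 instead of if_input)
 *	if_percpuq_destroy(sc->sc_ipq);		(in the detach routine)
 */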
static void
if_percpuq_softint(void *arg)
{
struct if_percpuq *ipq = arg;
struct ifnet *ifp = ipq->ipq_ifp;
struct mbuf *m;
while ((m = if_percpuq_dequeue(ipq)) != NULL) {
if_statinc(ifp, if_ipackets);
bpf_mtap(ifp, m, BPF_D_IN);
ifp->_if_input(ifp, m);
}
}
static void
if_percpuq_init_ifq(void *p, void *arg __unused, struct cpu_info *ci __unused)
{
struct ifqueue *const ifq = p;
memset(ifq, 0, sizeof(*ifq));
ifq->ifq_maxlen = IFQ_MAXLEN;
}
struct if_percpuq *
if_percpuq_create(struct ifnet *ifp)
{
struct if_percpuq *ipq;
u_int flags = SOFTINT_NET;
flags |= if_is_mpsafe(ifp) ? SOFTINT_MPSAFE : 0;
ipq = kmem_zalloc(sizeof(*ipq), KM_SLEEP);
ipq->ipq_ifp = ifp;
ipq->ipq_si = softint_establish(flags, if_percpuq_softint, ipq);
ipq->ipq_ifqs = percpu_alloc(sizeof(struct ifqueue));
percpu_foreach(ipq->ipq_ifqs, &if_percpuq_init_ifq, NULL);
sysctl_percpuq_setup(&ifp->if_sysctl_log, ifp->if_xname, ipq);
return ipq;
}
static struct mbuf *
if_percpuq_dequeue(struct if_percpuq *ipq)
{
struct mbuf *m;
struct ifqueue *ifq;
const int s = splnet();
ifq = percpu_getref(ipq->ipq_ifqs);
IF_DEQUEUE(ifq, m);
percpu_putref(ipq->ipq_ifqs);
splx(s);
return m;
}
static void
if_percpuq_purge_ifq(void *p, void *arg __unused, struct cpu_info *ci __unused)
{
struct ifqueue *const ifq = p;
IF_PURGE(ifq);
}
void
if_percpuq_destroy(struct if_percpuq *ipq)
{
/* if_detach may already destroy it */
if (ipq == NULL)
return;
softint_disestablish(ipq->ipq_si);
percpu_foreach(ipq->ipq_ifqs, &if_percpuq_purge_ifq, NULL);
percpu_free(ipq->ipq_ifqs, sizeof(struct ifqueue));
kmem_free(ipq, sizeof(*ipq));
}
void
if_percpuq_enqueue(struct if_percpuq *ipq, struct mbuf *m)
{
struct ifqueue *ifq;
KASSERT(ipq != NULL);
const int s = splnet();
ifq = percpu_getref(ipq->ipq_ifqs);
if (IF_QFULL(ifq)) {
IF_DROP(ifq);
percpu_putref(ipq->ipq_ifqs);
m_freem(m);
goto out;
}
IF_ENQUEUE(ifq, m);
percpu_putref(ipq->ipq_ifqs);
softint_schedule(ipq->ipq_si);
out:
splx(s);
}
static void
if_percpuq_drops(void *p, void *arg, struct cpu_info *ci __unused)
{
struct ifqueue *const ifq = p;
uint64_t *sum = arg;
*sum += ifq->ifq_drops;
}
static int
sysctl_percpuq_drops_handler(SYSCTLFN_ARGS)
{
struct sysctlnode node;
struct if_percpuq *ipq;
uint64_t sum = 0;
int error;
node = *rnode;
ipq = node.sysctl_data;
percpu_foreach(ipq->ipq_ifqs, if_percpuq_drops, &sum);
	node.sysctl_data = &sum;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error != 0 || newp == NULL)
return error;
return 0;
}
static void
sysctl_percpuq_setup(struct sysctllog **clog, const char* ifname,
struct if_percpuq *ipq)
{
const struct sysctlnode *cnode, *rnode;
if (sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "interfaces",
SYSCTL_DESCR("Per-interface controls"),
NULL, 0, NULL, 0,
CTL_NET, CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, ifname,
SYSCTL_DESCR("Interface controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "rcvq",
SYSCTL_DESCR("Interface input queue controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
#ifdef NOTYET
/* XXX Should show each per-CPU queue length? */
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "len",
SYSCTL_DESCR("Current input queue length"),
sysctl_percpuq_len, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "maxlen",
SYSCTL_DESCR("Maximum allowed input queue length"),
sysctl_percpuq_maxlen_handler, 0, (void *)ipq, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
#endif
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "drops",
SYSCTL_DESCR("Total packets dropped due to full input queue"),
sysctl_percpuq_drops_handler, 0, (void *)ipq, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
return;
bad:
printf("%s: could not attach sysctl nodes\n", ifname);
return;
}
/*
* The deferred if_start framework
*
 * These are the common APIs used to defer if_start to a softint when
 * if_start is requested from a device driver running in hardware
 * interrupt context.
*/
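/*
 * A minimal usage sketch, assuming a hypothetical driver that wants to kick
 * its transmit path from softint rather than from the hardware interrupt
 * handler (not taken from any real driver):
 *
 *	if_deferred_start_init(ifp, NULL);	(in attach; NULL selects the
 *						 default if_deferred_start_common)
 *	if_schedule_deferred_start(ifp);	(in the hardware interrupt
 *						 handler, instead of calling
 *						 if_start directly)
 */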
/*
* Call ifp->if_start (or equivalent) in a dedicated softint for
* deferred if_start.
*/
static void
if_deferred_start_softint(void *arg)
{
struct if_deferred_start *ids = arg;
struct ifnet *ifp = ids->ids_ifp;
ids->ids_if_start(ifp);
}
/*
* The default callback function for deferred if_start.
*/
static void
if_deferred_start_common(struct ifnet *ifp)
{
const int s = splnet();
if_start_lock(ifp);
splx(s);
}
static inline bool
if_snd_is_used(struct ifnet *ifp)
{
return ALTQ_IS_ENABLED(&ifp->if_snd) ||
ifp->if_transmit == if_transmit ||
ifp->if_transmit == NULL ||
ifp->if_transmit == if_nulltransmit;
}
/*
* Schedule deferred if_start.
*/
void
if_schedule_deferred_start(struct ifnet *ifp)
{
KASSERT(ifp->if_deferred_start != NULL);
if (if_snd_is_used(ifp) && IFQ_IS_EMPTY(&ifp->if_snd))
return;
softint_schedule(ifp->if_deferred_start->ids_si);
}
/*
 * Create an instance of deferred if_start.  A driver should call this
 * function only if it needs deferred if_start.  Drivers can set up their
 * own deferred if_start function via the second argument.
*/
void
if_deferred_start_init(struct ifnet *ifp, void (*func)(struct ifnet *))
{
struct if_deferred_start *ids;
u_int flags = SOFTINT_NET;
flags |= if_is_mpsafe(ifp) ? SOFTINT_MPSAFE : 0;
ids = kmem_zalloc(sizeof(*ids), KM_SLEEP);
ids->ids_ifp = ifp;
ids->ids_si = softint_establish(flags, if_deferred_start_softint, ids);
if (func != NULL)
ids->ids_if_start = func;
else
ids->ids_if_start = if_deferred_start_common;
ifp->if_deferred_start = ids;
}
static void
if_deferred_start_destroy(struct ifnet *ifp)
{
if (ifp->if_deferred_start == NULL)
return;
softint_disestablish(ifp->if_deferred_start->ids_si);
kmem_free(ifp->if_deferred_start, sizeof(*ifp->if_deferred_start));
ifp->if_deferred_start = NULL;
}
/*
* The common interface input routine that is called by device drivers,
* which should be used only when the driver's rx handler already runs
* in softint.
*/
void
if_input(struct ifnet *ifp, struct mbuf *m)
{
KASSERT(ifp->if_percpuq == NULL);
KASSERT(!cpu_intr_p());
if_statinc(ifp, if_ipackets);
bpf_mtap(ifp, m, BPF_D_IN);
ifp->_if_input(ifp, m);
}
/*
* DEPRECATED. Use if_initialize and if_register instead.
 * See the comment above if_initialize.
 *
 * Note that it implicitly enables if_percpuq so that drivers can migrate
 * to softint-based if_input with few changes.  If you don't want to
 * enable it, use if_initialize and if_register instead.
*/
void
if_attach(ifnet_t *ifp)
{
if_initialize(ifp);
ifp->if_percpuq = if_percpuq_create(ifp);
if_register(ifp);
}
void
if_attachdomain(void)
{
struct ifnet *ifp;
const int bound = curlwp_bind();
int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
struct psref psref;
psref_acquire(&psref, &ifp->if_psref, ifnet_psref_class);
pserialize_read_exit(s);
if_attachdomain1(ifp);
s = pserialize_read_enter();
psref_release(&psref, &ifp->if_psref, ifnet_psref_class);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
}
static void
if_attachdomain1(struct ifnet *ifp)
{
struct domain *dp;
const int s = splsoftnet();
/* address family dependent data region */
memset(ifp->if_afdata, 0, sizeof(ifp->if_afdata));
DOMAIN_FOREACH(dp) {
if (dp->dom_ifattach != NULL)
ifp->if_afdata[dp->dom_family] =
(*dp->dom_ifattach)(ifp);
}
splx(s);
}
/*
* Deactivate an interface. This points all of the procedure
* handles at error stubs. May be called from interrupt context.
*/
void
if_deactivate(struct ifnet *ifp)
{
const int s = splsoftnet();
ifp->if_output = if_nulloutput;
ifp->_if_input = if_nullinput;
ifp->if_start = if_nullstart;
ifp->if_transmit = if_nulltransmit;
ifp->if_ioctl = if_nullioctl;
ifp->if_init = if_nullinit;
ifp->if_stop = if_nullstop;
if (ifp->if_slowtimo)
ifp->if_slowtimo = if_nullslowtimo;
ifp->if_drain = if_nulldrain;
/* No more packets may be enqueued. */
ifp->if_snd.ifq_maxlen = 0;
splx(s);
}
bool
if_is_deactivated(const struct ifnet *ifp)
{
return ifp->if_output == if_nulloutput;
}
void
if_purgeaddrs(struct ifnet *ifp, int family,
void (*purgeaddr)(struct ifaddr *))
{
struct ifaddr *ifa, *nifa;
int s;
s = pserialize_read_enter();
for (ifa = IFADDR_READER_FIRST(ifp); ifa; ifa = nifa) {
nifa = IFADDR_READER_NEXT(ifa);
if (ifa->ifa_addr->sa_family != family)
continue;
pserialize_read_exit(s);
(*purgeaddr)(ifa);
s = pserialize_read_enter();
}
pserialize_read_exit(s);
}
#ifdef IFAREF_DEBUG
static struct ifaddr **ifa_list;
static int ifa_list_size;
/* Depends on the assumption that only one if_attach runs at a time */
static void
if_build_ifa_list(struct ifnet *ifp)
{
struct ifaddr *ifa;
int i;
KASSERT(ifa_list == NULL);
KASSERT(ifa_list_size == 0);
IFADDR_READER_FOREACH(ifa, ifp)
ifa_list_size++;
ifa_list = kmem_alloc(sizeof(*ifa) * ifa_list_size, KM_SLEEP);
i = 0;
IFADDR_READER_FOREACH(ifa, ifp) {
ifa_list[i++] = ifa;
ifaref(ifa);
}
}
static void
if_check_and_free_ifa_list(struct ifnet *ifp)
{
int i;
struct ifaddr *ifa;
if (ifa_list == NULL)
return;
for (i = 0; i < ifa_list_size; i++) {
char buf[64];
ifa = ifa_list[i];
sockaddr_format(ifa->ifa_addr, buf, sizeof(buf));
if (ifa->ifa_refcnt > 1) {
log(LOG_WARNING,
"ifa(%s) still referenced (refcnt=%d)\n",
buf, ifa->ifa_refcnt - 1);
} else
log(LOG_DEBUG,
"ifa(%s) not referenced (refcnt=%d)\n",
buf, ifa->ifa_refcnt - 1);
ifafree(ifa);
}
kmem_free(ifa_list, sizeof(*ifa) * ifa_list_size);
ifa_list = NULL;
ifa_list_size = 0;
}
#endif
/*
* Detach an interface from the list of "active" interfaces,
* freeing any resources as we go along.
*
* NOTE: This routine must be called with a valid thread context,
* as it may block.
*/
void
if_detach(struct ifnet *ifp)
{
struct socket so;
struct ifaddr *ifa;
#ifdef IFAREF_DEBUG
struct ifaddr *last_ifa = NULL;
#endif
struct domain *dp;
const struct protosw *pr;
int i, family, purged;
#ifdef IFAREF_DEBUG
if_build_ifa_list(ifp);
#endif
/*
* XXX It's kind of lame that we have to have the
* XXX socket structure...
*/
memset(&so, 0, sizeof(so));
const int s = splnet();
sysctl_teardown(&ifp->if_sysctl_log);
IFNET_LOCK(ifp);
/*
* Unset all queued link states and pretend a
* link state change is scheduled.
* This stops any more link state changes occurring for this
* interface while it's being detached so it's safe
* to drain the workqueue.
*/
IF_LINK_STATE_CHANGE_LOCK(ifp);
ifp->if_link_queue = -1; /* all bits set, see link_state_change() */
ifp->if_link_scheduled = true;
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
workqueue_wait(ifnet_link_state_wq, &ifp->if_link_work);
if_deactivate(ifp);
IFNET_UNLOCK(ifp);
/*
* Unlink from the list and wait for all readers to leave
* from pserialize read sections. Note that we can't do
* psref_target_destroy here. See below.
*/
IFNET_GLOBAL_LOCK();
ifindex2ifnet[ifp->if_index] = NULL;
TAILQ_REMOVE(&ifnet_list, ifp, if_list);
IFNET_WRITER_REMOVE(ifp);
pserialize_perform(ifnet_psz);
IFNET_GLOBAL_UNLOCK();
if (ifp->if_slowtimo != NULL) {
struct if_slowtimo_data *isd = ifp->if_slowtimo_data;
mutex_enter(&isd->isd_lock);
isd->isd_dying = true;
mutex_exit(&isd->isd_lock);
callout_halt(&isd->isd_ch, NULL);
workqueue_wait(if_slowtimo_wq, &isd->isd_work);
callout_destroy(&isd->isd_ch);
mutex_destroy(&isd->isd_lock);
kmem_free(isd, sizeof(*isd));
		ifp->if_slowtimo_data = NULL; /* paranoia */
ifp->if_slowtimo = NULL; /* paranoia */
}
if_deferred_start_destroy(ifp);
/*
* Do an if_down() to give protocols a chance to do something.
*/
if_down_deactivated(ifp);
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
altq_disable(&ifp->if_snd);
if (ALTQ_IS_ATTACHED(&ifp->if_snd))
altq_detach(&ifp->if_snd);
#endif
#if NCARP > 0
/* Remove the interface from any carp group it is a part of. */
if (ifp->if_carp != NULL && ifp->if_type != IFT_CARP)
carp_ifdetach(ifp);
#endif
/*
* Ensure that all packets on protocol input pktqueues have been
* processed, or, at least, removed from the queues.
*
* A cross-call will ensure that the interrupts have completed.
* FIXME: not quite..
*/
pktq_ifdetach();
xc_barrier(0);
/*
* Rip all the addresses off the interface. This should make
* all of the routes go away.
*
* pr_usrreq calls can remove an arbitrary number of ifaddrs
* from the list, including our "cursor", ifa. For safety,
* and to honor the TAILQ abstraction, I just restart the
* loop after each removal. Note that the loop will exit
* when all of the remaining ifaddrs belong to the AF_LINK
* family. I am counting on the historical fact that at
* least one pr_usrreq in each address domain removes at
* least one ifaddr.
*/
again:
/*
	 * At this point, nothing else tries to remove ifaddrs from the
	 * list, so we don't need to take a lock or psref.  Avoid using
	 * IFADDR_READER_FOREACH so that pserialize's contract-violation
	 * checks are not tripped.
*/
IFADDR_WRITER_FOREACH(ifa, ifp) {
family = ifa->ifa_addr->sa_family;
#ifdef IFAREF_DEBUG
printf("if_detach: ifaddr %p, family %d, refcnt %d\n",
ifa, family, ifa->ifa_refcnt);
if (last_ifa != NULL && ifa == last_ifa)
panic("if_detach: loop detected");
last_ifa = ifa;
#endif
if (family == AF_LINK)
continue;
dp = pffinddomain(family);
KASSERTMSG(dp != NULL, "no domain for AF %d", family);
/*
* XXX These PURGEIF calls are redundant with the
* purge-all-families calls below, but are left in for
* now both to make a smaller change, and to avoid
* unplanned interactions with clearing of
* ifp->if_addrlist.
*/
purged = 0;
for (pr = dp->dom_protosw;
pr < dp->dom_protoswNPROTOSW; pr++) {
so.so_proto = pr;
if (pr->pr_usrreqs) {
(void) (*pr->pr_usrreqs->pr_purgeif)(&so, ifp);
purged = 1;
}
}
if (purged == 0) {
/*
* XXX What's really the best thing to do
* XXX here? --thorpej@NetBSD.org
*/
printf("if_detach: WARNING: AF %d not purged\n",
family);
ifa_remove(ifp, ifa);
}
goto again;
}
if_free_sadl(ifp, 1);
restart:
IFADDR_WRITER_FOREACH(ifa, ifp) {
family = ifa->ifa_addr->sa_family;
KASSERT(family == AF_LINK);
ifa_remove(ifp, ifa);
goto restart;
}
/* Delete stray routes from the routing table. */
for (i = 0; i <= AF_MAX; i++)
rt_delete_matched_entries(i, if_delroute_matcher, ifp, false);
DOMAIN_FOREACH(dp) {
if (dp->dom_ifdetach != NULL && ifp->if_afdata[dp->dom_family])
{
void *p = ifp->if_afdata[dp->dom_family];
if (p) {
ifp->if_afdata[dp->dom_family] = NULL;
(*dp->dom_ifdetach)(ifp, p);
}
}
/*
* One would expect multicast memberships (INET and
* INET6) on UDP sockets to be purged by the PURGEIF
* calls above, but if all addresses were removed from
* the interface prior to destruction, the calls will
* not be made (e.g. ppp, for which pppd(8) generally
* removes addresses before destroying the interface).
* Because there is no invariant that multicast
* memberships only exist for interfaces with IPv4
* addresses, we must call PURGEIF regardless of
* addresses. (Protocols which might store ifnet
* pointers are marked with PR_PURGEIF.)
*/
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
{
so.so_proto = pr;
if (pr->pr_usrreqs && pr->pr_flags & PR_PURGEIF)
(void)(*pr->pr_usrreqs->pr_purgeif)(&so, ifp);
}
}
/*
* Must be done after the above pr_purgeif because if_psref may be
* still used in pr_purgeif.
*/
psref_target_destroy(&ifp->if_psref, ifnet_psref_class);
PSLIST_ENTRY_DESTROY(ifp, if_pslist_entry);
pfil_run_ifhooks(if_pfil, PFIL_IFNET_DETACH, ifp);
(void)pfil_head_destroy(ifp->if_pfil);
/* Announce that the interface is gone. */
rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
IF_AFDATA_LOCK_DESTROY(ifp);
if (ifp->if_percpuq != NULL) {
if_percpuq_destroy(ifp->if_percpuq);
ifp->if_percpuq = NULL;
}
mutex_obj_free(ifp->if_ioctl_lock);
ifp->if_ioctl_lock = NULL;
mutex_obj_free(ifp->if_snd.ifq_lock);
if_stats_fini(ifp);
KASSERT(!simplehook_has_hooks(ifp->if_linkstate_hooks));
simplehook_destroy(ifp->if_linkstate_hooks);
splx(s);
#ifdef IFAREF_DEBUG
if_check_and_free_ifa_list(ifp);
#endif
}
/*
* Callback for a radix tree walk to delete all references to an
* ifnet.
*/
static int
if_delroute_matcher(struct rtentry *rt, void *v)
{
struct ifnet *ifp = (struct ifnet *)v;
if (rt->rt_ifp == ifp)
return 1;
else
return 0;
}
/*
* Create a clone network interface.
*/
static int
if_clone_create(const char *name)
{
struct if_clone *ifc;
struct ifnet *ifp;
struct psref psref;
int unit;
KASSERT(mutex_owned(&if_clone_mtx));
ifc = if_clone_lookup(name, &unit);
if (ifc == NULL)
return EINVAL;
ifp = if_get(name, &psref);
if (ifp != NULL) {
if_put(ifp, &psref);
return EEXIST;
}
return (*ifc->ifc_create)(ifc, unit);
}
/*
* Destroy a clone network interface.
*/
static int
if_clone_destroy(const char *name)
{
struct if_clone *ifc;
struct ifnet *ifp;
struct psref psref;
int error;
int (*if_ioctlfn)(struct ifnet *, u_long, void *);
KASSERT(mutex_owned(&if_clone_mtx));
ifc = if_clone_lookup(name, NULL);
if (ifc == NULL)
return EINVAL;
if (ifc->ifc_destroy == NULL)
return EOPNOTSUPP;
ifp = if_get(name, &psref);
if (ifp == NULL)
return ENXIO;
/* We have to disable ioctls here */
IFNET_LOCK(ifp);
if_ioctlfn = ifp->if_ioctl;
ifp->if_ioctl = if_nullioctl;
IFNET_UNLOCK(ifp);
/*
	 * We cannot call ifc_destroy while holding a reference to ifp.
* Releasing ifp here is safe thanks to if_clone_mtx.
*/
if_put(ifp, &psref);
error = (*ifc->ifc_destroy)(ifp);
if (error != 0) {
/* We have to restore if_ioctl on error */
IFNET_LOCK(ifp);
ifp->if_ioctl = if_ioctlfn;
IFNET_UNLOCK(ifp);
}
return error;
}
static bool
if_is_unit(const char *name)
{
while (*name != '\0') {
if (*name < '0' || *name > '9')
return false;
name++;
}
return true;
}
/*
* Look up a network interface cloner.
*/
static struct if_clone *
if_clone_lookup(const char *name, int *unitp)
{
struct if_clone *ifc;
const char *cp;
char *dp, ifname[IFNAMSIZ + 3];
int unit;
KASSERT(mutex_owned(&if_clone_mtx));
strcpy(ifname, "if_");
/* separate interface name from unit */
	/* TODO: search for the unit number from the end of the name */
	for (dp = ifname + 3, cp = name; cp - name < IFNAMSIZ &&
	    *cp && !if_is_unit(cp);)
		*dp++ = *cp++;
if (cp == name || cp - name == IFNAMSIZ || !*cp)
return NULL; /* No name or unit number */
*dp++ = '\0';
again:
LIST_FOREACH(ifc, &if_cloners, ifc_list) {
if (strcmp(ifname + 3, ifc->ifc_name) == 0)
break;
}
if (ifc == NULL) {
int error;
if (*ifname == '\0')
return NULL;
mutex_exit(&if_clone_mtx);
error = module_autoload(ifname, MODULE_CLASS_DRIVER);
mutex_enter(&if_clone_mtx);
if (error)
return NULL;
*ifname = '\0';
goto again;
}
unit = 0;
	while (cp - name < IFNAMSIZ && *cp) {
		if (*cp < '0' || *cp > '9' || unit >= INT_MAX / 10) {
			/* Bogus unit number. */
			return NULL;
		}
		unit = (unit * 10) + (*cp++ - '0');
	}
	if (unitp != NULL)
		*unitp = unit;
return ifc;
}
/*
* Register a network interface cloner.
*/
void
if_clone_attach(struct if_clone *ifc)
{
mutex_enter(&if_clone_mtx);
LIST_INSERT_HEAD(&if_cloners, ifc, ifc_list);
if_cloners_count++;
mutex_exit(&if_clone_mtx);
}
/*
* Unregister a network interface cloner.
*/
void
if_clone_detach(struct if_clone *ifc)
{
mutex_enter(&if_clone_mtx);
LIST_REMOVE(ifc, ifc_list);
if_cloners_count--;
mutex_exit(&if_clone_mtx);
}
/*
* Provide list of interface cloners to userspace.
*/
int
if_clone_list(int buf_count, char *buffer, int *total)
{
char outbuf[IFNAMSIZ], *dst;
struct if_clone *ifc;
int count, error = 0;
mutex_enter(&if_clone_mtx);
*total = if_cloners_count;
if ((dst = buffer) == NULL) {
/* Just asking how many there are. */
goto out;
}
if (buf_count < 0) {
error = EINVAL;
goto out;
}
count = (if_cloners_count < buf_count) ? if_cloners_count : buf_count;
for (ifc = LIST_FIRST(&if_cloners); ifc != NULL && count != 0;
ifc = LIST_NEXT(ifc, ifc_list), count--, dst += IFNAMSIZ) {
(void)strncpy(outbuf, ifc->ifc_name, sizeof(outbuf));
if (outbuf[sizeof(outbuf) - 1] != '\0') {
error = ENAMETOOLONG;
goto out;
}
error = copyout(outbuf, dst, sizeof(outbuf));
if (error != 0)
break;
}
out:
mutex_exit(&if_clone_mtx);
return error;
}
void
ifa_psref_init(struct ifaddr *ifa)
{
psref_target_init(&ifa->ifa_psref, ifa_psref_class);
}
void
ifaref(struct ifaddr *ifa)
{
atomic_inc_uint(&ifa->ifa_refcnt);
}
void
ifafree(struct ifaddr *ifa)
{
KASSERT(ifa != NULL);
KASSERTMSG(ifa->ifa_refcnt > 0, "ifa_refcnt=%d", ifa->ifa_refcnt);
membar_release();
if (atomic_dec_uint_nv(&ifa->ifa_refcnt) != 0)
return;
membar_acquire();
free(ifa, M_IFADDR);
}
bool
ifa_is_destroying(struct ifaddr *ifa)
{
return ISSET(ifa->ifa_flags, IFA_DESTROYING);
}
void
ifa_insert(struct ifnet *ifp, struct ifaddr *ifa)
{
ifa->ifa_ifp = ifp;
/*
* Check MP-safety for IFEF_MPSAFE drivers.
	 * Allow !IFF_RUNNING for initialization routines that normally don't
	 * take IFNET_LOCK; that is safe because there is no competitor yet.
* XXX there are false positive cases because IFF_RUNNING can be off on
* if_stop.
*/
KASSERT(!if_is_mpsafe(ifp) || !ISSET(ifp->if_flags, IFF_RUNNING) ||
IFNET_LOCKED(ifp));
TAILQ_INSERT_TAIL(&ifp->if_addrlist, ifa, ifa_list);
IFADDR_ENTRY_INIT(ifa);
IFADDR_WRITER_INSERT_TAIL(ifp, ifa);
ifaref(ifa);
}
void
ifa_remove(struct ifnet *ifp, struct ifaddr *ifa)
{
KASSERT(ifa->ifa_ifp == ifp);
/*
* Check MP-safety for IFEF_MPSAFE drivers.
* if_is_deactivated indicates ifa_remove is called from if_detach
* where it is safe even if IFNET_LOCK isn't held.
*/
KASSERT(!if_is_mpsafe(ifp) || if_is_deactivated(ifp) ||
IFNET_LOCKED(ifp));
TAILQ_REMOVE(&ifp->if_addrlist, ifa, ifa_list);
IFADDR_WRITER_REMOVE(ifa);
#ifdef NET_MPSAFE
IFNET_GLOBAL_LOCK();
pserialize_perform(ifnet_psz);
IFNET_GLOBAL_UNLOCK();
#endif
#ifdef NET_MPSAFE
psref_target_destroy(&ifa->ifa_psref, ifa_psref_class);
#endif
IFADDR_ENTRY_DESTROY(ifa);
ifafree(ifa);
}
void
ifa_acquire(struct ifaddr *ifa, struct psref *psref)
{
PSREF_DEBUG_FILL_RETURN_ADDRESS(psref);
psref_acquire(psref, &ifa->ifa_psref, ifa_psref_class);
}
void
ifa_release(struct ifaddr *ifa, struct psref *psref)
{
if (ifa == NULL)
return;
psref_release(psref, &ifa->ifa_psref, ifa_psref_class);
}
bool
ifa_held(struct ifaddr *ifa)
{
return psref_held(&ifa->ifa_psref, ifa_psref_class);
}
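/*
 * A minimal sketch of the lookup pattern these accessors support, mirroring
 * what the *_psref wrappers below do (caller in thread context):
 *
 *	struct ifaddr *ifa;
 *	struct psref psref;
 *	int s;
 *
 *	s = pserialize_read_enter();
 *	ifa = ifa_ifwithaddr(addr);
 *	if (ifa != NULL)
 *		ifa_acquire(ifa, &psref);
 *	pserialize_read_exit(s);
 *	if (ifa != NULL) {
 *		... use ifa ...
 *		ifa_release(ifa, &psref);
 *	}
 */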
static inline int
equal(const struct sockaddr *sa1, const struct sockaddr *sa2)
{
return sockaddr_cmp(sa1, sa2) == 0;
}
/*
* Locate an interface based on a complete address.
*/
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithaddr(const struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa;
	IFNET_READER_FOREACH(ifp) {
		if (if_is_deactivated(ifp))
			continue;
		IFADDR_READER_FOREACH(ifa, ifp) {
			if (ifa->ifa_addr->sa_family != addr->sa_family)
				continue;
if (equal(addr, ifa->ifa_addr))
return ifa;
if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr &&
/* IP6 doesn't have broadcast */
ifa->ifa_broadaddr->sa_len != 0 &&
equal(ifa->ifa_broadaddr, addr))
return ifa;
}
}
return NULL;
}
struct ifaddr *
ifa_ifwithaddr_psref(const struct sockaddr *addr, struct psref *psref)
{
struct ifaddr *ifa;
int s = pserialize_read_enter();
ifa = ifa_ifwithaddr(addr);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Locate the point to point interface with a given destination address.
*/
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithdstaddr(const struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
continue;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != addr->sa_family ||
ifa->ifa_dstaddr == NULL)
continue;
if (equal(addr, ifa->ifa_dstaddr))
return ifa;
}
}
return NULL;
}
struct ifaddr *
ifa_ifwithdstaddr_psref(const struct sockaddr *addr, struct psref *psref)
{
struct ifaddr *ifa;
int s;
s = pserialize_read_enter();
ifa = ifa_ifwithdstaddr(addr);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
 * Find an interface on a specific network.  If several match, the most
 * specific one found is chosen.
*/
struct ifaddr *
ifa_ifwithnet(const struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa, *ifa_maybe = NULL;
const struct sockaddr_dl *sdl;
u_int af = addr->sa_family;
const char *addr_data = addr->sa_data, *cplim;
if (af == AF_LINK) {
sdl = satocsdl(addr);
if (sdl->sdl_index && sdl->sdl_index < if_indexlim &&
ifindex2ifnet[sdl->sdl_index] &&
!if_is_deactivated(ifindex2ifnet[sdl->sdl_index])) {
return ifindex2ifnet[sdl->sdl_index]->if_dl;
}
}
#ifdef NETATALK
if (af == AF_APPLETALK) {
const struct sockaddr_at *sat, *sat2;
sat = (const struct sockaddr_at *)addr;
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
ifa = at_ifawithnet((const struct sockaddr_at *)addr,
ifp);
if (ifa == NULL)
continue;
sat2 = (struct sockaddr_at *)ifa->ifa_addr;
if (sat2->sat_addr.s_net == sat->sat_addr.s_net)
return ifa; /* exact match */
if (ifa_maybe == NULL) {
/* else keep the if with the right range */
ifa_maybe = ifa;
}
}
return ifa_maybe;
}
#endif
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
IFADDR_READER_FOREACH(ifa, ifp) {
const char *cp, *cp2, *cp3;
if (ifa->ifa_addr->sa_family != af ||
ifa->ifa_netmask == NULL)
next: continue;
cp = addr_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = (const char *)ifa->ifa_netmask +
ifa->ifa_netmask->sa_len;
while (cp3 < cplim) {
if ((*cp++ ^ *cp2++) & *cp3++) {
/* want to continue for() loop */
goto next;
}
}
if (ifa_maybe == NULL ||
rt_refines(ifa->ifa_netmask,
ifa_maybe->ifa_netmask))
ifa_maybe = ifa;
}
}
return ifa_maybe;
}
struct ifaddr *
ifa_ifwithnet_psref(const struct sockaddr *addr, struct psref *psref)
{
struct ifaddr *ifa;
int s;
s = pserialize_read_enter();
ifa = ifa_ifwithnet(addr);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Find the interface of the address.
*/
struct ifaddr *
ifa_ifwithladdr(const struct sockaddr *addr)
{
struct ifaddr *ia;
if ((ia = ifa_ifwithaddr(addr)) || (ia = ifa_ifwithdstaddr(addr)) ||
(ia = ifa_ifwithnet(addr)))
return ia;
return NULL;
}
struct ifaddr *
ifa_ifwithladdr_psref(const struct sockaddr *addr, struct psref *psref)
{
struct ifaddr *ifa;
int s;
s = pserialize_read_enter();
ifa = ifa_ifwithladdr(addr);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Find an interface using a specific address family
*/
struct ifaddr *
ifa_ifwithaf(int af)
{
struct ifnet *ifp;
struct ifaddr *ifa = NULL;
int s;
s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family == af)
goto out;
}
}
out:
pserialize_read_exit(s);
return ifa;
}
/*
* Find an interface address specific to an interface best matching
* a given address.
*/
struct ifaddr *
ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp)
{
struct ifaddr *ifa;
const char *cp, *cp2, *cp3;
const char *cplim;
struct ifaddr *ifa_maybe = 0;
u_int af = addr->sa_family;
if (if_is_deactivated(ifp))
return NULL;
if (af >= AF_MAX)
return NULL;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != af)
continue;
ifa_maybe = ifa;
if (ifa->ifa_netmask == NULL) {
if (equal(addr, ifa->ifa_addr) ||
(ifa->ifa_dstaddr &&
equal(addr, ifa->ifa_dstaddr)))
return ifa;
continue;
}
cp = addr->sa_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
for (; cp3 < cplim; cp3++) {
if ((*cp++ ^ *cp2++) & *cp3)
break;
}
if (cp3 == cplim)
return ifa;
}
return ifa_maybe;
}
struct ifaddr *
ifaof_ifpforaddr_psref(const struct sockaddr *addr, struct ifnet *ifp,
struct psref *psref)
{
struct ifaddr *ifa;
int s;
s = pserialize_read_enter();
ifa = ifaof_ifpforaddr(addr, ifp);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Default action when installing a route with a Link Level gateway.
* Lookup an appropriate real ifa to point to.
* This should be moved to /sys/net/link.c eventually.
*/
void
link_rtrequest(int cmd, struct rtentry *rt, const struct rt_addrinfo *info)
{
struct ifaddr *ifa;
const struct sockaddr *dst;
struct ifnet *ifp;
struct psref psref;
if (cmd != RTM_ADD || ISSET(info->rti_flags, RTF_DONTCHANGEIFA))
return;
ifp = rt->rt_ifa->ifa_ifp;
dst = rt_getkey(rt);
if ((ifa = ifaof_ifpforaddr_psref(dst, ifp, &psref)) != NULL) {
rt_replace_ifa(rt, ifa);
if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
ifa->ifa_rtrequest(cmd, rt, info);
ifa_release(ifa, &psref);
}
}
/*
* bitmask macros to manage a densely packed link_state change queue.
* Because we need to store LINK_STATE_UNKNOWN(0), LINK_STATE_DOWN(1) and
* LINK_STATE_UP(2) we need 2 bits for each state change.
 * Because a stored state change may itself be 0 (LINK_STATE_UNKNOWN), an
 * item with all bits set is treated as unset.
*/
#define LQ_ITEM_BITS 2
#define LQ_ITEM_MASK ((1 << LQ_ITEM_BITS) - 1)
#define LQ_MASK(i) (LQ_ITEM_MASK << (i) * LQ_ITEM_BITS)
#define LINK_STATE_UNSET LQ_ITEM_MASK
#define LQ_ITEM(q, i) (((q) & LQ_MASK((i))) >> (i) * LQ_ITEM_BITS)
#define LQ_STORE(q, i, v) \
do { \
(q) &= ~LQ_MASK((i)); \
(q) |= (v) << (i) * LQ_ITEM_BITS; \
} while (0 /* CONSTCOND */)
#define LQ_MAX(q) ((sizeof((q)) * NBBY) / LQ_ITEM_BITS)
#define LQ_POP(q, v) \
do { \
(v) = LQ_ITEM((q), 0); \
(q) >>= LQ_ITEM_BITS; \
(q) |= LINK_STATE_UNSET << (LQ_MAX((q)) - 1) * LQ_ITEM_BITS; \
} while (0 /* CONSTCOND */)
#define LQ_PUSH(q, v) \
do { \
(q) >>= LQ_ITEM_BITS; \
(q) |= (v) << (LQ_MAX((q)) - 1) * LQ_ITEM_BITS; \
} while (0 /* CONSTCOND */)
#define LQ_FIND_UNSET(q, i) \
for ((i) = 0; i < LQ_MAX((q)); (i)++) { \
if (LQ_ITEM((q), (i)) == LINK_STATE_UNSET) \
break; \
}
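/*
 * Worked example (a sketch, assuming if_link_queue is a 16-bit quantity so
 * that LQ_MAX(q) == 8):
 *
 *	q = 0xffff			all 8 items unset
 *	LQ_STORE(q, 0, LINK_STATE_UP)	q = 0xfffe (item 0 = UP)
 *	LQ_STORE(q, 1, LINK_STATE_DOWN)	q = 0xfff6 (item 1 = DOWN)
 *	LQ_POP(q, v)			v = LINK_STATE_UP, q = 0xfffd
 *					(item 0 = DOWN, the rest unset)
 */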
/*
* Handle a change in the interface link state and
* queue notifications.
*/
void
if_link_state_change(struct ifnet *ifp, int link_state)
{
int idx;
/* Ensure change is to a valid state */
switch (link_state) {
case LINK_STATE_UNKNOWN: /* FALLTHROUGH */
case LINK_STATE_DOWN: /* FALLTHROUGH */
case LINK_STATE_UP:
break;
default:
#ifdef DEBUG
printf("%s: invalid link state %d\n",
ifp->if_xname, link_state);
#endif
return;
}
IF_LINK_STATE_CHANGE_LOCK(ifp);
/* Find the last unset event in the queue. */
LQ_FIND_UNSET(ifp->if_link_queue, idx);
if (idx == 0) {
/*
* There is no queue of link state changes.
* As we have the lock we can safely compare against the
* current link state and return if the same.
* Otherwise, if scheduled is true then the interface is being
* detached and the queue is being drained so we need
* to avoid queuing more work.
*/
if (ifp->if_link_state == link_state ||
ifp->if_link_scheduled)
goto out;
} else {
/* Ensure link_state doesn't match the last queued state. */
if (LQ_ITEM(ifp->if_link_queue, idx - 1)
== (uint8_t)link_state)
goto out;
}
/* Handle queue overflow. */
if (idx == LQ_MAX(ifp->if_link_queue)) {
uint8_t lost;
/*
* The DOWN state must be protected from being pushed off
* the queue to ensure that userland will always be
* in a sane state.
* Because DOWN is protected, there is no need to protect
* UNKNOWN.
* It should be invalid to change from any other state to
* UNKNOWN anyway ...
*/
lost = LQ_ITEM(ifp->if_link_queue, 0);
LQ_PUSH(ifp->if_link_queue, (uint8_t)link_state);
if (lost == LINK_STATE_DOWN) {
lost = LQ_ITEM(ifp->if_link_queue, 0);
LQ_STORE(ifp->if_link_queue, 0, LINK_STATE_DOWN);
}
printf("%s: lost link state change %s\n",
ifp->if_xname,
lost == LINK_STATE_UP ? "UP" :
lost == LINK_STATE_DOWN ? "DOWN" :
"UNKNOWN");
} else
LQ_STORE(ifp->if_link_queue, idx, (uint8_t)link_state);
if (ifp->if_link_scheduled)
goto out;
ifp->if_link_scheduled = true;
workqueue_enqueue(ifnet_link_state_wq, &ifp->if_link_work, NULL);
out:
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
}
/*
* Handle interface link state change notifications.
*/
static void
if_link_state_change_process(struct ifnet *ifp, int link_state)
{
struct domain *dp;
const int s = splnet();
bool notify;
KASSERT(!cpu_intr_p());
IF_LINK_STATE_CHANGE_LOCK(ifp);
/* Ensure the change is still valid. */
if (ifp->if_link_state == link_state) {
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
splx(s);
return;
}
#ifdef DEBUG
log(LOG_DEBUG, "%s: link state %s (was %s)\n", ifp->if_xname,
link_state == LINK_STATE_UP ? "UP" :
link_state == LINK_STATE_DOWN ? "DOWN" :
"UNKNOWN",
ifp->if_link_state == LINK_STATE_UP ? "UP" :
ifp->if_link_state == LINK_STATE_DOWN ? "DOWN" :
"UNKNOWN");
#endif
/*
* When going from UNKNOWN to UP, we need to mark existing
* addresses as tentative and restart DAD as we may have
* erroneously not found a duplicate.
*
* This needs to happen before rt_ifmsg to avoid a race where
* listeners would have an address and expect it to work right
* away.
*/
notify = (link_state == LINK_STATE_UP &&
ifp->if_link_state == LINK_STATE_UNKNOWN);
ifp->if_link_state = link_state;
/* The following routines may sleep so release the spin mutex */
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
KERNEL_LOCK_UNLESS_NET_MPSAFE();
if (notify) {
DOMAIN_FOREACH(dp) {
if (dp->dom_if_link_state_change != NULL)
dp->dom_if_link_state_change(ifp,
LINK_STATE_DOWN);
}
}
/* Notify that the link state has changed. */
rt_ifmsg(ifp);
simplehook_dohooks(ifp->if_linkstate_hooks);
DOMAIN_FOREACH(dp) {
if (dp->dom_if_link_state_change != NULL)
dp->dom_if_link_state_change(ifp, link_state);
}
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
splx(s);
}
/*
* Process the interface link state change queue.
*/
static void
if_link_state_change_work(struct work *work, void *arg)
{
struct ifnet *ifp = container_of(work, struct ifnet, if_link_work);
uint8_t state;
KERNEL_LOCK_UNLESS_NET_MPSAFE();
const int s = splnet();
/*
* Pop a link state change from the queue and process it.
* If there is nothing to process then if_detach() has been called.
* We keep if_link_scheduled = true so the queue can safely drain
* without more work being queued.
*/
IF_LINK_STATE_CHANGE_LOCK(ifp);
LQ_POP(ifp->if_link_queue, state);
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
if (state == LINK_STATE_UNSET)
goto out;
if_link_state_change_process(ifp, state);
/* If there is a link state change to come, schedule it. */
IF_LINK_STATE_CHANGE_LOCK(ifp);
if (LQ_ITEM(ifp->if_link_queue, 0) != LINK_STATE_UNSET) {
ifp->if_link_scheduled = true;
workqueue_enqueue(ifnet_link_state_wq, &ifp->if_link_work,
NULL);
} else
ifp->if_link_scheduled = false;
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
out:
splx(s);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
void *
if_linkstate_change_establish(struct ifnet *ifp, void (*fn)(void *), void *arg)
{
khook_t *hk;
hk = simplehook_establish(ifp->if_linkstate_hooks, fn, arg);
return (void *)hk;
}
void
if_linkstate_change_disestablish(struct ifnet *ifp, void *vhook,
kmutex_t *lock)
{
simplehook_disestablish(ifp->if_linkstate_hooks, vhook, lock);
}
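/*
 * A minimal usage sketch, assuming a hypothetical callback mydrv_linkstate
 * and its argument sc (not from any real driver); NULL may be passed as the
 * interlock when none is needed:
 *
 *	void *hook;
 *
 *	hook = if_linkstate_change_establish(ifp, mydrv_linkstate, sc);
 *	...
 *	if_linkstate_change_disestablish(ifp, hook, NULL);
 */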
/*
 * Used to mark addresses on an interface as DETACHED or TENTATIVE
* and thus start Duplicate Address Detection without changing the
* real link state.
*/
void
if_domain_link_state_change(struct ifnet *ifp, int link_state)
{
struct domain *dp;
const int s = splnet();
KERNEL_LOCK_UNLESS_NET_MPSAFE();
DOMAIN_FOREACH(dp) {
if (dp->dom_if_link_state_change != NULL)
dp->dom_if_link_state_change(ifp, link_state);
}
splx(s);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
/*
* Default action when installing a local route on a point-to-point
* interface.
*/
void
p2p_rtrequest(int req, struct rtentry *rt,
__unused const struct rt_addrinfo *info)
{
struct ifnet *ifp = rt->rt_ifp;
struct ifaddr *ifa, *lo0ifa;
int s = pserialize_read_enter();
switch (req) {
case RTM_ADD:
if ((rt->rt_flags & RTF_LOCAL) == 0)
break;
rt->rt_ifp = lo0ifp;
if (ISSET(info->rti_flags, RTF_DONTCHANGEIFA))
break;
IFADDR_READER_FOREACH(ifa, ifp) {
if (equal(rt_getkey(rt), ifa->ifa_addr))
break;
}
if (ifa == NULL)
break;
/*
* Ensure lo0 has an address of the same family.
*/
IFADDR_READER_FOREACH(lo0ifa, lo0ifp) {
if (lo0ifa->ifa_addr->sa_family ==
ifa->ifa_addr->sa_family)
break;
}
if (lo0ifa == NULL)
break;
/*
* Make sure to set rt->rt_ifa to the interface
* address we are using, otherwise we will have trouble
* with source address selection.
*/
if (ifa != rt->rt_ifa)
rt_replace_ifa(rt, ifa);
break;
case RTM_DELETE:
default:
break;
}
pserialize_read_exit(s);
}
static void
_if_down(struct ifnet *ifp)
{
struct ifaddr *ifa;
struct domain *dp;
struct psref psref;
ifp->if_flags &= ~IFF_UP;
nanotime(&ifp->if_lastchange);
const int bound = curlwp_bind();
int s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
ifa_acquire(ifa, &psref);
pserialize_read_exit(s);
pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
s = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
IFQ_PURGE(&ifp->if_snd);
#if NCARP > 0
if (ifp->if_carp)
carp_carpdev_state(ifp);
#endif
rt_ifmsg(ifp);
DOMAIN_FOREACH(dp) {
if (dp->dom_if_down)
dp->dom_if_down(ifp);
}
}
static void
if_down_deactivated(struct ifnet *ifp)
{
KASSERT(if_is_deactivated(ifp));
_if_down(ifp);
}
void
if_down_locked(struct ifnet *ifp)
{
KASSERT(IFNET_LOCKED(ifp));
_if_down(ifp);
}
/*
* Mark an interface down and notify protocols of
* the transition.
* NOTE: must be called at splsoftnet or equivalent.
*/
void
if_down(struct ifnet *ifp)
{
IFNET_LOCK(ifp);
if_down_locked(ifp);
IFNET_UNLOCK(ifp);
}
/*
 * Must be called while holding if_ioctl_lock.
*/
static void
if_up_locked(struct ifnet *ifp)
{
#ifdef notyet
struct ifaddr *ifa;
#endif
struct domain *dp;
KASSERT(IFNET_LOCKED(ifp));
KASSERT(!if_is_deactivated(ifp));
ifp->if_flags |= IFF_UP;
nanotime(&ifp->if_lastchange);
#ifdef notyet
/* this has no effect on IP, and will kill all ISO connections XXX */
IFADDR_READER_FOREACH(ifa, ifp)
pfctlinput(PRC_IFUP, ifa->ifa_addr);
#endif
#if NCARP > 0
if (ifp->if_carp)
carp_carpdev_state(ifp);
#endif
rt_ifmsg(ifp);
DOMAIN_FOREACH(dp) {
if (dp->dom_if_up)
dp->dom_if_up(ifp);
}
}
/*
* Handle interface slowtimo timer routine. Called
* from softclock, we decrement timer (if set) and
* call the appropriate interface routine on expiration.
*/
static bool
if_slowtimo_countdown(struct ifnet *ifp)
{
bool fire = false;
const int s = splnet();
KERNEL_LOCK(1, NULL);
if (ifp->if_timer != 0 && --ifp->if_timer == 0)
fire = true;
KERNEL_UNLOCK_ONE(NULL);
splx(s);
return fire;
}
static void
if_slowtimo_intr(void *arg)
{
struct ifnet *ifp = arg;
struct if_slowtimo_data *isd = ifp->if_slowtimo_data;
mutex_enter(&isd->isd_lock);
if (!isd->isd_dying) {
if (isd->isd_trigger || if_slowtimo_countdown(ifp)) {
if (!isd->isd_queued) {
isd->isd_queued = true;
workqueue_enqueue(if_slowtimo_wq,
&isd->isd_work, NULL);
}
} else
callout_schedule(&isd->isd_ch, hz / IFNET_SLOWHZ);
}
mutex_exit(&isd->isd_lock);
}
static void
if_slowtimo_work(struct work *work, void *arg)
{
struct if_slowtimo_data *isd =
container_of(work, struct if_slowtimo_data, isd_work);
struct ifnet *ifp = isd->isd_ifp;
const int s = splnet();
KERNEL_LOCK(1, NULL);
(*ifp->if_slowtimo)(ifp);
KERNEL_UNLOCK_ONE(NULL);
splx(s);
mutex_enter(&isd->isd_lock);
if (isd->isd_trigger) {
isd->isd_trigger = false;
printf("%s: watchdog triggered\n", ifp->if_xname);
}
isd->isd_queued = false;
if (!isd->isd_dying)
callout_schedule(&isd->isd_ch, hz / IFNET_SLOWHZ);
mutex_exit(&isd->isd_lock);
}
static int
sysctl_if_watchdog(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
struct ifnet *ifp = node.sysctl_data;
struct if_slowtimo_data *isd = ifp->if_slowtimo_data;
int arg = 0;
int error;
node.sysctl_data = &arg;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (arg) {
mutex_enter(&isd->isd_lock);
KASSERT(!isd->isd_dying);
isd->isd_trigger = true;
callout_schedule(&isd->isd_ch, 0);
mutex_exit(&isd->isd_lock);
}
return 0;
}
static void
sysctl_watchdog_setup(struct ifnet *ifp)
{
struct sysctllog **clog = &ifp->if_sysctl_log;
const struct sysctlnode *rnode;
if (sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT, CTLTYPE_NODE, "interfaces",
SYSCTL_DESCR("Per-interface controls"),
NULL, 0, NULL, 0,
CTL_NET, CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT, CTLTYPE_NODE, ifp->if_xname,
SYSCTL_DESCR("Interface controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT, CTLTYPE_NODE, "watchdog",
SYSCTL_DESCR("Interface watchdog controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "trigger",
SYSCTL_DESCR("Trigger watchdog timeout"),
sysctl_if_watchdog, 0, (int *)ifp, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
return;
bad:
printf("%s: could not attach sysctl watchdog nodes\n", ifp->if_xname);
}
/*
* Mark an interface up and notify protocols of
* the transition.
* NOTE: must be called at splsoftnet or equivalent.
*/
void
if_up(struct ifnet *ifp)
{
IFNET_LOCK(ifp);
if_up_locked(ifp);
IFNET_UNLOCK(ifp);
}
/*
* Set/clear promiscuous mode on interface ifp based on the truth value
* of pswitch. The calls are reference counted so that only the first
* "on" request actually has an effect, as does the final "off" request.
* Results are undefined if the "off" and "on" requests are not matched.
*/
int
ifpromisc_locked(struct ifnet *ifp, int pswitch)
{
int pcount, ret = 0;
u_short nflags;
KASSERT(IFNET_LOCKED(ifp));
pcount = ifp->if_pcount;
if (pswitch) {
/*
* Allow the device to be "placed" into promiscuous
* mode even if it is not configured up. It will
* consult IFF_PROMISC when it is brought up.
*/
if (ifp->if_pcount++ != 0)
goto out;
nflags = ifp->if_flags | IFF_PROMISC;
} else {
if (--ifp->if_pcount > 0)
goto out;
nflags = ifp->if_flags & ~IFF_PROMISC;
}
ret = if_flags_set(ifp, nflags);
/* Restore interface state if not successful. */
if (ret != 0)
ifp->if_pcount = pcount;
out:
return ret;
}
int
ifpromisc(struct ifnet *ifp, int pswitch)
{
int e;
IFNET_LOCK(ifp);
e = ifpromisc_locked(ifp, pswitch);
IFNET_UNLOCK(ifp);
return e;
}
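/*
 * A minimal usage sketch of the reference-counted interface (hypothetical,
 * for illustration only):
 *
 *	error = ifpromisc(ifp, 1);		(first "on" sets IFF_PROMISC)
 *	...
 *	if (error == 0)
 *		(void)ifpromisc(ifp, 0);	(matching "off"; the last one
 *						 clears IFF_PROMISC)
 */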
/*
* if_ioctl(ifp, cmd, data)
*
* Apply an ioctl command to the interface. Returns 0 on success,
* nonzero errno(3) number on failure.
*
* For SIOCADDMULTI/SIOCDELMULTI, caller need not hold locks -- it
* is the driver's responsibility to take any internal locks.
* (Kernel logic should generally invoke these only through
* if_mcast_op.)
*
* For all other ioctls, caller must hold ifp->if_ioctl_lock,
* a.k.a. IFNET_LOCK. May sleep.
*/
int
if_ioctl(struct ifnet *ifp, u_long cmd, void *data)
{
switch (cmd) {
case SIOCADDMULTI:
case SIOCDELMULTI:
break;
default:
KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname);
}
return (*ifp->if_ioctl)(ifp, cmd, data);
}
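/*
 * A minimal sketch of a locked ioctl call, assuming a caller-prepared
 * struct ifreq ifr (hypothetical, for illustration only):
 *
 *	IFNET_LOCK(ifp);
 *	error = if_ioctl(ifp, SIOCSIFFLAGS, &ifr);
 *	IFNET_UNLOCK(ifp);
 */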
/*
* if_init(ifp)
*
* Prepare the hardware underlying ifp to process packets
* according to its current configuration. Returns 0 on success,
* nonzero errno(3) number on failure.
*
* May sleep. Caller must hold ifp->if_ioctl_lock, a.k.a
* IFNET_LOCK.
*/
int
if_init(struct ifnet *ifp)
{
KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname);
return (*ifp->if_init)(ifp);
}
/*
* if_stop(ifp, disable)
*
* Stop the hardware underlying ifp from processing packets.
*
* If disable is true, ... XXX(?)
*
* May sleep. Caller must hold ifp->if_ioctl_lock, a.k.a
* IFNET_LOCK.
*/
void
if_stop(struct ifnet *ifp, int disable)
{
KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname);
(*ifp->if_stop)(ifp, disable);
}
/*
* Map interface name to
* interface structure pointer.
*/
struct ifnet *
ifunit(const char *name)
{
struct ifnet *ifp;
const char *cp = name;
u_int unit = 0;
u_int i;
/*
* If the entire name is a number, treat it as an ifindex.
*/
for (i = 0; i < IFNAMSIZ && *cp >= '0' && *cp <= '9'; i++, cp++)
unit = unit * 10 + (*cp - '0');
/*
* If the number took all of the name, then it's a valid ifindex.
*/
if (i == IFNAMSIZ || (cp != name && *cp == '\0'))
return if_byindex(unit);
ifp = NULL;
const int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
if (strcmp(ifp->if_xname, name) == 0)
goto out;
}
out:
pserialize_read_exit(s);
return ifp;
}
/*
* Get a reference of an ifnet object by an interface name.
* The returned reference is protected by psref(9). The caller
* must release a returned reference by if_put after use.
*/
struct ifnet *
if_get(const char *name, struct psref *psref)
{
struct ifnet *ifp;
const char *cp = name;
u_int unit = 0;
u_int i;
/*
* If the entire name is a number, treat it as an ifindex.
*/
for (i = 0; i < IFNAMSIZ && *cp >= '0' && *cp <= '9'; i++, cp++)
unit = unit * 10 + (*cp - '0');
/*
* If the number took all of the name, then it's a valid ifindex.
*/
if (i == IFNAMSIZ || (cp != name && *cp == '\0'))
return if_get_byindex(unit, psref);
ifp = NULL;
const int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
if (strcmp(ifp->if_xname, name) == 0) {
PSREF_DEBUG_FILL_RETURN_ADDRESS(psref);
psref_acquire(psref, &ifp->if_psref,
ifnet_psref_class);
goto out;
}
}
out:
pserialize_read_exit(s);
return ifp;
}
/*
* Release a reference of an ifnet object given by if_get, if_get_byindex
* or if_get_bylla.
*/
void
if_put(const struct ifnet *ifp, struct psref *psref)
{
if (ifp == NULL)
return;
psref_release(psref, &ifp->if_psref, ifnet_psref_class);
}
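/*
* Typical usage (sketch, mirroring doifioctl() and if_sdl_sysctl() below):
* bind the LWP for the lifetime of the psref, look the interface up by name,
* and always release the reference when done:
*
*	struct psref psref;
*	const int bound = curlwp_bind();
*	struct ifnet *ifp = if_get("lo0", &psref);
*	if (ifp != NULL) {
*		... use ifp ...
*		if_put(ifp, &psref);
*	}
*	curlwp_bindx(bound);
*/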
/*
* Return ifp having idx. Return NULL if not found. Normally if_byindex
* should be used.
*/
ifnet_t *
_if_byindex(u_int idx)
{
return (__predict_true(idx < if_indexlim)) ? ifindex2ifnet[idx] : NULL;
}
/*
* Return ifp having idx. Return NULL if not found or the found ifp is
* already deactivated.
*/
ifnet_t *
if_byindex(u_int idx)
{
ifnet_t *ifp;
ifp = _if_byindex(idx);
if (ifp != NULL && if_is_deactivated(ifp))
ifp = NULL;
return ifp;
}
/*
* Get a reference of an ifnet object by an interface index.
* The returned reference is protected by psref(9). The caller
* must release a returned reference by if_put after use.
*/
ifnet_t *
if_get_byindex(u_int idx, struct psref *psref)
{
ifnet_t *ifp;
const int s = pserialize_read_enter();
ifp = if_byindex(idx);
if (__predict_true(ifp != NULL)) {
PSREF_DEBUG_FILL_RETURN_ADDRESS(psref);
psref_acquire(psref, &ifp->if_psref, ifnet_psref_class);
}
pserialize_read_exit(s);
return ifp;
}
ifnet_t *
if_get_bylla(const void *lla, unsigned char lla_len, struct psref *psref)
{
ifnet_t *ifp;
const int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
if (ifp->if_addrlen != lla_len)
continue;
if (memcmp(lla, CLLADDR(ifp->if_sadl), lla_len) == 0) {
psref_acquire(psref, &ifp->if_psref,
ifnet_psref_class);
break;
}
}
pserialize_read_exit(s);
return ifp;
}
/*
* Note that this is safe only if the passed ifp is guaranteed not to be freed,
* for example because the caller is inside a pserialize read section, already
* holds a reference on the ifp, or holds some other object that guarantees the
* ifp cannot be freed indirectly.
*/
void
if_acquire(struct ifnet *ifp, struct psref *psref)
{
KASSERT(ifp->if_index != 0);
psref_acquire(psref, &ifp->if_psref, ifnet_psref_class);
}
bool
if_held(struct ifnet *ifp)
{
return psref_held(&ifp->if_psref, ifnet_psref_class);
}
/*
* Some tunnel interfaces can nest, e.g. IPv4 over IPv4 gif(4) tunnel over
* IPv4. Check the tunnel nesting count.
* Return > 0 if the tunnel nesting count exceeds the limit.
* Return 0 if the tunnel nesting count is equal to or less than the limit.
*/
int
if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, int limit)
{
struct m_tag *mtag;
int *count;
mtag = m_tag_find(m, PACKET_TAG_TUNNEL_INFO);
if (mtag != NULL) {
count = (int *)(mtag + 1);
if (++(*count) > limit) {
log(LOG_NOTICE,
"%s: recursively called too many times(%d)\n",
ifp->if_xname, *count);
return EIO;
}
} else {
mtag = m_tag_get(PACKET_TAG_TUNNEL_INFO, sizeof(*count),
M_NOWAIT);
if (mtag != NULL) {
m_tag_prepend(m, mtag);
count = (int *)(mtag + 1);
*count = 0;
} else {
log(LOG_DEBUG, "%s: m_tag_get() failed, "
"recursion calls are not prevented.\n",
ifp->if_xname);
}
}
return 0;
}
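/*
* Sketch of how a tunnel driver's output path might use this; the
* "max_nesting" limit here is a hypothetical per-driver setting, and a
* caller would typically drop the packet when the limit is exceeded:
*
*	error = if_tunnel_check_nesting(ifp, m, max_nesting);
*	if (error != 0) {
*		m_freem(m);
*		return error;
*	}
*/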
static void
if_tunnel_ro_init_pc(void *p, void *arg __unused, struct cpu_info *ci __unused)
{
struct tunnel_ro *tro = p;
tro->tr_ro = kmem_zalloc(sizeof(*tro->tr_ro), KM_SLEEP);
tro->tr_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
}
static void
if_tunnel_ro_fini_pc(void *p, void *arg __unused, struct cpu_info *ci __unused)
{
struct tunnel_ro *tro = p;
rtcache_free(tro->tr_ro);
kmem_free(tro->tr_ro, sizeof(*tro->tr_ro));
mutex_obj_free(tro->tr_lock);
}
percpu_t *
if_tunnel_alloc_ro_percpu(void)
{
return percpu_create(sizeof(struct tunnel_ro),
if_tunnel_ro_init_pc, if_tunnel_ro_fini_pc, NULL);
}
void
if_tunnel_free_ro_percpu(percpu_t *ro_percpu)
{
percpu_free(ro_percpu, sizeof(struct tunnel_ro));
}
static void
if_tunnel_rtcache_free_pc(void *p, void *arg __unused,
struct cpu_info *ci __unused)
{
struct tunnel_ro *tro = p;
mutex_enter(tro->tr_lock);
rtcache_free(tro->tr_ro);
mutex_exit(tro->tr_lock);
}
void
if_tunnel_ro_percpu_rtcache_free(percpu_t *ro_percpu)
{
percpu_foreach(ro_percpu, if_tunnel_rtcache_free_pc, NULL);
}
void
if_export_if_data(ifnet_t * const ifp, struct if_data *ifi, bool zero_stats)
{
/* Collect the volatile stats first; this zeros *ifi. */
if_stats_to_if_data(ifp, ifi, zero_stats);
ifi->ifi_type = ifp->if_type;
ifi->ifi_addrlen = ifp->if_addrlen;
ifi->ifi_hdrlen = ifp->if_hdrlen;
ifi->ifi_link_state = ifp->if_link_state;
ifi->ifi_mtu = ifp->if_mtu;
ifi->ifi_metric = ifp->if_metric;
ifi->ifi_baudrate = ifp->if_baudrate;
ifi->ifi_lastchange = ifp->if_lastchange;
}
/* common */
int
ifioctl_common(struct ifnet *ifp, u_long cmd, void *data)
{
struct ifreq *ifr;
struct ifcapreq *ifcr;
struct ifdatareq *ifdr;
unsigned short flags;
char *descr;
int error;
switch (cmd) {
case SIOCSIFCAP:
ifcr = data;
if ((ifcr->ifcr_capenable & ~ifp->if_capabilities) != 0)
return EINVAL;
if (ifcr->ifcr_capenable == ifp->if_capenable)
return 0;
ifp->if_capenable = ifcr->ifcr_capenable;
/* Pre-compute the checksum flags mask. */
ifp->if_csum_flags_tx = 0;
ifp->if_csum_flags_rx = 0;
if (ifp->if_capenable & IFCAP_CSUM_IPv4_Tx)
ifp->if_csum_flags_tx |= M_CSUM_IPv4;
if (ifp->if_capenable & IFCAP_CSUM_IPv4_Rx)
ifp->if_csum_flags_rx |= M_CSUM_IPv4;
if (ifp->if_capenable & IFCAP_CSUM_TCPv4_Tx)
ifp->if_csum_flags_tx |= M_CSUM_TCPv4;
if (ifp->if_capenable & IFCAP_CSUM_TCPv4_Rx)
ifp->if_csum_flags_rx |= M_CSUM_TCPv4;
if (ifp->if_capenable & IFCAP_CSUM_UDPv4_Tx)
ifp->if_csum_flags_tx |= M_CSUM_UDPv4;
if (ifp->if_capenable & IFCAP_CSUM_UDPv4_Rx)
ifp->if_csum_flags_rx |= M_CSUM_UDPv4;
if (ifp->if_capenable & IFCAP_CSUM_TCPv6_Tx)
ifp->if_csum_flags_tx |= M_CSUM_TCPv6;
if (ifp->if_capenable & IFCAP_CSUM_TCPv6_Rx)
ifp->if_csum_flags_rx |= M_CSUM_TCPv6;
if (ifp->if_capenable & IFCAP_CSUM_UDPv6_Tx)
ifp->if_csum_flags_tx |= M_CSUM_UDPv6;
if (ifp->if_capenable & IFCAP_CSUM_UDPv6_Rx)
ifp->if_csum_flags_rx |= M_CSUM_UDPv6;
if (ifp->if_capenable & IFCAP_TSOv4)
ifp->if_csum_flags_tx |= M_CSUM_TSOv4;
if (ifp->if_capenable & IFCAP_TSOv6)
ifp->if_csum_flags_tx |= M_CSUM_TSOv6;
#if NBRIDGE > 0
if (ifp->if_bridge != NULL)
bridge_calc_csum_flags(ifp->if_bridge);
#endif
if (ifp->if_flags & IFF_UP)
return ENETRESET;
return 0;
case SIOCSIFFLAGS:
ifr = data;
/*
* If if_is_mpsafe(ifp), KERNEL_LOCK isn't held here, but if_up
* and if_down aren't MP-safe yet, so we must hold the lock.
*/
KERNEL_LOCK_IF_IFP_MPSAFE(ifp);
if (ifp->if_flags & IFF_UP && (ifr->ifr_flags & IFF_UP) == 0) {
const int s = splsoftnet();
if_down_locked(ifp);
splx(s);
}
if (ifr->ifr_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) {
const int s = splsoftnet();
if_up_locked(ifp);
splx(s);
}
KERNEL_UNLOCK_IF_IFP_MPSAFE(ifp);
flags = (ifp->if_flags & IFF_CANTCHANGE) |
(ifr->ifr_flags &~ IFF_CANTCHANGE);
if (ifp->if_flags != flags) {
ifp->if_flags = flags;
/* Notify that the flags have changed. */
rt_ifmsg(ifp);
}
break;
case SIOCGIFFLAGS:
ifr = data;
ifr->ifr_flags = ifp->if_flags;
break;
case SIOCGIFMETRIC:
ifr = data;
ifr->ifr_metric = ifp->if_metric;
break;
case SIOCGIFMTU:
ifr = data;
ifr->ifr_mtu = ifp->if_mtu;
break;
case SIOCGIFDLT:
ifr = data;
ifr->ifr_dlt = ifp->if_dlt;
break;
case SIOCGIFCAP:
ifcr = data;
ifcr->ifcr_capabilities = ifp->if_capabilities;
ifcr->ifcr_capenable = ifp->if_capenable;
break;
case SIOCSIFMETRIC:
ifr = data;
ifp->if_metric = ifr->ifr_metric;
break;
case SIOCGIFDATA:
ifdr = data;
if_export_if_data(ifp, &ifdr->ifdr_data, false);
break;
case SIOCGIFINDEX:
ifr = data;
ifr->ifr_index = ifp->if_index;
break;
case SIOCZIFDATA:
ifdr = data;
if_export_if_data(ifp, &ifdr->ifdr_data, true);
getnanotime(&ifp->if_lastchange);
break;
case SIOCSIFMTU:
ifr = data;
if (ifp->if_mtu == ifr->ifr_mtu)
break;
ifp->if_mtu = ifr->ifr_mtu;
return ENETRESET;
case SIOCSIFDESCR:
error = kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, KAUTH_ARG(cmd),
NULL);
if (error)
return error;
ifr = data;
if (ifr->ifr_buflen > IFDESCRSIZE)
return ENAMETOOLONG;
if (ifr->ifr_buf == NULL || ifr->ifr_buflen == 0) {
/* unset description */
descr = NULL;
} else {
descr = kmem_zalloc(IFDESCRSIZE, KM_SLEEP);
/*
* copy (IFDESCRSIZE - 1) bytes to ensure
* terminating nul
*/
error = copyin(ifr->ifr_buf, descr, IFDESCRSIZE - 1);
if (error) {
kmem_free(descr, IFDESCRSIZE);
return error;
}
}
if (ifp->if_description != NULL)
kmem_free(ifp->if_description, IFDESCRSIZE);
ifp->if_description = descr;
break;
case SIOCGIFDESCR:
ifr = data;
descr = ifp->if_description;
if (descr == NULL)
return ENOMSG;
if (ifr->ifr_buflen < IFDESCRSIZE)
return EINVAL;
error = copyout(descr, ifr->ifr_buf, IFDESCRSIZE);
if (error)
return error;
break;
default:
return ENOTTY;
}
return 0;
}
int
ifaddrpref_ioctl(struct socket *so, u_long cmd, void *data, struct ifnet *ifp)
{
struct if_addrprefreq *ifap = (struct if_addrprefreq *)data;
struct ifaddr *ifa;
const struct sockaddr *any, *sa;
union {
struct sockaddr sa;
struct sockaddr_storage ss;
} u, v;
int s, error = 0;
switch (cmd) {
case SIOCSIFADDRPREF:
error = kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, KAUTH_ARG(cmd),
NULL);
if (error)
return error;
break;
case SIOCGIFADDRPREF:
break;
default:
return EOPNOTSUPP;
}
/* sanity checks */
if (data == NULL || ifp == NULL) {
panic("invalid argument to %s", __func__);
/*NOTREACHED*/
}
/* address must be specified on ADD and DELETE */
sa = sstocsa(&ifap->ifap_addr);
if (sa->sa_family != sofamily(so))
return EINVAL;
if ((any = sockaddr_any(sa)) == NULL || sa->sa_len != any->sa_len)
return EINVAL;
sockaddr_externalize(&v.sa, sizeof(v.ss), sa);
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != sa->sa_family)
continue;
sockaddr_externalize(&u.sa, sizeof(u.ss), ifa->ifa_addr);
if (sockaddr_cmp(&u.sa, &v.sa) == 0)
break;
}
if (ifa == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
switch (cmd) {
case SIOCSIFADDRPREF:
ifa->ifa_preference = ifap->ifap_preference;
goto out;
case SIOCGIFADDRPREF:
/* fill in the if_laddrreq structure */
(void)sockaddr_copy(sstosa(&ifap->ifap_addr),
sizeof(ifap->ifap_addr), ifa->ifa_addr);
ifap->ifap_preference = ifa->ifa_preference;
goto out;
default:
error = EOPNOTSUPP;
}
out:
pserialize_read_exit(s);
return error;
}
/*
* Interface ioctls.
*/
static int
doifioctl(struct socket *so, u_long cmd, void *data, struct lwp *l)
{
struct ifnet *ifp;
struct ifreq *ifr;
int error = 0;
u_long ocmd = cmd;
u_short oif_flags;
struct ifreq ifrb;
struct oifreq *oifr = NULL;
int r;
struct psref psref;
bool do_if43_post = false;
bool do_ifm80_post = false;
switch (cmd) {
case SIOCGIFCONF:
return ifconf(cmd, data);
case SIOCINITIFADDR:
return EPERM;
default:
MODULE_HOOK_CALL(uipc_syscalls_40_hook, (cmd, data), enosys(),
error);
if (error != ENOSYS)
return error;
MODULE_HOOK_CALL(uipc_syscalls_50_hook, (l, cmd, data),
enosys(), error);
if (error != ENOSYS)
return error;
error = 0;
break;
}
ifr = data;
/* Pre-conversion */
MODULE_HOOK_CALL(if_cvtcmd_43_hook, (&cmd, ocmd), enosys(), error);
if (cmd != ocmd) {
oifr = data;
data = ifr = &ifrb;
IFREQO2N_43(oifr, ifr);
do_if43_post = true;
}
MODULE_HOOK_CALL(ifmedia_80_pre_hook, (ifr, &cmd, &do_ifm80_post),
enosys(), error);
switch (cmd) {
case SIOCIFCREATE:
case SIOCIFDESTROY: {
const int bound = curlwp_bind();
if (l != NULL) {
ifp = if_get(ifr->ifr_name, &psref);
error = kauth_authorize_network(l->l_cred,
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp,
KAUTH_ARG(cmd), NULL);
if (ifp != NULL)
if_put(ifp, &psref);
if (error != 0) {
curlwp_bindx(bound);
return error;
}
}
KERNEL_LOCK_UNLESS_NET_MPSAFE();
mutex_enter(&if_clone_mtx);
r = (cmd == SIOCIFCREATE) ?
if_clone_create(ifr->ifr_name) : if_clone_destroy(ifr->ifr_name);
mutex_exit(&if_clone_mtx);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
curlwp_bindx(bound);
return r;
}
case SIOCIFGCLONERS: {
struct if_clonereq *req = (struct if_clonereq *)data;
return if_clone_list(req->ifcr_count, req->ifcr_buffer,
&req->ifcr_total);
}
}
if ((cmd & IOC_IN) == 0 || IOCPARM_LEN(cmd) < sizeof(ifr->ifr_name))
return EINVAL;
const int bound = curlwp_bind();
ifp = if_get(ifr->ifr_name, &psref);
if (ifp == NULL) {
curlwp_bindx(bound);
return ENXIO;
}
switch (cmd) {
case SIOCALIFADDR:
case SIOCDLIFADDR:
case SIOCSIFADDRPREF:
case SIOCSIFFLAGS:
case SIOCSIFCAP:
case SIOCSIFMETRIC:
case SIOCZIFDATA:
case SIOCSIFMTU:
case SIOCSIFPHYADDR:
case SIOCDIFPHYADDR:
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
#endif
case SIOCSLIFPHYADDR:
case SIOCADDMULTI:
case SIOCDELMULTI:
case SIOCSETHERCAP:
case SIOCSIFMEDIA:
case SIOCSDRVSPEC:
case SIOCG80211:
case SIOCS80211:
case SIOCS80211NWID:
case SIOCS80211NWKEY:
case SIOCS80211POWER:
case SIOCS80211BSSID:
case SIOCS80211CHANNEL:
case SIOCSLINKSTR:
if (l != NULL) {
error = kauth_authorize_network(l->l_cred,
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp,
KAUTH_ARG(cmd), NULL);
if (error != 0)
goto out;
}
}
oif_flags = ifp->if_flags;
KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp);
IFNET_LOCK(ifp);
error = if_ioctl(ifp, cmd, data);
if (error != ENOTTY)
;
else if (so->so_proto == NULL)
error = EOPNOTSUPP;
else {
KERNEL_LOCK_IF_IFP_MPSAFE(ifp);
MODULE_HOOK_CALL(if_ifioctl_43_hook,
(so, ocmd, cmd, data, l), enosys(), error);
if (error == ENOSYS)
error = (*so->so_proto->pr_usrreqs->pr_ioctl)(so,
cmd, data, ifp);
KERNEL_UNLOCK_IF_IFP_MPSAFE(ifp);
}
if (((oif_flags ^ ifp->if_flags) & IFF_UP) != 0) {
if ((ifp->if_flags & IFF_UP) != 0) {
const int s = splsoftnet();
if_up_locked(ifp);
splx(s);
}
}
/* Post-conversion */
if (do_ifm80_post && (error == 0))
MODULE_HOOK_CALL(ifmedia_80_post_hook, (ifr, cmd),
enosys(), error);
if (do_if43_post)
IFREQN2O_43(oifr, ifr);
IFNET_UNLOCK(ifp);
KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp);
out:
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
/*
* Return interface configuration
* of system. List may be used
* in later ioctl's (above) to get
* other information.
*
* Each record is a struct ifreq. Before the addition of
* sockaddr_storage, the API rule was that sockaddr flavors that did
* not fit would extend beyond the struct ifreq, with the next struct
* ifreq starting sa_len beyond the struct sockaddr. Because the
* union in struct ifreq includes struct sockaddr_storage, every kind
* of sockaddr must fit. Thus, there are no longer any overlength
* records.
*
* Records are added to the user buffer if they fit, and ifc_len is
* adjusted to the length that was written. Thus, the user is only
* assured of getting the complete list if ifc_len on return is at
* least sizeof(struct ifreq) less than it was on entry.
*
* If the user buffer pointer is NULL, this routine copies no data and
* returns the amount of space that would be needed.
*
* Invariants:
* ifrp points to the next part of the user's buffer to be used. If
* ifrp != NULL, space holds the number of bytes remaining that we may
* write at ifrp. Otherwise, space holds the number of bytes that
* would have been written had there been adequate space.
*/
/*ARGSUSED*/
static int
ifconf(u_long cmd, void *data)
{
struct ifconf *ifc = (struct ifconf *)data;
struct ifnet *ifp;
struct ifaddr *ifa;
struct ifreq ifr, *ifrp = NULL;
int space = 0, error = 0;
const int sz = (int)sizeof(struct ifreq);
const bool docopy = ifc->ifc_req != NULL;
struct psref psref;
if (docopy) {
if (ifc->ifc_len < 0)
return EINVAL;
space = ifc->ifc_len;
ifrp = ifc->ifc_req;
}
memset(&ifr, 0, sizeof(ifr));
const int bound = curlwp_bind();
int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
psref_acquire(&psref, &ifp->if_psref, ifnet_psref_class);
pserialize_read_exit(s);
(void)strncpy(ifr.ifr_name, ifp->if_xname,
sizeof(ifr.ifr_name));
if (ifr.ifr_name[sizeof(ifr.ifr_name) - 1] != '\0') {
error = ENAMETOOLONG;
goto release_exit;
}
if (IFADDR_READER_EMPTY(ifp)) {
/* Interface with no addresses - send zero sockaddr. */
memset(&ifr.ifr_addr, 0, sizeof(ifr.ifr_addr));
if (!docopy) {
space += sz;
goto next;
}
if (space >= sz) {
error = copyout(&ifr, ifrp, sz);
if (error != 0)
goto release_exit;
ifrp++;
space -= sz;
}
}
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct sockaddr *sa = ifa->ifa_addr;
/* all sockaddrs must fit in sockaddr_storage */
KASSERT(sa->sa_len <= sizeof(ifr.ifr_ifru));
if (!docopy) {
space += sz;
continue;
}
memcpy(&ifr.ifr_space, sa, sa->sa_len);
pserialize_read_exit(s);
if (space >= sz) {
error = copyout(&ifr, ifrp, sz);
if (error != 0)
goto release_exit;
ifrp++;
space -= sz;
}
s = pserialize_read_enter();
}
pserialize_read_exit(s);
next:
s = pserialize_read_enter();
psref_release(&psref, &ifp->if_psref, ifnet_psref_class);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
if (docopy) {
KASSERT(0 <= space && space <= ifc->ifc_len);
ifc->ifc_len -= space;
} else {
KASSERT(space >= 0);
ifc->ifc_len = space;
}
return 0;
release_exit:
psref_release(&psref, &ifp->if_psref, ifnet_psref_class);
curlwp_bindx(bound);
return error;
}
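/*
* Userland sketch of the two-pass convention described above ifconf(): a
* first SIOCGIFCONF with ifc_req == NULL only reports the space needed, and
* a second call with a buffer of that size copies out the records:
*
*	struct ifconf ifc;
*	ifc.ifc_req = NULL;
*	ifc.ifc_len = 0;
*	ioctl(s, SIOCGIFCONF, &ifc);		(sizing pass)
*	ifc.ifc_req = malloc(ifc.ifc_len);
*	ioctl(s, SIOCGIFCONF, &ifc);		(copy-out pass)
*/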
int
ifreq_setaddr(u_long cmd, struct ifreq *ifr, const struct sockaddr *sa)
{
uint8_t len = sizeof(ifr->ifr_ifru.ifru_space);
struct ifreq ifrb;
struct oifreq *oifr = NULL;
u_long ocmd = cmd;
int hook;
MODULE_HOOK_CALL(if_cvtcmd_43_hook, (&cmd, ocmd), enosys(), hook);
if (hook != ENOSYS) {
if (cmd != ocmd) {
oifr = (struct oifreq *)(void *)ifr;
ifr = &ifrb;
IFREQO2N_43(oifr, ifr);
len = sizeof(oifr->ifr_addr);
}
}
if (len < sa->sa_len)
return EFBIG;
memset(&ifr->ifr_addr, 0, len);
sockaddr_copy(&ifr->ifr_addr, len, sa);
if (cmd != ocmd)
IFREQN2O_43(oifr, ifr);
return 0;
}
/*
* Wrapper function for drivers that do not provide if_transmit().
*/
static int
if_transmit(struct ifnet *ifp, struct mbuf *m)
{
int error;
size_t pktlen = m->m_pkthdr.len;
bool mcast = (m->m_flags & M_MCAST) != 0;
const int s = splnet();
IFQ_ENQUEUE(&ifp->if_snd, m, error);
if (error != 0) {
/* mbuf is already freed */
goto out;
}
net_stat_ref_t nsr = IF_STAT_GETREF(ifp);
if_statadd_ref(nsr, if_obytes, pktlen);
if (mcast)
if_statinc_ref(nsr, if_omcasts);
IF_STAT_PUTREF(ifp);
if ((ifp->if_flags & IFF_OACTIVE) == 0)
if_start_lock(ifp);
out:
splx(s);
return error;
}
int
if_transmit_lock(struct ifnet *ifp, struct mbuf *m)
{
int error;
kmsan_check_mbuf(m);
#ifdef ALTQ
KERNEL_LOCK(1, NULL);
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
error = if_transmit(ifp, m);
KERNEL_UNLOCK_ONE(NULL);
} else {
KERNEL_UNLOCK_ONE(NULL);
error = (*ifp->if_transmit)(ifp, m);
/* mbuf is already freed */
}
#else /* !ALTQ */
error = (*ifp->if_transmit)(ifp, m);
/* mbuf is already freed */
#endif /* !ALTQ */
return error;
}
/*
* Queue message on interface, and start output if interface
* not yet active.
*/
int
ifq_enqueue(struct ifnet *ifp, struct mbuf *m)
{
return if_transmit_lock(ifp, m);
}
/*
* Queue message on interface, possibly using a second fast queue
*/
int
ifq_enqueue2(struct ifnet *ifp, struct ifqueue *ifq, struct mbuf *m)
{
int error = 0;
if (ifq != NULL
#ifdef ALTQ
&& ALTQ_IS_ENABLED(&ifp->if_snd) == 0
#endif
) {
if (IF_QFULL(ifq)) {
IF_DROP(&ifp->if_snd);
m_freem(m);
if (error == 0)
error = ENOBUFS;
} else
IF_ENQUEUE(ifq, m);
} else
IFQ_ENQUEUE(&ifp->if_snd, m, error);
if (error != 0) {
if_statinc(ifp, if_oerrors);
return error;
}
return 0;
}
int
if_addr_init(ifnet_t *ifp, struct ifaddr *ifa, const bool src)
{
int rc;
KASSERT(IFNET_LOCKED(ifp));
if (ifp->if_initaddr != NULL)
rc = (*ifp->if_initaddr)(ifp, ifa, src);
else if (src || (rc = if_ioctl(ifp, SIOCSIFDSTADDR, ifa)) == ENOTTY)
rc = if_ioctl(ifp, SIOCINITIFADDR, ifa);
return rc;
}
int
if_do_dad(struct ifnet *ifp)
{
if ((ifp->if_flags & IFF_LOOPBACK) != 0)
return 0;
switch (ifp->if_type) {
case IFT_FAITH:
/*
* These interfaces do not have the IFF_LOOPBACK flag,
* but loop packets back. We do not have to do DAD on such
* interfaces. We should even omit it, because loop-backed
* responses would confuse the DAD procedure.
*/
return 0;
default:
/*
* Our DAD routine requires the interface up and running.
* However, some interfaces can be up before the RUNNING
* status. Additionally, users may try to assign addresses
* before the interface becomes up (or running).
* We simply skip DAD in such a case as a work around.
* XXX: we should rather mark "tentative" on such addresses,
* and do DAD after the interface becomes ready.
*/
if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) !=
(IFF_UP | IFF_RUNNING))
return 0;
return 1;
}
}
/*
* if_flags_set(ifp, flags)
*
* Ask ifp to change ifp->if_flags to flags, as if with the
* SIOCSIFFLAGS ioctl command.
*
* May sleep. Caller must hold ifp->if_ioctl_lock, a.k.a
* IFNET_LOCK.
*/
int
if_flags_set(ifnet_t *ifp, const u_short flags)
{
int rc;
KASSERT(IFNET_LOCKED(ifp));
if (ifp->if_setflags != NULL)
rc = (*ifp->if_setflags)(ifp, flags);
else {
u_short cantflags, chgdflags;
struct ifreq ifr;
chgdflags = ifp->if_flags ^ flags;
cantflags = chgdflags & IFF_CANTCHANGE;
if (cantflags != 0)
ifp->if_flags ^= cantflags;
/*
* Traditionally, we do not call if_ioctl after
* setting/clearing only IFF_PROMISC if the interface
* isn't IFF_UP. Uphold that tradition.
*/
if (chgdflags == IFF_PROMISC && (ifp->if_flags & IFF_UP) == 0)
return 0;
memset(&ifr, 0, sizeof(ifr));
ifr.ifr_flags = flags & ~IFF_CANTCHANGE;
rc = if_ioctl(ifp, SIOCSIFFLAGS, &ifr);
if (rc != 0 && cantflags != 0)
ifp->if_flags ^= cantflags;
}
return rc;
}
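/*
* Example (sketch): code that already holds IFNET_LOCK can toggle a
* changeable flag without worrying about the IFF_CANTCHANGE bits, which
* this routine filters out, much as ifpromisc_locked() above does for
* IFF_PROMISC:
*
*	error = if_flags_set(ifp, ifp->if_flags | IFF_DEBUG);
*/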
/*
* if_mcast_op(ifp, cmd, sa)
*
* Apply a multicast command, SIOCADDMULTI/SIOCDELMULTI, to the
* interface. Returns 0 on success, nonzero errno(3) number on
* failure.
*
* May sleep.
*
* Use this, not if_ioctl, for the multicast commands.
*/
int
if_mcast_op(ifnet_t *ifp, const unsigned long cmd, const struct sockaddr *sa)
{
int rc;
struct ifreq ifr;
switch (cmd) {
case SIOCADDMULTI:
case SIOCDELMULTI:
break;
default:
panic("invalid ifnet multicast command: 0x%lx", cmd);
}
ifreq_setaddr(cmd, &ifr, sa);
rc = if_ioctl(ifp, cmd, &ifr);
return rc;
}
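/*
* Example (sketch): protocol code joining and later leaving a link-level
* multicast group goes through if_mcast_op() rather than calling the
* driver ioctl directly; "sa" is the group's link-level sockaddr:
*
*	error = if_mcast_op(ifp, SIOCADDMULTI, sa);
*	...
*	error = if_mcast_op(ifp, SIOCDELMULTI, sa);
*/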
static void
sysctl_sndq_setup(struct sysctllog **clog, const char *ifname,
struct ifaltq *ifq)
{
const struct sysctlnode *cnode, *rnode;
if (sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "interfaces",
SYSCTL_DESCR("Per-interface controls"),
NULL, 0, NULL, 0,
CTL_NET, CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, ifname,
SYSCTL_DESCR("Interface controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sndq",
SYSCTL_DESCR("Interface output queue controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "len",
SYSCTL_DESCR("Current output queue length"),
NULL, 0, &ifq->ifq_len, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "maxlen",
SYSCTL_DESCR("Maximum allowed output queue length"),
NULL, 0, &ifq->ifq_maxlen, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "drops",
SYSCTL_DESCR("Packets dropped due to full output queue"),
NULL, 0, &ifq->ifq_drops, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
return;
bad:
printf("%s: could not attach sysctl nodes\n", ifname);
return;
}
static int
if_sdl_sysctl(SYSCTLFN_ARGS)
{
struct ifnet *ifp;
const struct sockaddr_dl *sdl;
struct psref psref;
int error = 0;
if (namelen != 1)
return EINVAL;
const int bound = curlwp_bind();
ifp = if_get_byindex(name[0], &psref);
if (ifp == NULL) {
error = ENODEV;
goto out0;
}
sdl = ifp->if_sadl;
if (sdl == NULL) {
*oldlenp = 0;
goto out1;
}
if (oldp == NULL) {
*oldlenp = sdl->sdl_alen;
goto out1;
}
if (*oldlenp >= sdl->sdl_alen)
*oldlenp = sdl->sdl_alen;
error = sysctl_copyout(l, &sdl->sdl_data[sdl->sdl_nlen],
oldp, *oldlenp);
out1:
if_put(ifp, &psref);
out0:
curlwp_bindx(bound);
return error;
}
static void
if_sysctl_setup(struct sysctllog **clog)
{
const struct sysctlnode *rnode = NULL;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sdl",
SYSCTL_DESCR("Get active link-layer address"),
if_sdl_sysctl, 0, NULL, 0,
CTL_NET, CTL_CREATE, CTL_EOL);
}
/* $NetBSD: ufs_dirhash.c,v 1.41 2022/08/07 02:33:47 simonb Exp $ */
/*
* Copyright (c) 2001, 2002 Ian Dowse. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.3.2.8 2004/12/08 11:54:13 dwmalone Exp $
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_dirhash.c,v 1.41 2022/08/07 02:33:47 simonb Exp $");
/*
* This implements a hash-based lookup scheme for UFS directories.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/types.h>
#include <sys/hash.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/sysctl.h>
#include <sys/atomic.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/dirhash.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_extern.h>
/*
* Defaults for dirhash cache sizes:
* - use up to 1/64th of system memory.
* - disable dirhash (set the cache size to 0 bytes) if the
* calculated cache size is less than 2MB.
* - cap maximum size of the dirhash cache at 32MB.
*/
#define DIRHASH_DEFAULT_DIVIDER 64
#define MIN_DEFAULT_DIRHASH_MEM (2 * 1024 * 1024)
#define MAX_DEFAULT_DIRHASH_MEM (32 * 1024 * 1024)
#define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1))
#define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1))
#define OFSFMT(ip) ((ip)->i_ump->um_maxsymlinklen <= 0)
#define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? DH_NFSTATS : (n))
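/*
* Worked examples of the defaults above (illustrative arithmetic only):
* with 1GB of RAM the default cache is 1024MB / 64 = 16MB; with 64MB of
* RAM the computed 1MB falls below the 2MB minimum, so dirhash is disabled;
* with 4GB of RAM the computed 64MB is clamped to the 32MB cap.
*/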
static u_int ufs_dirhashminblks = 5;
static u_int ufs_dirhashmaxmem = 0;
static u_int ufs_dirhashmem;
static u_int ufs_dirhashcheck = 0;
static int ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen);
static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff,
int dirblksiz);
static void ufsdirhash_delslot(struct dirhash *dh, int slot);
static int ufsdirhash_findslot(struct dirhash *dh, const char *name,
int namelen, doff_t offset);
static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset,
int dirblksiz);
static int ufsdirhash_recycle(int wanted);
static pool_cache_t ufsdirhashblk_cache;
static pool_cache_t ufsdirhash_cache;
#define DIRHASHLIST_LOCK() mutex_enter(&ufsdirhash_lock)
#define DIRHASHLIST_UNLOCK() mutex_exit(&ufsdirhash_lock)
#define DIRHASH_LOCK(dh) mutex_enter(&(dh)->dh_lock)
#define DIRHASH_UNLOCK(dh) mutex_exit(&(dh)->dh_lock)
#define DIRHASH_BLKALLOC() \
pool_cache_get(ufsdirhashblk_cache, PR_NOWAIT)
#define DIRHASH_BLKFREE(ptr) \
pool_cache_put(ufsdirhashblk_cache, ptr)
/* Dirhash list; recently-used entries are near the tail. */
static TAILQ_HEAD(, dirhash) ufsdirhash_list;
/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
static kmutex_t ufsdirhash_lock;
/*
* Locking order:
* ufsdirhash_lock
* dh_lock
*
* The dh_lock mutex should be acquired either via the inode lock, or via
* ufsdirhash_lock. Only the owner of the inode may free the associated
* dirhash, but anything can steal its memory and set dh_hash to NULL.
*/
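/*
* Sketch of the acquisition order when both locks are needed, as in
* ufsdirhash_recycle() below:
*
*	DIRHASHLIST_LOCK();
*	DIRHASH_LOCK(dh);
*	...
*	DIRHASH_UNLOCK(dh);
*	DIRHASHLIST_UNLOCK();
*/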
/*
* Attempt to build up a hash table for the directory contents in
* inode 'ip'. Returns 0 on success, or -1 if the operation failed.
*/
int
ufsdirhash_build(struct inode *ip)
{
struct dirhash *dh;
struct buf *bp = NULL;
struct direct *ep;
struct vnode *vp;
doff_t bmask, pos;
int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
/* Check if we can/should use dirhash. */
if (ip->i_dirhash == NULL) {
if (ufs_dirhashmaxmem == 0 || ip->i_size < (ufs_dirhashminblks * dirblksiz) ||
OFSFMT(ip))
return (-1);
} else {
/* Hash exists, but sysctls could have changed. */
if (ip->i_size < (ufs_dirhashminblks * dirblksiz) ||
ufs_dirhashmem > ufs_dirhashmaxmem) {
ufsdirhash_free(ip);
return (-1);
}
/* Check if hash exists and is intact (note: unlocked read). */
if (ip->i_dirhash->dh_hash != NULL)
return (0);
/* Free the old, recycled hash and build a new one. */
ufsdirhash_free(ip);
}
/* Don't hash removed directories. */
if (ip->i_nlink == 0)
return (-1);
vp = ip->i_vnode;
/* Allocate 50% more entries than this dir size could ever need. */
KASSERT(ip->i_size >= dirblksiz);
nslots = ip->i_size / UFS_DIRECTSIZ(1);
nslots = (nslots * 3 + 1) / 2;
narrays = howmany(nslots, DH_NBLKOFF);
nslots = narrays * DH_NBLKOFF;
dirblocks = howmany(ip->i_size, dirblksiz);
nblocks = (dirblocks * 3 + 1) / 2;
memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
nblocks * sizeof(*dh->dh_blkfree);
while (atomic_add_int_nv(&ufs_dirhashmem, memreqd) >
ufs_dirhashmaxmem) {
atomic_add_int(&ufs_dirhashmem, -memreqd);
if (memreqd > ufs_dirhashmaxmem / 2)
return (-1);
/* Try to free some space. */
if (ufsdirhash_recycle(memreqd) != 0)
return (-1);
else
DIRHASHLIST_UNLOCK();
}
/*
* Use non-blocking mallocs so that we will revert to a linear
* lookup on failure rather than potentially blocking forever.
*/
dh = pool_cache_get(ufsdirhash_cache, PR_NOWAIT);
if (dh == NULL) {
atomic_add_int(&ufs_dirhashmem, -memreqd);
return (-1);
}
memset(dh, 0, sizeof(*dh));
mutex_init(&dh->dh_lock, MUTEX_DEFAULT, IPL_NONE);
DIRHASH_LOCK(dh);
dh->dh_hashsz = narrays * sizeof(dh->dh_hash[0]);
dh->dh_hash = kmem_zalloc(dh->dh_hashsz, KM_NOSLEEP);
dh->dh_blkfreesz = nblocks * sizeof(dh->dh_blkfree[0]);
dh->dh_blkfree = kmem_zalloc(dh->dh_blkfreesz, KM_NOSLEEP);
if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
goto fail;
for (i = 0; i < narrays; i++) {
if ((dh->dh_hash[i] = DIRHASH_BLKALLOC()) == NULL)
goto fail;
for (j = 0; j < DH_NBLKOFF; j++)
dh->dh_hash[i][j] = DIRHASH_EMPTY;
}
/* Initialise the hash table and block statistics. */
dh->dh_narrays = narrays;
dh->dh_hlen = nslots;
dh->dh_nblk = nblocks;
dh->dh_dirblks = dirblocks;
for (i = 0; i < dirblocks; i++)
dh->dh_blkfree[i] = dirblksiz / DIRALIGN;
for (i = 0; i < DH_NFSTATS; i++)
dh->dh_firstfree[i] = -1;
dh->dh_firstfree[DH_NFSTATS] = 0;
dh->dh_seqopt = 0;
dh->dh_seqoff = 0;
dh->dh_score = DH_SCOREINIT;
ip->i_dirhash = dh;
bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
pos = 0;
while (pos < ip->i_size) {
preempt_point();
/* If necessary, get the next directory block. */
if ((pos & bmask) == 0) {
if (bp != NULL)
brelse(bp, 0);
if (ufs_blkatoff(vp, (off_t)pos, NULL, &bp, false) != 0)
goto fail;
}
/* Add this entry to the hash. */
ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
if (ep->d_reclen == 0 || ep->d_reclen >
dirblksiz - (pos & (dirblksiz - 1))) {
/* Corrupted directory. */
brelse(bp, 0);
goto fail;
}
if (ep->d_ino != 0) {
/* Add the entry (simplified ufsdirhash_add). */
slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
slot = WRAPINCR(slot, dh->dh_hlen);
dh->dh_hused++;
DH_ENTRY(dh, slot) = pos;
ufsdirhash_adjfree(dh, pos, -UFS_DIRSIZ(0, ep, needswap),
dirblksiz);
}
pos += ep->d_reclen;
}
if (bp != NULL)
brelse(bp, 0);
DIRHASHLIST_LOCK();
TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
dh->dh_onlist = 1;
DIRHASH_UNLOCK(dh);
DIRHASHLIST_UNLOCK();
return (0);
fail:
ip->i_dirhash = NULL;
DIRHASH_UNLOCK(dh);
if (dh->dh_hash != NULL) {
for (i = 0; i < narrays; i++)
if (dh->dh_hash[i] != NULL)
DIRHASH_BLKFREE(dh->dh_hash[i]);
kmem_free(dh->dh_hash, dh->dh_hashsz);
}
if (dh->dh_blkfree != NULL)
kmem_free(dh->dh_blkfree, dh->dh_blkfreesz);
mutex_destroy(&dh->dh_lock);
pool_cache_put(ufsdirhash_cache, dh);
atomic_add_int(&ufs_dirhashmem, -memreqd);
return (-1);
}
/*
* Free any hash table associated with inode 'ip'.
*/
void
ufsdirhash_free(struct inode *ip)
{
struct dirhash *dh;
int i, mem;
if ((dh = ip->i_dirhash) == NULL)
return;
ip->i_dirhash = NULL;
DIRHASHLIST_LOCK();
if (dh->dh_onlist)
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
DIRHASHLIST_UNLOCK();
/* The dirhash pointed to by 'dh' is exclusively ours now. */
mem = sizeof(*dh);
if (dh->dh_hash != NULL) {
for (i = 0; i < dh->dh_narrays; i++)
DIRHASH_BLKFREE(dh->dh_hash[i]);
kmem_free(dh->dh_hash, dh->dh_hashsz);
kmem_free(dh->dh_blkfree, dh->dh_blkfreesz);
mem += dh->dh_hashsz;
mem += dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash);
mem += dh->dh_nblk * sizeof(*dh->dh_blkfree);
}
mutex_destroy(&dh->dh_lock);
pool_cache_put(ufsdirhash_cache, dh);
atomic_add_int(&ufs_dirhashmem, -mem);
}
/*
* Find the offset of the specified name within the given inode.
* Returns 0 on success, ENOENT if the entry does not exist, or
* EJUSTRETURN if the caller should revert to a linear search.
*
* If successful, the directory offset is stored in *offp, and a
* pointer to a struct buf containing the entry is stored in *bpp. If
* prevoffp is non-NULL, the offset of the previous entry within
* the UFS_DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
* is the first in a block, the start of the block is used).
*/
int
ufsdirhash_lookup(struct inode *ip, const char *name, int namelen, doff_t *offp,
struct buf **bpp, doff_t *prevoffp)
{
struct dirhash *dh, *dh_next;
struct direct *dp;
struct vnode *vp;
struct buf *bp;
doff_t blkoff, bmask, offset, prevoff;
int i, slot;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return (EJUSTRETURN);
/*
* Move this dirhash towards the end of the list if it has a
* score higher than the next entry, and acquire the dh_lock.
* Optimise the case where it's already the last by performing
* an unlocked read of the TAILQ_NEXT pointer.
*
* In both cases, end up holding just dh_lock.
*/
if (TAILQ_NEXT(dh, dh_list) != NULL) {
DIRHASHLIST_LOCK();
DIRHASH_LOCK(dh);
/*
* If the new score will be greater than that of the next
* entry, then move this entry past it. With both mutexes
* held, dh_next won't go away, but its dh_score could
* change; that's not important since it is just a hint.
*/
if (dh->dh_hash != NULL && (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
dh->dh_score >= dh_next->dh_score) {
KASSERT(dh->dh_onlist);
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
dh_list);
}
DIRHASHLIST_UNLOCK();
} else {
/* Already the last, though that could change as we wait. */
DIRHASH_LOCK(dh);
}
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return (EJUSTRETURN);
}
/* Update the score. */
if (dh->dh_score < DH_SCOREMAX)
dh->dh_score++;
vp = ip->i_vnode;
bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
blkoff = -1;
bp = NULL;
restart:
slot = ufsdirhash_hash(dh, name, namelen);
if (dh->dh_seqopt) {
/*
* Sequential access optimisation. dh_seqoff contains the
* offset of the directory entry immediately following
* the last entry that was looked up. Check if this offset
* appears in the hash chain for the name we are looking for.
*/
for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
i = WRAPINCR(i, dh->dh_hlen))
if (offset == dh->dh_seqoff)
break;
if (offset == dh->dh_seqoff) {
/*
* We found an entry with the expected offset. This
* is probably the entry we want, but if not, the
* code below will turn off seqoff and retry.
*/
slot = i;
} else
dh->dh_seqopt = 0;
}
for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
slot = WRAPINCR(slot, dh->dh_hlen)) {
if (offset == DIRHASH_DEL)
continue;
if (offset < 0 || offset >= ip->i_size)
panic("ufsdirhash_lookup: bad offset in hash array");
if ((offset & ~bmask) != blkoff) {
if (bp != NULL)
brelse(bp, 0);
blkoff = offset & ~bmask;
if (ufs_blkatoff(vp, (off_t)blkoff,
NULL, &bp, false) != 0) {
DIRHASH_UNLOCK(dh);
return (EJUSTRETURN);
}
}
dp = (struct direct *)((char *)bp->b_data + (offset & bmask));
if (dp->d_reclen == 0 || dp->d_reclen >
dirblksiz - (offset & (dirblksiz - 1))) {
/* Corrupted directory. */
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (EJUSTRETURN);
}
if (dp->d_namlen == namelen &&
memcmp(dp->d_name, name, namelen) == 0) {
/* Found. Get the prev offset if needed. */
if (prevoffp != NULL) {
if (offset & (dirblksiz - 1)) {
prevoff = ufsdirhash_getprev(dp,
offset, dirblksiz);
if (prevoff == -1) {
brelse(bp, 0);
return (EJUSTRETURN);
}
} else
prevoff = offset;
*prevoffp = prevoff;
}
/* Check for sequential access, and update offset. */
if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
dh->dh_seqopt = 1;
dh->dh_seqoff = offset + UFS_DIRSIZ(0, dp, needswap);
DIRHASH_UNLOCK(dh);
*bpp = bp;
*offp = offset;
return (0);
}
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
if (bp != NULL)
brelse(bp, 0);
ufsdirhash_free(ip);
return (EJUSTRETURN);
}
/*
* When the name doesn't match in the seqopt case, go back
* and search normally.
*/
if (dh->dh_seqopt) {
dh->dh_seqopt = 0;
goto restart;
}
}
DIRHASH_UNLOCK(dh);
if (bp != NULL)
brelse(bp, 0);
return (ENOENT);
}
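/*
* Sketch of how a caller might consume the three outcomes documented above;
* the linear scan stands in for whatever fallback path the caller already
* has:
*
*	error = ufsdirhash_lookup(ip, name, namelen, &offset, &bp, NULL);
*	if (error == 0)
*		... use offset and bp, then brelse(bp, 0) ...
*	else if (error == ENOENT)
*		... the entry definitely does not exist ...
*	else
*		... EJUSTRETURN: fall back to a linear directory scan ...
*/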
/*
* Find a directory block with room for 'slotneeded' bytes. Returns
* the offset of the directory entry that begins the free space.
* This will either be the offset of an existing entry that has free
* space at the end, or the offset of an entry with d_ino == 0 at
* the start of a UFS_DIRBLKSIZ block.
*
* To use the space, the caller may need to compact existing entries in
* the directory. The total number of bytes in all of the entries involved
* in the compaction is stored in *slotsize. In other words, all of
* the entries that must be compacted are exactly contained in the
* region beginning at the returned offset and spanning *slotsize bytes.
*
* Returns -1 if no space was found, indicating that the directory
* must be extended.
*/
doff_t
ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
{
struct direct *dp;
struct dirhash *dh;
struct buf *bp;
doff_t pos, slotstart;
int dirblock, error, freebytes, i;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return (-1);
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return (-1);
}
/* Find a directory block with the desired free space. */
dirblock = -1;
for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
if ((dirblock = dh->dh_firstfree[i]) != -1)
break;
if (dirblock == -1) {
DIRHASH_UNLOCK(dh);
return (-1);
}
KASSERT(dirblock < dh->dh_nblk &&
dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN));
pos = dirblock * dirblksiz;
error = ufs_blkatoff(ip->i_vnode, (off_t)pos, (void *)&dp, &bp, false);
if (error) {
DIRHASH_UNLOCK(dh);
return (-1);
}
/* Find the first entry with free space. */
for (i = 0; i < dirblksiz; ) {
if (dp->d_reclen == 0) {
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (-1);
}
if (dp->d_ino == 0 || dp->d_reclen > UFS_DIRSIZ(0, dp, needswap))
break;
i += dp->d_reclen;
dp = (struct direct *)((char *)dp + dp->d_reclen);
}
if (i > dirblksiz) {
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (-1);
}
slotstart = pos + i;
/* Find the range of entries needed to get enough space */
freebytes = 0;
while (i < dirblksiz && freebytes < slotneeded) {
freebytes += dp->d_reclen;
if (dp->d_ino != 0)
freebytes -= UFS_DIRSIZ(0, dp, needswap);
if (dp->d_reclen == 0) {
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (-1);
}
i += dp->d_reclen;
dp = (struct direct *)((char *)dp + dp->d_reclen);
}
if (i > dirblksiz) {
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (-1);
}
if (freebytes < slotneeded)
panic("ufsdirhash_findfree: free mismatch");
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
*slotsize = pos + i - slotstart;
return (slotstart);
}
/*
* Return the start of the unused space at the end of a directory, or
* -1 if there are no trailing unused blocks.
*/
doff_t
ufsdirhash_enduseful(struct inode *ip)
{
struct dirhash *dh;
int i;
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return (-1);
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return (-1);
}
if (dh->dh_blkfree[dh->dh_dirblks - 1] != dirblksiz / DIRALIGN) {
DIRHASH_UNLOCK(dh);
return (-1);
}
for (i = dh->dh_dirblks - 1; i >= 0; i--)
if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN)
break;
DIRHASH_UNLOCK(dh);
return ((doff_t)(i + 1) * dirblksiz);
}
/*
* Insert information into the hash about a new directory entry. dirp
* points to a struct direct containing the entry, and offset specifies
* the offset of this entry.
*/
void
ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
{
struct dirhash *dh;
int slot;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(offset < dh->dh_dirblks * dirblksiz);
/*
* Normal hash usage is < 66%. If the usage gets too high then
* remove the hash entirely and let it be rebuilt later.
*/
if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
/* Find a free hash slot (empty or deleted), and add the entry. */
slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
while (DH_ENTRY(dh, slot) >= 0)
slot = WRAPINCR(slot, dh->dh_hlen);
if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
dh->dh_hused++;
DH_ENTRY(dh, slot) = offset;
/* Update the per-block summary info. */
ufsdirhash_adjfree(dh, offset, -UFS_DIRSIZ(0, dirp, needswap), dirblksiz);
DIRHASH_UNLOCK(dh);
}
/*
* Remove the specified directory entry from the hash. The entry to remove
* is defined by the name in `dirp', which must exist at the specified
* `offset' within the directory.
*/
void
ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
{
struct dirhash *dh;
int slot;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(offset < dh->dh_dirblks * dirblksiz);
/* Find the entry */
slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
/* Remove the hash entry. */
ufsdirhash_delslot(dh, slot);
/* Update the per-block summary info. */
ufsdirhash_adjfree(dh, offset, UFS_DIRSIZ(0, dirp, needswap), dirblksiz);
DIRHASH_UNLOCK(dh);
}
/*
* Change the offset associated with a directory entry in the hash. Used
* when compacting directory blocks.
*/
void
ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
doff_t newoff)
{
struct dirhash *dh;
int slot;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(oldoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz &&
newoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz);
/* Find the entry, and update the offset. */
slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
DH_ENTRY(dh, slot) = newoff;
DIRHASH_UNLOCK(dh);
}
/*
* Inform dirhash that the directory has grown by one block that
* begins at offset (i.e. the new length is offset + UFS_DIRBLKSIZ).
*/
void
ufsdirhash_newblk(struct inode *ip, doff_t offset)
{
struct dirhash *dh;
int block;
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(offset == dh->dh_dirblks * dirblksiz);
block = offset / dirblksiz;
if (block >= dh->dh_nblk) {
/* Out of space; must rebuild. */
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
dh->dh_dirblks = block + 1;
/* Account for the new free block. */
dh->dh_blkfree[block] = dirblksiz / DIRALIGN;
if (dh->dh_firstfree[DH_NFSTATS] == -1)
dh->dh_firstfree[DH_NFSTATS] = block;
DIRHASH_UNLOCK(dh);
}
/*
* Inform dirhash that the directory is being truncated.
*/
void
ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
{
struct dirhash *dh;
int block, i;
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(offset <= dh->dh_dirblks * dirblksiz);
block = howmany(offset, dirblksiz);
/*
* If the directory shrinks to less than 1/8 of dh_nblk blocks
* (about 20% of its original size due to the 50% extra added in
* ufsdirhash_build) then free it, and let the caller rebuild
* if necessary.
*/
if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
/*
* Remove any `first free' information pertaining to the
* truncated blocks. All blocks we're removing should be
* completely unused.
*/
if (dh->dh_firstfree[DH_NFSTATS] >= block)
dh->dh_firstfree[DH_NFSTATS] = -1;
for (i = block; i < dh->dh_dirblks; i++)
if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN)
panic("ufsdirhash_dirtrunc: blocks in use");
for (i = 0; i < DH_NFSTATS; i++)
if (dh->dh_firstfree[i] >= block)
panic("ufsdirhash_dirtrunc: first free corrupt");
dh->dh_dirblks = block;
DIRHASH_UNLOCK(dh);
}
/*
* Debugging function to check that the dirhash information about
* a directory block matches its actual contents. Panics if a mismatch
* is detected.
*
* On entry, `sbuf' should point to the start of an in-core
* DIRBLKSIZ-sized directory block, and `offset' should contain the
* offset from the start of the directory of that block.
*/
void
ufsdirhash_checkblock(struct inode *ip, char *sbuf, doff_t offset)
{
struct dirhash *dh;
struct direct *dp;
int block, ffslot, i, nfree;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if (!ufs_dirhashcheck)
return;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
block = offset / dirblksiz;
if ((offset & (dirblksiz - 1)) != 0 || block >= dh->dh_dirblks)
panic("ufsdirhash_checkblock: bad offset");
nfree = 0;
for (i = 0; i < dirblksiz; i += dp->d_reclen) {
dp = (struct direct *)(sbuf + i);
if (dp->d_reclen == 0 || i + dp->d_reclen > dirblksiz)
panic("ufsdirhash_checkblock: bad dir");
if (dp->d_ino == 0) {
#if 0
/*
* XXX entries with d_ino == 0 should only occur
* at the start of a DIRBLKSIZ block. However the
* ufs code is tolerant of such entries at other
* offsets, and fsck does not fix them.
*/
if (i != 0)
panic("ufsdirhash_checkblock: bad dir inode");
#endif
nfree += dp->d_reclen;
continue;
}
/* Check that the entry exists (will panic if it doesn't). */
ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
nfree += dp->d_reclen - UFS_DIRSIZ(0, dp, needswap);
}
if (i != dirblksiz)
panic("ufsdirhash_checkblock: bad dir end");
if (dh->dh_blkfree[block] * DIRALIGN != nfree)
panic("ufsdirhash_checkblock: bad free count");
ffslot = BLKFREE2IDX(nfree / DIRALIGN);
for (i = 0; i <= DH_NFSTATS; i++)
if (dh->dh_firstfree[i] == block && i != ffslot)
panic("ufsdirhash_checkblock: bad first-free");
if (dh->dh_firstfree[ffslot] == -1)
panic("ufsdirhash_checkblock: missing first-free entry");
DIRHASH_UNLOCK(dh);
}
/*
* Hash the specified filename into a dirhash slot.
*/
static int
ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen)
{
u_int32_t hash;
/*
* We hash the name and then some other bit of data that is
* invariant over the dirhash's lifetime. Otherwise names
* differing only in the last byte are placed close to one
* another in the table, which is bad for linear probing.
*/
hash = hash32_buf(name, namelen, HASH32_BUF_INIT);
hash = hash32_buf(&dh, sizeof(dh), hash);
return (hash % dh->dh_hlen);
}
/*
* Adjust the number of free bytes in the block containing `offset'
* by the value specified by `diff'.
*
* The caller must ensure we have exclusive access to `dh'; normally
* that means that dh_lock should be held, but this is also called
* from ufsdirhash_build() where exclusive access can be assumed.
*/
static void
ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, int dirblksiz)
{
int block, i, nfidx, ofidx;
KASSERT(mutex_owned(&dh->dh_lock));
/* Update the per-block summary info. */
block = offset / dirblksiz;
KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks);
ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);
/* Update the `first free' list if necessary. */
if (ofidx != nfidx) {
/* If removing, scan forward for the next block. */
if (dh->dh_firstfree[ofidx] == block) {
for (i = block + 1; i < dh->dh_dirblks; i++)
if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
break;
dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
}
/* Make this the new `first free' if necessary */
if (dh->dh_firstfree[nfidx] > block ||
dh->dh_firstfree[nfidx] == -1)
dh->dh_firstfree[nfidx] = block;
}
}
/*
* Find the specified name which should have the specified offset.
* Returns a slot number, and panics on failure.
*
* `dh' must be locked on entry and remains so on return.
*/
static int
ufsdirhash_findslot(struct dirhash *dh, const char *name, int namelen,
doff_t offset)
{
int slot;
KASSERT(mutex_owned(&dh->dh_lock));
/* Find the entry. */
KASSERT(dh->dh_hused < dh->dh_hlen);
slot = ufsdirhash_hash(dh, name, namelen);
while (DH_ENTRY(dh, slot) != offset &&
DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
slot = WRAPINCR(slot, dh->dh_hlen);
if (DH_ENTRY(dh, slot) != offset)
panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
return (slot);
}
/*
* Remove the entry corresponding to the specified slot from the hash array.
*
* `dh' must be locked on entry and remains so on return.
*/
static void
ufsdirhash_delslot(struct dirhash *dh, int slot)
{
int i;
KASSERT(mutex_owned(&dh->dh_lock));
/* Mark the entry as deleted. */
DH_ENTRY(dh, slot) = DIRHASH_DEL;
/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
i = WRAPINCR(i, dh->dh_hlen);
if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
i = WRAPDECR(i, dh->dh_hlen);
while (DH_ENTRY(dh, i) == DIRHASH_DEL) {
DH_ENTRY(dh, i) = DIRHASH_EMPTY;
dh->dh_hused--;
i = WRAPDECR(i, dh->dh_hlen);
}
KASSERT(dh->dh_hused >= 0);
}
}
/*
* Given a directory entry and its offset, find the offset of the
* previous entry in the same UFS_DIRBLKSIZ-sized block. Returns an
* offset, or -1 if there is no previous entry in the block or some
* other problem occurred.
*/
static doff_t
ufsdirhash_getprev(struct direct *dirp, doff_t offset, int dirblksiz)
{
struct direct *dp;
char *blkbuf;
doff_t blkoff, prevoff;
int entrypos, i;
blkoff = offset & ~(dirblksiz - 1); /* offset of start of block */
entrypos = offset & (dirblksiz - 1); /* entry relative to block */
blkbuf = (char *)dirp - entrypos;
prevoff = blkoff;
/* If `offset' is the start of a block, there is no previous entry. */
if (entrypos == 0)
return (-1);
/* Scan from the start of the block until we get to the entry. */
for (i = 0; i < entrypos; i += dp->d_reclen) {
dp = (struct direct *)(blkbuf + i);
if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
return (-1); /* Corrupted directory. */
prevoff = blkoff + i;
}
return (prevoff);
}
/*
* Try to free up `wanted' bytes by stealing memory from existing
* dirhashes. Returns zero with list locked if successful.
*/
static int
ufsdirhash_recycle(int wanted)
{
struct dirhash *dh;
doff_t **hash;
u_int8_t *blkfree;
int i, mem, narrays;
size_t hashsz, blkfreesz;
DIRHASHLIST_LOCK();
while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
/* Find a dirhash, and lock it. */
if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
DIRHASHLIST_UNLOCK();
return (-1);
}
DIRHASH_LOCK(dh);
KASSERT(dh->dh_hash != NULL);
/* Decrement the score; only recycle if it becomes zero. */
if (--dh->dh_score > 0) {
DIRHASH_UNLOCK(dh);
DIRHASHLIST_UNLOCK();
return (-1);
}
/* Remove it from the list and detach its memory. */
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
dh->dh_onlist = 0;
hash = dh->dh_hash;
hashsz = dh->dh_hashsz;
dh->dh_hash = NULL;
blkfree = dh->dh_blkfree;
blkfreesz = dh->dh_blkfreesz;
dh->dh_blkfree = NULL;
narrays = dh->dh_narrays;
mem = narrays * sizeof(*dh->dh_hash) +
narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
dh->dh_nblk * sizeof(*dh->dh_blkfree);
/* Unlock everything, free the detached memory. */
DIRHASH_UNLOCK(dh);
DIRHASHLIST_UNLOCK();
for (i = 0; i < narrays; i++)
DIRHASH_BLKFREE(hash[i]);
kmem_free(hash, hashsz);
kmem_free(blkfree, blkfreesz);
/* Account for the returned memory, and repeat if necessary. */
DIRHASHLIST_LOCK();
atomic_add_int(&ufs_dirhashmem, -mem);
}
/* Success. */
return (0);
}
SYSCTL_SETUP(ufsdirhash_sysctl_init, "ufs_dirhash sysctl")
{
const struct sysctlnode *rnode, *cnode;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "ufs",
SYSCTL_DESCR("ufs"),
NULL, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "dirhash",
SYSCTL_DESCR("dirhash"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "minblocks",
SYSCTL_DESCR("minimum hashed directory size in blocks"),
NULL, 0, &ufs_dirhashminblks, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxmem",
SYSCTL_DESCR("maximum dirhash memory usage"),
NULL, 0, &ufs_dirhashmaxmem, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_INT, "memused",
SYSCTL_DESCR("current dirhash memory usage"),
NULL, 0, &ufs_dirhashmem, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "docheck",
SYSCTL_DESCR("enable extra sanity checks"),
NULL, 0, &ufs_dirhashcheck, 0,
CTL_CREATE, CTL_EOL);
}
void
ufsdirhash_init(void)
{
/*
* Only initialise the default dirhash size if it hasn't
* already been set.
*/
if (ufs_dirhashmaxmem == 0) {
/* Use 64-bit math to avoid overflows. */
uint64_t physmem_bytes, hash_bytes;
physmem_bytes = ctob((uint64_t)physmem);
hash_bytes = physmem_bytes / DIRHASH_DEFAULT_DIVIDER;
if (hash_bytes < MIN_DEFAULT_DIRHASH_MEM)
hash_bytes = 0;
if (hash_bytes > MAX_DEFAULT_DIRHASH_MEM)
hash_bytes = MAX_DEFAULT_DIRHASH_MEM;
ufs_dirhashmaxmem = (u_int)hash_bytes;
}
mutex_init(&ufsdirhash_lock, MUTEX_DEFAULT, IPL_NONE);
ufsdirhashblk_cache = pool_cache_init(DH_NBLKOFF * sizeof(daddr_t), 0,
0, 0, "dirhashblk", NULL, IPL_NONE, NULL, NULL, NULL);
ufsdirhash_cache = pool_cache_init(sizeof(struct dirhash), 0,
0, 0, "dirhash", NULL, IPL_NONE, NULL, NULL, NULL);
TAILQ_INIT(&ufsdirhash_list);
}
void
ufsdirhash_done(void)
{
KASSERT(TAILQ_EMPTY(&ufsdirhash_list));
pool_cache_destroy(ufsdirhashblk_cache);
pool_cache_destroy(ufsdirhash_cache);
mutex_destroy(&ufsdirhash_lock);
}
/* $NetBSD: layer_vnops.c,v 1.72 2021/10/20 03:08:18 thorpej Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* John Heidemann of the UCLA Ficus project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)null_vnops.c 8.6 (Berkeley) 5/27/95
*
* Ancestors:
* @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92
* Id: lofs_vnops.c,v 1.11 1992/05/30 10:05:43 jsp Exp jsp
* ...and...
* @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project
*/
/*
* Generic layer vnode operations.
*
* The layer.h, layer_extern.h, layer_vfs.c, and layer_vnops.c files provide
* the core implementation of stacked file-systems.
*
* The layerfs duplicates a portion of the file system name space under
* a new name. In this respect, it is similar to the loopback file system.
* It differs from the loopback fs in two respects: it is implemented using
* a stackable layers technique, and its "layerfs-nodes" stack above all
* lower-layer vnodes, not just over directory vnodes.
*
* OPERATION OF LAYERFS
*
* The layerfs is the minimum file system layer, bypassing all possible
* operations to the lower layer for processing there. The majority of its
* activity centers on the bypass routine, through which nearly all vnode
* operations pass.
*
* The bypass routine accepts arbitrary vnode operations for handling by
* the lower layer. It begins by examining vnode operation arguments and
* replacing any layered nodes by their lower-layer equivalents. It then
* invokes an operation on the lower layer. Finally, it replaces the
* layered nodes in the arguments and, if a vnode is returned by the
* operation, stacks a layered node on top of the returned vnode.
*
* The bypass routine in this file, layer_bypass(), is suitable for use
* by many different layered filesystems. It can be used by multiple
* filesystems simultaneously. Alternatively, a layered fs may provide
* its own bypass routine, in which case layer_bypass() should be used as
* a model. For instance, the main functionality provided by umapfs, the user
* identity mapping file system, is handled by a custom bypass routine.
*
* Typically a layered fs registers its selected bypass routine as the
* default vnode operation in its vnodeopv_entry_desc table. Additionally
* the filesystem must store the bypass entry point in the layerm_bypass
* field of struct layer_mount. All other layer routines in this file will
* use the layerm_bypass() routine.
*
* Although the bypass routine handles most operations outright, a number
* of operations are special cased and handled by the layerfs. For instance,
* layer_getattr() must change the fsid being returned, while layer_lock()
* and layer_unlock() must handle any locking for the current vnode as well
* as pass the lock request down. layer_inactive() and layer_reclaim() are
* not bypassed so that they can handle freeing layerfs-specific data. Also,
* certain vnode operations (create, mknod, remove, link, rename, mkdir,
* rmdir, and symlink) change the locking state within the operation. Ideally
* these operations should not change the lock state, but should be changed
* to let the caller of the function unlock them. Otherwise, all intermediate
* vnode layers (such as union, umapfs, etc) must catch these functions to do
* the necessary locking at their layer.
*
* INSTANTIATING VNODE STACKS
*
* Mounting associates the "layerfs-nodes" stack with a lower layer, in effect
* stacking two VFSes. The initial mount creates a single vnode stack for
* the root of the new layerfs. All other vnode stacks are created as a
* result of vnode operations on this or other layerfs vnode stacks.
*
* New vnode stacks come into existence as a result of an operation which
* returns a vnode. The bypass routine stacks a layerfs-node above the new
* vnode before returning it to the caller.
*
* For example, imagine mounting a null layer with:
*
* "mount_null /usr/include /dev/layer/null"
*
* Changing directory to /dev/layer/null will assign the root layerfs-node
* (which was created when the null layer was mounted). Now consider opening
* "sys". A layer_lookup() would be performed on the root layerfs-node.
* This operation would bypass through to the lower layer which would return
* a vnode representing the UFS "sys". Then, layer_bypass() builds a
* layerfs-node aliasing the UFS "sys" and returns this to the caller.
* Later operations on the layerfs-node "sys" will repeat this process when
* constructing other vnode stacks.
*
* INVOKING OPERATIONS ON LOWER LAYERS
*
* There are two techniques to invoke operations on a lower layer when the
* operation cannot be completely bypassed. Each method is appropriate in
* different situations. In both cases, it is the responsibility of the
* aliasing layer to make the operation arguments "correct" for the lower
* layer by mapping any vnode arguments to the lower layer.
*
* The first approach is to call the aliasing layer's bypass routine. This
* method is most suitable when you wish to invoke the operation currently
* being handled on the lower layer. It has the advantage that the bypass
* routine already must do argument mapping. An example of this is
* layer_getattr().
*
* A second approach is to directly invoke vnode operations on the lower
* layer with the VOP_OPERATIONNAME interface. The advantage of this method
* is that it is easy to invoke arbitrary operations on the lower layer.
* The disadvantage is that the vnode arguments must be manually mapped.
*/
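/*
 * Illustrative sketch only (not part of the original file): how a
 * hypothetical layered filesystem, here called "examplefs", might register
 * layer_bypass() as its default vnode operation and special-case only the
 * handful of operations described above.  The "examplefs_*" names are
 * assumptions for the example; real layered filesystems (nullfs, umapfs,
 * overlayfs) follow the same pattern with their own names.
 */
#if 0
int (**examplefs_vnodeop_p)(void *);
const struct vnodeopv_entry_desc examplefs_vnodeop_entries[] = {
	{ &vop_default_desc, layer_bypass },	/* everything else goes down */
	{ &vop_getattr_desc, layer_getattr },	/* must rewrite the fsid */
	{ &vop_lookup_desc, layer_lookup },	/* locking protocol */
	{ &vop_inactive_desc, layer_inactive },	/* layerfs-specific cleanup */
	{ &vop_reclaim_desc, layer_reclaim },
	{ NULL, NULL }
};
const struct vnodeopv_desc examplefs_vnodeop_opv_desc =
	{ &examplefs_vnodeop_p, examplefs_vnodeop_entries };
/* The mount code would also store layer_bypass in layerm_bypass. */
#endif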
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: layer_vnops.c,v 1.72 2021/10/20 03:08:18 thorpej Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/kmem.h>
#include <sys/buf.h>
#include <sys/kauth.h>
#include <sys/fcntl.h>
#include <sys/fstrans.h>
#include <miscfs/genfs/layer.h>
#include <miscfs/genfs/layer_extern.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
/*
* This is the 08-June-99 bypass routine, based on the 10-Apr-92 bypass
* routine by John Heidemann.
* The new element for this version is that the whole nullfs
* system gained the concept of locks on the lower node.
* The 10-Apr-92 version was optimized for speed, throwing away some
* safety checks. It should still always work, but it's not as
* robust to programmer errors.
*
* In general, we map all vnodes going down and unmap them on the way back.
*
* Also, some BSD vnode operations have the side effect of vrele'ing
* their arguments. With stacking, the reference counts are held
* by the upper node, not the lower one, so we must handle these
* side-effects here. This is not of concern in Sun-derived systems
* since there are no such side-effects.
*
* New for the 08-June-99 version: we also handle operations which unlock
* the passed-in node (typically they vput the node).
*
* This makes the following assumptions:
* - only one returned vpp
* - no INOUT vpp's (Sun's vop_open has one of these)
* - the vnode operation vector of the first vnode should be used
* to determine what implementation of the op should be invoked
* - all mapped vnodes are of our vnode-type (NEEDSWORK:
* problems on rmdir'ing mount points and renaming?)
*/
int
layer_bypass(void *v)
{
struct vop_generic_args /* {
struct vnodeop_desc *a_desc;
<other random data follows, presumably>
} */ *ap = v;
int (**our_vnodeop_p)(void *);
struct vnode **this_vp_p;
int error;
struct vnode *old_vps[VDESC_MAX_VPS], *vp0;
struct vnode **vps_p[VDESC_MAX_VPS];
struct vnode ***vppp;
struct mount *mp;
struct vnodeop_desc *descp = ap->a_desc;
int reles, i, flags;
#ifdef DIAGNOSTIC
/*
* We require at least one vp.
*/
if (descp->vdesc_vp_offsets == NULL ||
descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET)
panic("%s: no vp's in map.\n", __func__);
#endif
vps_p[0] =
VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap);
vp0 = *vps_p[0];
mp = vp0->v_mount;
flags = MOUNTTOLAYERMOUNT(mp)->layerm_flags;
our_vnodeop_p = vp0->v_op;
if (flags & LAYERFS_MBYPASSDEBUG)
printf("%s: %s\n", __func__, descp->vdesc_name);
/*
* Map the vnodes going in.
* Later, we'll invoke the operation based on
* the first mapped vnode's operation vector.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
vps_p[i] = this_vp_p =
VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[i],
ap);
/*
* We're not guaranteed that any but the first vnode
* are of our type. Check for and don't map any
* that aren't. (We must always map first vp or vclean fails.)
*/
if (i && (*this_vp_p == NULL ||
(*this_vp_p)->v_op != our_vnodeop_p)) {
old_vps[i] = NULL;
} else {
old_vps[i] = *this_vp_p;
*(vps_p[i]) = LAYERVPTOLOWERVP(*this_vp_p);
/*
* XXX - Several operations have the side effect
* of vrele'ing their vp's. We must account for
* that. (This should go away in the future.)
*/
if (reles & VDESC_VP0_WILLRELE)
vref(*this_vp_p);
}
}
/*
* Call the operation on the lower layer
* with the modified argument structure.
*/
error = VCALL(*vps_p[0], descp->vdesc_offset, ap);
/*
* Maintain the illusion of call-by-value
* by restoring vnodes in the argument structure
* to their original value.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
if (old_vps[i]) {
*(vps_p[i]) = old_vps[i];
if (reles & VDESC_VP0_WILLRELE)
vrele(*(vps_p[i]));
}
}
/*
* Map the possible out-going vpp
* (Assumes that the lower layer always returns
* a VREF'ed vpp unless it gets an error.)
*/
if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && !error) {
vppp = VOPARG_OFFSETTO(struct vnode***,
descp->vdesc_vpp_offset, ap);
/*
* Only vop_lookup, vop_create, vop_mkdir, vop_mknod
* and vop_symlink return vpp's. vop_lookup doesn't call bypass
* as a lookup on "." would generate a locking error.
* So all the calls which get us here have an unlocked vpp. :-)
*/
error = layer_node_create(mp, **vppp, *vppp);
if (error) {
vrele(**vppp);
**vppp = NULL;
}
}
return error;
}
/*
* We have to carry on the locking protocol on the layer vnodes
* as we progress through the tree. We also have to enforce read-only
* if this layer is mounted read-only.
*/
int
layer_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnodeop_desc *a_desc;
struct vnode * a_dvp;
struct vnode ** a_vpp;
struct componentname * a_cnp;
} */ *ap = v;
struct componentname *cnp = ap->a_cnp;
struct vnode *dvp, *lvp, *ldvp;
int error, flags = cnp->cn_flags;
dvp = ap->a_dvp;
if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
*ap->a_vpp = NULL;
return EROFS;
}
ldvp = LAYERVPTOLOWERVP(dvp);
ap->a_dvp = ldvp;
error = VCALL(ldvp, ap->a_desc->vdesc_offset, ap);
lvp = *ap->a_vpp;
*ap->a_vpp = NULL;
if (error == EJUSTRETURN && (flags & ISLASTCN) &&
(dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
error = EROFS;
/*
* We must do the same locking and unlocking at this layer as
* is done in the layers below us.
*/
if (ldvp == lvp) {
/*
* Got the same object back, because we looked up ".",
* or ".." in the root node of a mount point.
* So we make another reference to dvp and return it.
*/
vref(dvp);
*ap->a_vpp = dvp;
vrele(lvp);
} else if (lvp != NULL) {
/* Note: dvp and ldvp are both locked. */
KASSERT(error != ENOLCK);
error = layer_node_create(dvp->v_mount, lvp, ap->a_vpp);
if (error) {
vrele(lvp);
}
}
return error;
}
/*
* Setattr call. Disallow write attempts if the layer is mounted read-only.
*/
int
layer_setattr(void *v)
{
struct vop_setattr_args /* {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct vattr *vap = ap->a_vap;
if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
(vp->v_mount->mnt_flag & MNT_RDONLY))
return EROFS;
if (vap->va_size != VNOVAL) {
switch (vp->v_type) {
case VDIR:
return EISDIR;
case VCHR:
case VBLK:
case VSOCK:
case VFIFO:
return 0;
case VREG:
case VLNK:
default:
/*
* Disallow write attempts if the filesystem is
* mounted read-only.
*/
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
}
}
return LAYERFS_DO_BYPASS(vp, ap);
}
/*
* We handle getattr only to change the fsid.
*/
int
layer_getattr(void *v)
{
struct vop_getattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
int error;
error = LAYERFS_DO_BYPASS(vp, ap);
if (error) {
return error;
}
/* Requires that arguments be restored. */
ap->a_vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0];
return 0;
}
int
layer_access(void *v)
{
struct vop_access_args /* {
struct vnode *a_vp;
accmode_t a_accmode;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
accmode_t accmode = ap->a_accmode;
/*
* Disallow write attempts on read-only layers;
* unless the file is a socket, fifo, or a block or
* character device resident on the file system.
*/
if (accmode & VWRITE) {
switch (vp->v_type) {
case VDIR:
case VLNK:
case VREG:
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
break;
default:
break;
}
}
return LAYERFS_DO_BYPASS(vp, ap);
}
/*
* We must handle open to be able to catch MNT_NODEV and friends
* and increment the lower v_writecount.
*/
int
layer_open(void *v)
{
struct vop_open_args /* {
const struct vnodeop_desc *a_desc;
struct vnode *a_vp;
int a_mode;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct vnode *lvp = LAYERVPTOLOWERVP(vp);
int error;
if (((lvp->v_type == VBLK) || (lvp->v_type == VCHR)) &&
(vp->v_mount->mnt_flag & MNT_NODEV))
return ENXIO;
error = LAYERFS_DO_BYPASS(vp, ap);
if (error == 0 && (ap->a_mode & FWRITE)) {
mutex_enter(lvp->v_interlock);
lvp->v_writecount++;
mutex_exit(lvp->v_interlock);
}
return error;
}
/*
* We must handle close to decrement the lower v_writecount.
*/
int
layer_close(void *v)
{
struct vop_close_args /* {
const struct vnodeop_desc *a_desc;
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct vnode *lvp = LAYERVPTOLOWERVP(vp);
if ((ap->a_fflag & FWRITE)) {
mutex_enter(lvp->v_interlock);
KASSERT(lvp->v_writecount > 0);
lvp->v_writecount--;
mutex_exit(lvp->v_interlock);
}
return LAYERFS_DO_BYPASS(vp, ap);
}
/*
* If vinvalbuf is calling us, it's a "shallow fsync" -- don't bother
* syncing the underlying vnodes, since they'll be fsync'ed when
* reclaimed; otherwise, pass it through to the underlying layer.
*
* XXX Do we still need to worry about shallow fsync?
*/
int
layer_fsync(void *v)
{
struct vop_fsync_args /* {
struct vnode *a_vp;
kauth_cred_t a_cred;
int a_flags;
off_t offlo;
off_t offhi;
struct lwp *a_l;
} */ *ap = v;
int error;
if (ap->a_flags & FSYNC_RECLAIM) {
return 0;
}
if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR) {
error = spec_fsync(v);
if (error)
return error;
}
return LAYERFS_DO_BYPASS(ap->a_vp, ap);
}
int
layer_inactive(void *v)
{
struct vop_inactive_v2_args /* {
struct vnode *a_vp;
bool *a_recycle;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
/*
* If we did a remove, don't cache the node.
*/
*ap->a_recycle = ((VTOLAYER(vp)->layer_flags & LAYERFS_REMOVED) != 0);
/*
* Do nothing (and _don't_ bypass).
* Wait to vrele lowervp until reclaim,
* so that until then our layer_node is in the
* cache and reusable.
*
* NEEDSWORK: Someday, consider inactive'ing
* the lowervp and then trying to reactivate it
* with capabilities (v_id)
* like they do in the name lookup cache code.
* That's too much work for now.
*/
return 0;
}
int
layer_remove(void *v)
{
struct vop_remove_v3_args /* {
struct vnode *a_dvp;
struct vnode *a_vp;
struct componentname *a_cnp;
nlink_t ctx_vp_new_nlink;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
int error;
vref(vp);
error = LAYERFS_DO_BYPASS(vp, ap);
if (error == 0) {
VTOLAYER(vp)->layer_flags |= LAYERFS_REMOVED;
}
vrele(vp);
return error;
}
int
layer_rename(void *v)
{
struct vop_rename_args /* {
struct vnode *a_fdvp;
struct vnode *a_fvp;
struct componentname *a_fcnp;
struct vnode *a_tdvp;
struct vnode *a_tvp;
struct componentname *a_tcnp;
} */ *ap = v;
struct vnode *fdvp = ap->a_fdvp, *tvp;
int error;
tvp = ap->a_tvp;
if (tvp) {
if (tvp->v_mount != fdvp->v_mount)
tvp = NULL;
else
vref(tvp);
}
error = LAYERFS_DO_BYPASS(fdvp, ap);
if (tvp) {
if (error == 0)
VTOLAYER(tvp)->layer_flags |= LAYERFS_REMOVED;
vrele(tvp);
}
return error;
}
int
layer_rmdir(void *v)
{
struct vop_rmdir_v2_args /* {
struct vnode *a_dvp;
struct vnode *a_vp;
struct componentname *a_cnp;
} */ *ap = v;
int error;
struct vnode *vp = ap->a_vp;
vref(vp);
error = LAYERFS_DO_BYPASS(vp, ap);
if (error == 0) {
VTOLAYER(vp)->layer_flags |= LAYERFS_REMOVED;
}
vrele(vp);
return error;
}
int
layer_revoke(void *v)
{
struct vop_revoke_args /* {
struct vnode *a_vp;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct vnode *lvp = LAYERVPTOLOWERVP(vp);
int error;
/*
* We will most likely end up in vclean which uses the usecount
* to determine if a vnode is active. Take an extra reference on
* the lower vnode so it will always close and inactivate.
*/
vref(lvp);
error = LAYERFS_DO_BYPASS(vp, ap);
vrele(lvp);
return error;
}
int
layer_reclaim(void *v)
{
struct vop_reclaim_v2_args /* {
struct vnode *a_vp;
struct lwp *a_l;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct layer_mount *lmp = MOUNTTOLAYERMOUNT(vp->v_mount);
struct layer_node *xp = VTOLAYER(vp);
struct vnode *lowervp = xp->layer_lowervp;
VOP_UNLOCK(vp);
/*
* Note: in vop_reclaim, the node's struct lock has been
* decommissioned, so we have to be careful about calling
* VOP's on ourself. We must be careful as VXLOCK is set.
*/
if (vp == lmp->layerm_rootvp) {
/*
* Oops! We no longer have a root node. Most likely reason is
* that someone forcibly unmounted the underlying fs.
*
* Now getting the root vnode will fail. We're dead. :-(
*/
lmp->layerm_rootvp = NULL;
}
mutex_enter(vp->v_interlock);
KASSERT(vp->v_interlock == lowervp->v_interlock);
lowervp->v_writecount -= vp->v_writecount;
mutex_exit(vp->v_interlock);
/* After this assignment, this node will not be re-used. */
xp->layer_lowervp = NULL;
kmem_free(vp->v_data, lmp->layerm_size);
vp->v_data = NULL;
vrele(lowervp);
return 0;
}
/*
* We just feed the returned vnode up to the caller - there's no need
* to build a layer node on top of the node on which we're going to do
* i/o. :-)
*/
int
layer_bmap(void *v)
{
struct vop_bmap_args /* {
struct vnode *a_vp;
daddr_t a_bn;
struct vnode **a_vpp;
daddr_t *a_bnp;
int *a_runp;
} */ *ap = v;
struct vnode *vp;
vp = LAYERVPTOLOWERVP(ap->a_vp);
ap->a_vp = vp;
return VCALL(vp, ap->a_desc->vdesc_offset, ap);
}
int
layer_print(void *v)
{
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
printf ("\ttag VT_LAYERFS, vp=%p, lowervp=%p\n", vp, LAYERVPTOLOWERVP(vp));
return 0;
}
int
layer_getpages(void *v)
{
struct vop_getpages_args /* {
struct vnode *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct mount *mp = vp->v_mount;
int error;
krw_t op;
KASSERT(rw_lock_held(vp->v_uobj.vmobjlock));
if (ap->a_flags & PGO_LOCKED) {
return EBUSY;
}
ap->a_vp = LAYERVPTOLOWERVP(vp);
KASSERT(vp->v_uobj.vmobjlock == ap->a_vp->v_uobj.vmobjlock);
/* Just pass the request on to the underlying layer. */
op = rw_lock_op(vp->v_uobj.vmobjlock);
rw_exit(vp->v_uobj.vmobjlock);
fstrans_start(mp);
rw_enter(vp->v_uobj.vmobjlock, op);
if (mp == vp->v_mount) {
/* Will release the lock. */
error = VCALL(ap->a_vp, VOFFSET(vop_getpages), ap);
} else {
rw_exit(vp->v_uobj.vmobjlock);
error = ENOENT;
}
fstrans_done(mp);
return error;
}
int
layer_putpages(void *v)
{
struct vop_putpages_args /* {
struct vnode *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
KASSERT(rw_write_held(vp->v_uobj.vmobjlock));
ap->a_vp = LAYERVPTOLOWERVP(vp);
KASSERT(vp->v_uobj.vmobjlock == ap->a_vp->v_uobj.vmobjlock);
if (ap->a_flags & PGO_RECLAIM) {
rw_exit(vp->v_uobj.vmobjlock);
return 0;
}
/* Just pass the request on to the underlying layer. */
return VCALL(ap->a_vp, VOFFSET(vop_putpages), ap);
}
/* $NetBSD: uvm_vnode.c,v 1.121 2024/04/05 13:05:41 riastradh Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993
* The Regents of the University of California.
* Copyright (c) 1990 University of Utah.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vnode_pager.c 8.8 (Berkeley) 2/13/94
* from: Id: uvm_vnode.c,v 1.1.2.26 1998/02/02 20:38:07 chuck Exp
*/
/*
* uvm_vnode.c: the vnode pager.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_vnode.c,v 1.121 2024/04/05 13:05:41 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_uvmhist.h"
#endif
#include <sys/atomic.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/conf.h>
#include <sys/pool.h>
#include <sys/mount.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_page_array.h>
#ifdef UVMHIST
UVMHIST_DEFINE(ubchist);
#endif
/*
* functions
*/
static void uvn_alloc_ractx(struct uvm_object *);
static void uvn_detach(struct uvm_object *);
static int uvn_get(struct uvm_object *, voff_t, struct vm_page **, int *,
int, vm_prot_t, int, int);
static void uvn_markdirty(struct uvm_object *);
static int uvn_put(struct uvm_object *, voff_t, voff_t, int);
static void uvn_reference(struct uvm_object *);
static int uvn_findpage(struct uvm_object *, voff_t, struct vm_page **,
unsigned int, struct uvm_page_array *a,
unsigned int);
/*
* master pager structure
*/
const struct uvm_pagerops uvm_vnodeops = {
.pgo_reference = uvn_reference,
.pgo_detach = uvn_detach,
.pgo_get = uvn_get,
.pgo_put = uvn_put,
.pgo_markdirty = uvn_markdirty,
};
/*
* the ops!
*/
/*
* uvn_reference
*
* duplicate a reference to a VM object. Note that the reference
* count must already be at least one (the passed in reference) so
* there is no chance of the uvn being killed or locked out here.
*
* => caller must call with object unlocked.
* => caller must be using the same accessprot as was used at attach time
*/
static void
uvn_reference(struct uvm_object *uobj)
{
vref((struct vnode *)uobj);
}
/*
* uvn_detach
*
* remove a reference to a VM object.
*
* => caller must call with object unlocked and map locked.
*/
static void
uvn_detach(struct uvm_object *uobj)
{
vrele((struct vnode *)uobj);
}
/*
* uvn_put: flush page data to backing store.
*
* => object must be locked on entry! VOP_PUTPAGES must unlock it.
* => flags: PGO_SYNCIO -- use sync. I/O
*/
static int
uvn_put(struct uvm_object *uobj, voff_t offlo, voff_t offhi, int flags)
{
struct vnode *vp = (struct vnode *)uobj;
int error;
KASSERT(rw_write_held(uobj->vmobjlock));
error = VOP_PUTPAGES(vp, offlo, offhi, flags);
return error;
}
/*
* uvn_get: get pages (synchronously) from backing store
*
* => prefer map unlocked (not required)
* => object must be locked! we will _unlock_ it before starting any I/O.
* => flags: PGO_LOCKED: fault data structures are locked
* => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx]
* => NOTE: caller must check for released pages!!
*/
static int
uvn_get(struct uvm_object *uobj, voff_t offset,
struct vm_page **pps /* IN/OUT */,
int *npagesp /* IN (OUT if PGO_LOCKED)*/,
int centeridx, vm_prot_t access_type, int advice, int flags)
{
struct vnode *vp = (struct vnode *)uobj;
int error;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(ubchist, "vp %#jx off %#jx", (uintptr_t)vp, offset,
0, 0);
if (vp->v_type == VREG && (access_type & VM_PROT_WRITE) == 0
&& (flags & PGO_LOCKED) == 0 && vp->v_tag != VT_TMPFS) {
uvn_alloc_ractx(uobj);
uvm_ra_request(vp->v_ractx, advice, uobj, offset,
*npagesp << PAGE_SHIFT);
}
error = VOP_GETPAGES(vp, offset, pps, npagesp, centeridx,
access_type, advice, flags);
if (flags & PGO_LOCKED)
KASSERT(rw_lock_held(uobj->vmobjlock));
return error;
}
/*
* uvn_markdirty: called when the object gains first dirty page
*
* => uobj must be write locked.
*/
static void
uvn_markdirty(struct uvm_object *uobj)
{
struct vnode *vp = (struct vnode *)uobj;
KASSERT(rw_write_held(uobj->vmobjlock));
mutex_enter(vp->v_interlock);
if ((vp->v_iflag & VI_ONWORKLST) == 0) {
vn_syncer_add_to_worklist(vp, filedelay);
}
mutex_exit(vp->v_interlock);
}
/*
* uvn_findpages:
* return the page for the uobj and offset requested, allocating if needed.
* => uobj must be locked.
* => returned pages will be BUSY.
*/
int
uvn_findpages(struct uvm_object *uobj, voff_t offset, unsigned int *npagesp,
struct vm_page **pgs, struct uvm_page_array *a, unsigned int flags)
{
unsigned int count, found, npages;
int i, rv;
struct uvm_page_array a_store;
if (a == NULL) {
/*
* XXX fragile API
* note that the array can be the one supplied by the caller of
* uvn_findpages. in that case, fillflags used by the caller
* might not match strictly with ours.
* in particular, the caller might have filled the array
* without DENSE but passed us UFP_DIRTYONLY (thus DENSE).
*/
const unsigned int fillflags =
((flags & UFP_BACKWARD) ? UVM_PAGE_ARRAY_FILL_BACKWARD : 0) |
((flags & UFP_DIRTYONLY) ?
(UVM_PAGE_ARRAY_FILL_DIRTY|UVM_PAGE_ARRAY_FILL_DENSE) : 0);
a = &a_store;
uvm_page_array_init(a, uobj, fillflags);
}
count = found = 0;
npages = *npagesp;
if (flags & UFP_BACKWARD) {
for (i = npages - 1; i >= 0; i--, offset -= PAGE_SIZE) {
rv = uvn_findpage(uobj, offset, &pgs[i], flags, a,
i + 1);
if (rv == 0) {
if (flags & UFP_DIRTYONLY)
break;
} else
found++;
count++;
}
} else {
for (i = 0; i < npages; i++, offset += PAGE_SIZE) {
rv = uvn_findpage(uobj, offset, &pgs[i], flags, a,
npages - i);
if (rv == 0) {
if (flags & UFP_DIRTYONLY)
break;
} else
found++;
count++;
}
}
if (a == &a_store) {
uvm_page_array_fini(a);
}
*npagesp = count;
return (found);
}
/*
* uvn_findpage: find a single page
*
* if a suitable page was found, put it in *pgp and return 1.
* otherwise return 0.
*/
static int
uvn_findpage(struct uvm_object *uobj, voff_t offset, struct vm_page **pgp,
unsigned int flags, struct uvm_page_array *a, unsigned int nleft)
{
struct vm_page *pg;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(ubchist, "vp %#jx off %#jx", (uintptr_t)uobj, offset,
0, 0);
/*
* NOBUSY must come with NOWAIT and NOALLOC. if NOBUSY is
* specified, this may be called with a reader lock.
*/
KASSERT(rw_lock_held(uobj->vmobjlock));
KASSERT((flags & UFP_NOBUSY) == 0 || (flags & UFP_NOWAIT) != 0);
KASSERT((flags & UFP_NOBUSY) == 0 || (flags & UFP_NOALLOC) != 0);
KASSERT((flags & UFP_NOBUSY) != 0 || rw_write_held(uobj->vmobjlock));
if (*pgp != NULL) {
UVMHIST_LOG(ubchist, "dontcare", 0,0,0,0);
goto skip_offset;
}
for (;;) {
/*
* look for an existing page.
*/
pg = uvm_page_array_fill_and_peek(a, offset, nleft);
if (pg != NULL && pg->offset != offset) {
struct vm_page __diagused *tpg;
KASSERT(
((a->ar_flags & UVM_PAGE_ARRAY_FILL_BACKWARD) != 0)
== (pg->offset < offset));
KASSERT((tpg = uvm_pagelookup(uobj, offset)) == NULL ||
((a->ar_flags & UVM_PAGE_ARRAY_FILL_DIRTY) != 0 &&
!uvm_obj_page_dirty_p(tpg)));
pg = NULL;
if ((a->ar_flags & UVM_PAGE_ARRAY_FILL_DENSE) != 0) {
UVMHIST_LOG(ubchist, "dense", 0,0,0,0);
return 0;
}
}
/* nope? allocate one now */
if (pg == NULL) {
if (flags & UFP_NOALLOC) {
UVMHIST_LOG(ubchist, "noalloc", 0,0,0,0);
return 0;
}
pg = uvm_pagealloc(uobj, offset, NULL,
UVM_FLAG_COLORMATCH);
if (pg == NULL) {
if (flags & UFP_NOWAIT) {
UVMHIST_LOG(ubchist, "nowait",0,0,0,0);
return 0;
}
rw_exit(uobj->vmobjlock);
uvm_wait("uvnfp1");
uvm_page_array_clear(a);
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
UVMHIST_LOG(ubchist, "alloced %#jx (color %ju)",
(uintptr_t)pg, VM_PGCOLOR(pg), 0, 0);
KASSERTMSG(uvm_pagegetdirty(pg) ==
UVM_PAGE_STATUS_CLEAN, "page %p not clean", pg);
break;
} else if (flags & UFP_NOCACHE) {
UVMHIST_LOG(ubchist, "nocache",0,0,0,0);
goto skip;
}
/* page is there, see if we need to wait on it */
if ((pg->flags & PG_BUSY) != 0) {
if (flags & UFP_NOWAIT) {
UVMHIST_LOG(ubchist, "nowait",0,0,0,0);
goto skip;
}
UVMHIST_LOG(ubchist, "wait %#jx (color %ju)",
(uintptr_t)pg, VM_PGCOLOR(pg), 0, 0);
uvm_pagewait(pg, uobj->vmobjlock, "uvnfp2");
uvm_page_array_clear(a);
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
/* skip PG_RDONLY pages if requested */
if ((flags & UFP_NORDONLY) && (pg->flags & PG_RDONLY)) {
UVMHIST_LOG(ubchist, "nordonly",0,0,0,0);
goto skip;
}
/* stop on clean pages if requested */
if (flags & UFP_DIRTYONLY) {
const bool dirty = uvm_pagecheckdirty(pg, false);
if (!dirty) {
UVMHIST_LOG(ubchist, "dirtonly", 0,0,0,0);
return 0;
}
}
/* mark the page BUSY and we're done. */
if ((flags & UFP_NOBUSY) == 0) {
pg->flags |= PG_BUSY;
UVM_PAGE_OWN(pg, "uvn_findpage");
}
UVMHIST_LOG(ubchist, "found %#jx (color %ju)",
(uintptr_t)pg, VM_PGCOLOR(pg), 0, 0);
uvm_page_array_advance(a);
break;
}
*pgp = pg;
return 1;
skip_offset:
/*
* skip this offset
*/
pg = uvm_page_array_peek(a);
if (pg != NULL) {
if (pg->offset == offset) {
uvm_page_array_advance(a);
} else {
KASSERT((a->ar_flags & UVM_PAGE_ARRAY_FILL_DENSE) == 0);
}
}
return 0;
skip:
/*
* skip this page
*/
KASSERT(pg != NULL);
uvm_page_array_advance(a);
return 0;
}
/*
* uvm_vnp_setsize: grow or shrink a vnode uobj
*
* grow => just update size value
* shrink => toss un-needed pages
*
* => we assume that the caller has a reference of some sort to the
* vnode in question so that it will not be yanked out from under
* us.
*/
void
uvm_vnp_setsize(struct vnode *vp, voff_t newsize)
{
struct uvm_object *uobj = &vp->v_uobj;
voff_t pgend = round_page(newsize);
voff_t oldsize;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
rw_enter(uobj->vmobjlock, RW_WRITER);
UVMHIST_LOG(ubchist, "vp %#jx old %#jx new %#jx",
(uintptr_t)vp, vp->v_size, newsize, 0);
/*
* now check if the size has changed: if we shrink we had better
* toss some pages...
*/
KASSERT(newsize != VSIZENOTSET);
KASSERT(newsize >= 0);
KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p"
" v_size=0x%llx v_writesize=0x%llx", vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize);
KASSERTMSG((vp->v_size == vp->v_writesize ||
newsize == vp->v_writesize || newsize <= vp->v_size),
"vp=%p v_size=0x%llx v_writesize=0x%llx newsize=0x%llx",
vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize,
(unsigned long long)newsize);
oldsize = vp->v_writesize;
/*
* check whether size shrinks
* if old size hasn't been set, there are no pages to drop
* if there was an integer overflow in pgend, then this is no shrink
*/
if (oldsize > pgend && oldsize != VSIZENOTSET && pgend >= 0) {
(void) uvn_put(uobj, pgend, 0, PGO_FREE | PGO_SYNCIO);
rw_enter(uobj->vmobjlock, RW_WRITER);
}
mutex_enter(vp->v_interlock);
vp->v_size = vp->v_writesize = newsize;
mutex_exit(vp->v_interlock);
rw_exit(uobj->vmobjlock);
}
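#if 0
/*
 * Illustrative only (not part of the original file): a filesystem's
 * truncate/extend path would typically call uvm_vnp_setsize() after
 * committing the new length; "vp" and "new_length" are placeholders.
 */
uvm_vnp_setsize(vp, new_length);
#endif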
void
uvm_vnp_setwritesize(struct vnode *vp, voff_t newsize)
{
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
KASSERT(newsize != VSIZENOTSET);
KASSERT(newsize >= 0);
KASSERT(vp->v_size != VSIZENOTSET);
KASSERT(vp->v_writesize != VSIZENOTSET);
KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p"
" v_size=0x%llx v_writesize=0x%llx newsize=0x%llx", vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize,
(unsigned long long)newsize);
KASSERTMSG(vp->v_size <= newsize, "vp=%p"
" v_size=0x%llx v_writesize=0x%llx newsize=0x%llx", vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize,
(unsigned long long)newsize);
mutex_enter(vp->v_interlock);
vp->v_writesize = newsize;
mutex_exit(vp->v_interlock);
rw_exit(vp->v_uobj.vmobjlock);
}
bool
uvn_text_p(struct uvm_object *uobj)
{
struct vnode *vp = (struct vnode *)uobj;
int iflag;
/*
* v_interlock is not held here, but VI_EXECMAP is only ever changed
* with the vmobjlock held too.
*/
iflag = atomic_load_relaxed(&vp->v_iflag);
return (iflag & VI_EXECMAP) != 0;
}
static void
uvn_alloc_ractx(struct uvm_object *uobj)
{
struct vnode *vp = (struct vnode *)uobj;
struct uvm_ractx *ra = NULL;
KASSERT(rw_write_held(uobj->vmobjlock));
if (vp->v_type != VREG) {
return;
}
if (vp->v_ractx != NULL) {
return;
}
if (vp->v_ractx == NULL) {
rw_exit(uobj->vmobjlock);
ra = uvm_ra_allocctx();
rw_enter(uobj->vmobjlock, RW_WRITER);
if (ra != NULL && vp->v_ractx == NULL) {
vp->v_ractx = ra;
ra = NULL;
}
}
if (ra != NULL) {
uvm_ra_freectx(ra);
}
}
/* $NetBSD: uvm_km.c,v 1.165 2023/04/09 09:00:56 riastradh Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_kern.c 8.3 (Berkeley) 1/12/94
* from: Id: uvm_km.c,v 1.1.2.14 1998/02/06 05:19:27 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_km.c: handle kernel memory allocation and management
*/
/*
* overview of kernel memory management:
*
* the kernel virtual address space is mapped by "kernel_map." kernel_map
* starts at VM_MIN_KERNEL_ADDRESS and goes to VM_MAX_KERNEL_ADDRESS.
* note that VM_MIN_KERNEL_ADDRESS is equal to vm_map_min(kernel_map).
*
* the kernel_map has several "submaps." submaps can only appear in
* the kernel_map (user processes can't use them). submaps "take over"
* the management of a sub-range of the kernel's address space. submaps
* are typically allocated at boot time and are never released. kernel
* virtual address space that is mapped by a submap is locked by the
* submap's lock -- not the kernel_map's lock.
*
* thus, the useful feature of submaps is that they allow us to break
* up the locking and protection of the kernel address space into smaller
* chunks.
*
* the vm system has several standard kernel submaps/arenas, including:
* kmem_arena => used for kmem/pool (memoryallocators(9))
* pager_map => used to map "buf" structures into kernel space
* exec_map => used during exec to handle exec args
* etc...
*
* The kmem_arena is a "special submap", as it lives in a fixed map entry
* within the kernel_map and is controlled by vmem(9).
*
* the kernel allocates its private memory out of special uvm_objects whose
* reference count is set to UVM_OBJ_KERN (thus indicating that the objects
* are "special" and never die). all kernel objects should be thought of
* as large, fixed-sized, sparsely populated uvm_objects. each kernel
* object is equal to the size of kernel virtual address space (i.e. the
* value "VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS").
*
* note that just because a kernel object spans the entire kernel virtual
* address space doesn't mean that it has to be mapped into the entire space.
* large chunks of a kernel object's space go unused either because
* that area of kernel VM is unmapped, or there is some other type of
* object mapped into that range (e.g. a vnode). for submap's kernel
* objects, the only part of the object that can ever be populated is the
* offsets that are managed by the submap.
*
* note that the "offset" in a kernel object is always the kernel virtual
* address minus the VM_MIN_KERNEL_ADDRESS (aka vm_map_min(kernel_map)).
* example:
* suppose VM_MIN_KERNEL_ADDRESS is 0xf8000000 and the kernel does a
* uvm_km_alloc(kernel_map, PAGE_SIZE) [allocate 1 wired down page in the
* kernel map]. if uvm_km_alloc returns virtual address 0xf8235000,
* then that means that the page at offset 0x235000 in kernel_object is
* mapped at 0xf8235000.
*
* kernel objects have one other special property: when the kernel virtual
* memory mapping them is unmapped, the backing memory in the object is
* freed right away. this is done with the uvm_km_pgremove() function.
* this has to be done because there is no backing store for kernel pages
* and no need to save them after they are no longer referenced.
*
* Generic arenas:
*
* kmem_arena:
* Main arena controlling the kernel KVA used by other arenas.
*
* kmem_va_arena:
* Implements quantum caching in order to speed up allocations and
* reduce fragmentation. The pool(9), unless created with a custom
* meta-data allocator, and kmem(9) subsystems use this arena.
*
* Arenas for meta-data allocations are used by vmem(9) and pool(9).
* These arenas cannot use quantum cache. However, kmem_va_meta_arena
* compensates for this by importing larger chunks from kmem_arena.
*
* kmem_va_meta_arena:
* Space for meta-data.
*
* kmem_meta_arena:
* Imports from kmem_va_meta_arena. Allocations from this arena are
* backed with the pages.
*
* Arena stacking:
*
* kmem_arena
* kmem_va_arena
* kmem_va_meta_arena
* kmem_meta_arena
*/
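/*
 * Illustrative sketch only (not part of the original file): most kernel
 * code uses these arenas indirectly through kmem(9); a wired allocation
 * like the one below is ultimately satisfied from kmem_va_arena, which in
 * turn imports KVA from kmem_arena.  "struct foo" is a placeholder type.
 */
#if 0
	struct foo *p = kmem_alloc(sizeof(*p), KM_SLEEP);
	/* ... use p ... */
	kmem_free(p, sizeof(*p));
#endif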
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_km.c,v 1.165 2023/04/09 09:00:56 riastradh Exp $");
#include "opt_uvmhist.h"
#include "opt_kmempages.h"
#ifndef NKMEMPAGES
#define NKMEMPAGES 0
#endif
/*
* Defaults for lower and upper-bounds for the kmem_arena page count.
* Can be overridden by kernel config options.
*/
#ifndef NKMEMPAGES_MIN
#define NKMEMPAGES_MIN NKMEMPAGES_MIN_DEFAULT
#endif
#ifndef NKMEMPAGES_MAX
#define NKMEMPAGES_MAX NKMEMPAGES_MAX_DEFAULT
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/vmem.h>
#include <sys/vmem_impl.h>
#include <sys/kmem.h>
#include <sys/msan.h>
#include <uvm/uvm.h>
/*
* global data structures
*/
struct vm_map *kernel_map = NULL;
/*
* local data structures
*/
static struct vm_map kernel_map_store;
static struct vm_map_entry kernel_image_mapent_store;
static struct vm_map_entry kernel_kmem_mapent_store;
size_t nkmempages = 0;
vaddr_t kmembase;
vsize_t kmemsize;
static struct vmem kmem_arena_store;
vmem_t *kmem_arena = NULL;
static struct vmem kmem_va_arena_store;
vmem_t *kmem_va_arena;
/*
* kmeminit_nkmempages: calculate the size of kmem_arena.
*/
void
kmeminit_nkmempages(void)
{
size_t npages;
if (nkmempages != 0) {
/*
* It's already been set (by us being here before);
* bail out now.
*/
return;
}
#if defined(NKMEMPAGES_MAX_UNLIMITED) && !defined(KMSAN)
npages = physmem;
#else
#if defined(KMSAN)
npages = (physmem / 4);
#elif defined(PMAP_MAP_POOLPAGE)
npages = (physmem / 4);
#else
npages = (physmem / 3) * 2;
#endif /* defined(PMAP_MAP_POOLPAGE) */
#if !defined(NKMEMPAGES_MAX_UNLIMITED)
if (npages > NKMEMPAGES_MAX)
npages = NKMEMPAGES_MAX;
#endif
#endif
if (npages < NKMEMPAGES_MIN)
npages = NKMEMPAGES_MIN;
nkmempages = npages;
}
/*
* uvm_km_bootstrap: init kernel maps and objects to reflect reality (i.e.
* KVM already allocated for text, data, bss, and static data structures).
*
* => KVM is defined by VM_MIN_KERNEL_ADDRESS/VM_MAX_KERNEL_ADDRESS.
* we assume that [vmin -> start] has already been allocated and that
* "end" is the end.
*/
void
uvm_km_bootstrap(vaddr_t start, vaddr_t end)
{
bool kmem_arena_small;
vaddr_t base = VM_MIN_KERNEL_ADDRESS;
struct uvm_map_args args;
int error;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "start=%#jx end=%#jx", start, end, 0,0);
kmeminit_nkmempages();
kmemsize = (vsize_t)nkmempages * PAGE_SIZE;
kmem_arena_small = kmemsize < 64 * 1024 * 1024;
UVMHIST_LOG(maphist, "kmemsize=%#jx", kmemsize, 0,0,0);
/*
* next, init kernel memory objects.
*/
/* kernel_object: for pageable anonymous kernel memory */
uvm_kernel_object = uao_create(VM_MAX_KERNEL_ADDRESS -
VM_MIN_KERNEL_ADDRESS, UAO_FLAG_KERNOBJ);
/*
* init the map and reserve any kernel space that might already
* have been allocated before installing.
*/
uvm_map_setup(&kernel_map_store, base, end, VM_MAP_PAGEABLE);
kernel_map_store.pmap = pmap_kernel();
if (start != base) {
error = uvm_map_prepare(&kernel_map_store,
base, start - base,
NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM, UVM_FLAG_FIXED), &args);
if (!error) {
kernel_image_mapent_store.flags =
UVM_MAP_KERNEL | UVM_MAP_STATIC | UVM_MAP_NOMERGE;
error = uvm_map_enter(&kernel_map_store, &args,
&kernel_image_mapent_store);
}
if (error)
panic(
"uvm_km_bootstrap: could not reserve space for kernel");
kmembase = args.uma_start + args.uma_size;
} else {
kmembase = base;
}
error = uvm_map_prepare(&kernel_map_store,
kmembase, kmemsize,
NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM, UVM_FLAG_FIXED), &args);
if (!error) {
kernel_kmem_mapent_store.flags =
UVM_MAP_KERNEL | UVM_MAP_STATIC | UVM_MAP_NOMERGE;
error = uvm_map_enter(&kernel_map_store, &args,
&kernel_kmem_mapent_store);
}
if (error)
panic("uvm_km_bootstrap: could not reserve kernel kmem");
/*
* install!
*/
kernel_map = &kernel_map_store;
pool_subsystem_init();
kmem_arena = vmem_init(&kmem_arena_store, "kmem",
kmembase, kmemsize, PAGE_SIZE, NULL, NULL, NULL,
0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);
#ifdef PMAP_GROWKERNEL
/*
* kmem_arena VA allocations happen independently of uvm_map.
* grow kernel to accommodate the kmem_arena.
*/
if (uvm_maxkaddr < kmembase + kmemsize) {
uvm_maxkaddr = pmap_growkernel(kmembase + kmemsize);
KASSERTMSG(uvm_maxkaddr >= kmembase + kmemsize,
"%#"PRIxVADDR" %#"PRIxVADDR" %#"PRIxVSIZE,
uvm_maxkaddr, kmembase, kmemsize);
}
#endif
vmem_subsystem_init(kmem_arena);
UVMHIST_LOG(maphist, "kmem vmem created (base=%#jx, size=%#jx",
kmembase, kmemsize, 0,0);
kmem_va_arena = vmem_init(&kmem_va_arena_store, "kva",
0, 0, PAGE_SIZE, vmem_alloc, vmem_free, kmem_arena,
(kmem_arena_small ? 4 : VMEM_QCACHE_IDX_MAX) * PAGE_SIZE,
VM_NOSLEEP, IPL_VM);
UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
}
/*
* uvm_km_init: init the kernel maps virtual memory caches
* and start the pool/kmem allocator.
*/
void
uvm_km_init(void)
{
kmem_init();
}
/*
* uvm_km_suballoc: allocate a submap in the kernel map. once a submap
* is allocated all references to that area of VM must go through it. this
* allows the locking of VAs in kernel_map to be broken up into regions.
*
* => if `fixed' is true, *vmin specifies where the region described
* by the submap must start
* => if submap is non NULL we use that as the submap, otherwise we
* alloc a new map
*/
struct vm_map *
uvm_km_suballoc(struct vm_map *map, vaddr_t *vmin /* IN/OUT */,
vaddr_t *vmax /* OUT */, vsize_t size, int flags, bool fixed,
struct vm_map *submap)
{
int mapflags = UVM_FLAG_NOMERGE | (fixed ? UVM_FLAG_FIXED : 0);
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(vm_map_pmap(map) == pmap_kernel());
size = round_page(size); /* round up to pagesize */
/*
* first allocate a blank spot in the parent map
*/
if (uvm_map(map, vmin, size, NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM, mapflags)) != 0) {
panic("%s: unable to allocate space in parent map", __func__);
}
/*
* set VM bounds (vmin is filled in by uvm_map)
*/
*vmax = *vmin + size;
/*
* add references to pmap and create or init the submap
*/
pmap_reference(vm_map_pmap(map));
if (submap == NULL) {
submap = kmem_alloc(sizeof(*submap), KM_SLEEP);
}
uvm_map_setup(submap, *vmin, *vmax, flags);
submap->pmap = vm_map_pmap(map);
/*
* now let uvm_map_submap plug it in...
*/
if (uvm_map_submap(map, *vmin, *vmax, submap) != 0)
panic("uvm_km_suballoc: submap allocation failed");
return(submap);
}
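/*
 * Illustrative sketch only (not part of the original file): a typical
 * boot-time caller creating a submap, in the style of pager_map.  The
 * variable names and PAGER_MAP_SIZE are assumptions for the example.
 */
#if 0
	vaddr_t sva, eva;
	static struct vm_map pager_map_store;

	pager_map = uvm_km_suballoc(kernel_map, &sva, &eva, PAGER_MAP_SIZE,
	    0, false, &pager_map_store);
#endif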
/*
* uvm_km_pgremove: remove pages from a kernel uvm_object and KVA.
*/
void
uvm_km_pgremove(vaddr_t startva, vaddr_t endva)
{
struct uvm_object * const uobj = uvm_kernel_object;
const voff_t start = startva - vm_map_min(kernel_map);
const voff_t end = endva - vm_map_min(kernel_map);
struct vm_page *pg;
voff_t curoff, nextoff;
int swpgonlydelta = 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(VM_MIN_KERNEL_ADDRESS <= startva);
KASSERT(startva < endva);
KASSERT(endva <= VM_MAX_KERNEL_ADDRESS);
rw_enter(uobj->vmobjlock, RW_WRITER);
pmap_remove(pmap_kernel(), startva, endva);
for (curoff = start; curoff < end; curoff = nextoff) {
nextoff = curoff + PAGE_SIZE;
pg = uvm_pagelookup(uobj, curoff);
if (pg != NULL && pg->flags & PG_BUSY) {
uvm_pagewait(pg, uobj->vmobjlock, "km_pgrm");
rw_enter(uobj->vmobjlock, RW_WRITER);
nextoff = curoff;
continue;
}
/*
* free the swap slot, then the page.
*/
if (pg == NULL &&
uao_find_swslot(uobj, curoff >> PAGE_SHIFT) > 0) {
swpgonlydelta++;
}
uao_dropswap(uobj, curoff >> PAGE_SHIFT);
if (pg != NULL) {
uvm_pagefree(pg);
}
}
rw_exit(uobj->vmobjlock);
if (swpgonlydelta > 0) {
KASSERT(uvmexp.swpgonly >= swpgonlydelta);
atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
}
}
/*
* uvm_km_pgremove_intrsafe: like uvm_km_pgremove(), but for non object backed
* regions.
*
* => when you unmap a part of anonymous kernel memory you want to toss
* the pages right away. (this is called from uvm_unmap_...).
* => none of the pages will ever be busy, and none of them will ever
* be on the active or inactive queues (because they have no object).
*/
void
uvm_km_pgremove_intrsafe(struct vm_map *map, vaddr_t start, vaddr_t end)
{
#define __PGRM_BATCH 16
struct vm_page *pg;
paddr_t pa[__PGRM_BATCH];
int npgrm, i;
vaddr_t va, batch_vastart;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(VM_MAP_IS_KERNEL(map));
KASSERTMSG(vm_map_min(map) <= start,
"vm_map_min(map) [%#"PRIxVADDR"] <= start [%#"PRIxVADDR"]"
" (size=%#"PRIxVSIZE")",
vm_map_min(map), start, end - start);
KASSERT(start < end);
KASSERT(end <= vm_map_max(map));
for (va = start; va < end;) {
batch_vastart = va;
/* create a batch of at most __PGRM_BATCH pages to free */
for (i = 0;
i < __PGRM_BATCH && va < end;
va += PAGE_SIZE) {
if (!pmap_extract(pmap_kernel(), va, &pa[i])) {
continue;
}
i++;
}
npgrm = i;
/* now remove the mappings */
pmap_kremove(batch_vastart, va - batch_vastart);
/* and free the pages */
for (i = 0; i < npgrm; i++) {
pg = PHYS_TO_VM_PAGE(pa[i]);
KASSERT(pg);
KASSERT(pg->uobject == NULL);
KASSERT(pg->uanon == NULL);
KASSERT((pg->flags & PG_BUSY) == 0);
uvm_pagefree(pg);
}
}
#undef __PGRM_BATCH
}
#if defined(DEBUG)
void
uvm_km_check_empty(struct vm_map *map, vaddr_t start, vaddr_t end)
{
vaddr_t va;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KDASSERT(VM_MAP_IS_KERNEL(map));
KDASSERT(vm_map_min(map) <= start);
KDASSERT(start < end);
KDASSERT(end <= vm_map_max(map));
for (va = start; va < end; va += PAGE_SIZE) {
paddr_t pa;
if (pmap_extract(pmap_kernel(), va, &pa)) {
panic("uvm_km_check_empty: va %p has pa %#llx",
(void *)va, (long long)pa);
}
/*
* kernel_object should not have pages for the corresponding
* region. check it.
*
* why trylock? because:
* - caller might not want to block.
* - we can recurse when allocating radix_node for
* kernel_object.
*/
if (rw_tryenter(uvm_kernel_object->vmobjlock, RW_READER)) {
struct vm_page *pg;
pg = uvm_pagelookup(uvm_kernel_object,
va - vm_map_min(kernel_map));
rw_exit(uvm_kernel_object->vmobjlock);
if (pg) {
panic("uvm_km_check_empty: "
"has page hashed at %p",
(const void *)va);
}
}
}
}
#endif /* defined(DEBUG) */
/*
* uvm_km_alloc: allocate an area of kernel memory.
*
* => NOTE: we can return 0 even if we can wait if there is not enough
* free VM space in the map... caller should be prepared to handle
* this case.
* => we return KVA of memory allocated
*/
vaddr_t
uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags)
{
vaddr_t kva, loopva;
vaddr_t offset;
vsize_t loopsize;
struct vm_page *pg;
struct uvm_object *obj;
int pgaflags;
vm_prot_t prot, vaprot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(vm_map_pmap(map) == pmap_kernel());
KASSERT((flags & UVM_KMF_TYPEMASK) == UVM_KMF_WIRED ||
(flags & UVM_KMF_TYPEMASK) == UVM_KMF_PAGEABLE ||
(flags & UVM_KMF_TYPEMASK) == UVM_KMF_VAONLY);
KASSERT((flags & UVM_KMF_VAONLY) != 0 || (flags & UVM_KMF_COLORMATCH) == 0);
KASSERT((flags & UVM_KMF_COLORMATCH) == 0 || (flags & UVM_KMF_VAONLY) != 0);
/*
* setup for call
*/
kva = vm_map_min(map); /* hint */
size = round_page(size);
obj = (flags & UVM_KMF_PAGEABLE) ? uvm_kernel_object : NULL;
UVMHIST_LOG(maphist," (map=%#jx, obj=%#jx, size=%#jx, flags=%#jx)",
(uintptr_t)map, (uintptr_t)obj, size, flags);
/*
* allocate some virtual space
*/
vaprot = (flags & UVM_KMF_EXEC) ? UVM_PROT_ALL : UVM_PROT_RW;
if (__predict_false(uvm_map(map, &kva, size, obj, UVM_UNKNOWN_OFFSET,
align, UVM_MAPFLAG(vaprot, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM,
(flags & (UVM_KMF_TRYLOCK | UVM_KMF_NOWAIT | UVM_KMF_WAITVA
| UVM_KMF_COLORMATCH)))) != 0)) {
UVMHIST_LOG(maphist, "<- done (no VM)",0,0,0,0);
return(0);
}
/*
* if all we wanted was VA, return now
*/
if (flags & (UVM_KMF_VAONLY | UVM_KMF_PAGEABLE)) {
UVMHIST_LOG(maphist,"<- done valloc (kva=%#jx)", kva,0,0,0);
return(kva);
}
/*
* recover object offset from virtual address
*/
offset = kva - vm_map_min(kernel_map);
UVMHIST_LOG(maphist, " kva=%#jx, offset=%#jx", kva, offset,0,0);
/*
* now allocate and map in the memory... note that we are the only ones
* who should ever get a handle on this area of VM.
*/
loopva = kva;
loopsize = size;
pgaflags = UVM_FLAG_COLORMATCH;
if (flags & UVM_KMF_NOWAIT)
pgaflags |= UVM_PGA_USERESERVE;
if (flags & UVM_KMF_ZERO)
pgaflags |= UVM_PGA_ZERO;
prot = VM_PROT_READ | VM_PROT_WRITE;
if (flags & UVM_KMF_EXEC)
prot |= VM_PROT_EXECUTE;
while (loopsize) {
KASSERTMSG(!pmap_extract(pmap_kernel(), loopva, NULL),
"loopva=%#"PRIxVADDR, loopva);
pg = uvm_pagealloc_strat(NULL, offset, NULL, pgaflags,
#ifdef UVM_KM_VMFREELIST
UVM_PGA_STRAT_ONLY, UVM_KM_VMFREELIST
#else
UVM_PGA_STRAT_NORMAL, 0
#endif
);
/*
* out of memory?
*/
if (__predict_false(pg == NULL)) {
if ((flags & UVM_KMF_NOWAIT) || ((flags & UVM_KMF_CANFAIL) && !uvm_reclaimable())) {
/* free everything! */
uvm_km_free(map, kva, size,
flags & UVM_KMF_TYPEMASK);
return (0);
} else {
uvm_wait("km_getwait2"); /* sleep here */
continue;
}
}
pg->flags &= ~PG_BUSY; /* new page */
UVM_PAGE_OWN(pg, NULL);
/*
* map it in
*/
pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg),
prot, PMAP_KMPAGE);
loopva += PAGE_SIZE;
offset += PAGE_SIZE;
loopsize -= PAGE_SIZE;
}
pmap_update(pmap_kernel());
if ((flags & UVM_KMF_ZERO) == 0) {
kmsan_orig((void *)kva, size, KMSAN_TYPE_UVM, __RET_ADDR);
kmsan_mark((void *)kva, size, KMSAN_STATE_UNINIT);
}
UVMHIST_LOG(maphist,"<- done (kva=%#jx)", kva,0,0,0);
return(kva);
}
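/*
 * Example (illustrative sketch only, not part of this file): allocating
 * and releasing a wired, zeroed kernel buffer with uvm_km_alloc() and
 * uvm_km_free().  "len" is a hypothetical byte count.  As noted above,
 * uvm_km_alloc() can return 0 even when the caller may sleep, so the
 * failure path must be handled.
 */
#if 0	/* example only */
vaddr_t kva;

kva = uvm_km_alloc(kernel_map, round_page(len), 0,
    UVM_KMF_WIRED | UVM_KMF_ZERO | UVM_KMF_CANFAIL);
if (kva == 0)
	return ENOMEM;	/* no KVA or no physical pages available */
/* ... use the buffer at kva ... */
uvm_km_free(kernel_map, kva, round_page(len), UVM_KMF_WIRED);
#endif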
/*
* uvm_km_protect: change the protection of an allocated area
*/
int
uvm_km_protect(struct vm_map *map, vaddr_t addr, vsize_t size, vm_prot_t prot)
{
return uvm_map_protect(map, addr, addr + round_page(size), prot, false);
}
/*
* uvm_km_free: free an area of kernel memory
*/
void
uvm_km_free(struct vm_map *map, vaddr_t addr, vsize_t size, uvm_flag_t flags)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT((flags & UVM_KMF_TYPEMASK) == UVM_KMF_WIRED ||
(flags & UVM_KMF_TYPEMASK) == UVM_KMF_PAGEABLE ||
(flags & UVM_KMF_TYPEMASK) == UVM_KMF_VAONLY);
KASSERT((addr & PAGE_MASK) == 0);
KASSERT(vm_map_pmap(map) == pmap_kernel());
size = round_page(size);
if (flags & UVM_KMF_PAGEABLE) {
uvm_km_pgremove(addr, addr + size);
} else if (flags & UVM_KMF_WIRED) {
/*
* Note: uvm_km_pgremove_intrsafe() extracts mapping, thus
* remove it after. See comment below about KVA visibility.
*/
uvm_km_pgremove_intrsafe(map, addr, addr + size);
}
/*
* Note: uvm_unmap_remove() calls pmap_update() for us, before
* KVA becomes globally available.
*/
uvm_unmap1(map, addr, addr + size, UVM_FLAG_VAONLY);
}
/* Sanity; must specify both or none. */
#if (defined(PMAP_MAP_POOLPAGE) || defined(PMAP_UNMAP_POOLPAGE)) && \
(!defined(PMAP_MAP_POOLPAGE) || !defined(PMAP_UNMAP_POOLPAGE))
#error Must specify MAP and UNMAP together.
#endif
#if defined(PMAP_ALLOC_POOLPAGE) && \
!defined(PMAP_MAP_POOLPAGE) && !defined(PMAP_UNMAP_POOLPAGE)
#error Must specify ALLOC with MAP and UNMAP
#endif
int
uvm_km_kmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags,
vmem_addr_t *addr)
{
struct vm_page *pg;
vmem_addr_t va;
int rc;
vaddr_t loopva;
vsize_t loopsize;
size = round_page(size);
#if defined(PMAP_MAP_POOLPAGE)
if (size == PAGE_SIZE) {
again:
#ifdef PMAP_ALLOC_POOLPAGE
pg = PMAP_ALLOC_POOLPAGE((flags & VM_SLEEP) ?
0 : UVM_PGA_USERESERVE);
#else
pg = uvm_pagealloc(NULL, 0, NULL,
(flags & VM_SLEEP) ? 0 : UVM_PGA_USERESERVE);
#endif /* PMAP_ALLOC_POOLPAGE */
if (__predict_false(pg == NULL)) {
if (flags & VM_SLEEP) {
uvm_wait("plpg");
goto again;
}
return ENOMEM;
}
va = PMAP_MAP_POOLPAGE(VM_PAGE_TO_PHYS(pg));
KASSERT(va != 0);
*addr = va;
return 0;
}
#endif /* PMAP_MAP_POOLPAGE */
rc = vmem_alloc(vm, size, flags, &va);
if (rc != 0)
return rc;
#ifdef PMAP_GROWKERNEL
/*
* These VA allocations happen independently of uvm_map
* so this allocation must not extend beyond the current limit.
*/
KASSERTMSG(uvm_maxkaddr >= va + size,
"%#"PRIxVADDR" %#"PRIxPTR" %#zx",
uvm_maxkaddr, va, size);
#endif
loopva = va;
loopsize = size;
while (loopsize) {
paddr_t pa __diagused;
KASSERTMSG(!pmap_extract(pmap_kernel(), loopva, &pa),
"loopva=%#"PRIxVADDR" loopsize=%#"PRIxVSIZE
" pa=%#"PRIxPADDR" vmem=%p",
loopva, loopsize, pa, vm);
pg = uvm_pagealloc(NULL, loopva, NULL,
UVM_FLAG_COLORMATCH
| ((flags & VM_SLEEP) ? 0 : UVM_PGA_USERESERVE));
if (__predict_false(pg == NULL)) {
if (flags & VM_SLEEP) {
uvm_wait("plpg");
continue;
} else {
uvm_km_pgremove_intrsafe(kernel_map, va,
va + size);
vmem_free(vm, va, size);
return ENOMEM;
}
}
pg->flags &= ~PG_BUSY; /* new page */
UVM_PAGE_OWN(pg, NULL);
pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg),
VM_PROT_READ|VM_PROT_WRITE, PMAP_KMPAGE);
loopva += PAGE_SIZE;
loopsize -= PAGE_SIZE;
}
pmap_update(pmap_kernel());
*addr = va;
return 0;
}
void
uvm_km_kmem_free(vmem_t *vm, vmem_addr_t addr, size_t size)
{
size = round_page(size);
#if defined(PMAP_UNMAP_POOLPAGE)
if (size == PAGE_SIZE) {
paddr_t pa;
pa = PMAP_UNMAP_POOLPAGE(addr);
uvm_pagefree(PHYS_TO_VM_PAGE(pa));
return;
}
#endif /* PMAP_UNMAP_POOLPAGE */
uvm_km_pgremove_intrsafe(kernel_map, addr, addr + size);
pmap_update(pmap_kernel());
vmem_free(vm, addr, size);
}
bool
uvm_km_va_starved_p(void)
{
vmem_size_t total;
vmem_size_t free;
if (kmem_arena == NULL)
return false;
total = vmem_size(kmem_arena, VMEM_ALLOC|VMEM_FREE);
free = vmem_size(kmem_arena, VMEM_FREE);
return (free < (total / 10));
}
/* $NetBSD: layer_vfsops.c,v 1.56 2022/12/09 10:33:18 hannken Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp
* from: @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92
* @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95
*/
/*
* Generic layer VFS operations.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: layer_vfsops.c,v 1.56 2022/12/09 10:33:18 hannken Exp $");
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/layer.h>
#include <miscfs/genfs/layer_extern.h>
SYSCTL_SETUP_PROTO(sysctl_vfs_layerfs_setup);
MODULE(MODULE_CLASS_MISC, layerfs, NULL);
static int
layerfs_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return 0;
case MODULE_CMD_FINI:
return 0;
default:
return ENOTTY;
}
return 0;
}
/*
* VFS start. Nothing needed here - the start routine on the underlying
* filesystem will have been called when that filesystem was mounted.
*/
int
layerfs_start(struct mount *mp, int flags)
{
#ifdef notyet
return VFS_START(mp->mnt_lower, flags);
#else
return 0;
#endif
}
int
layerfs_root(struct mount *mp, int lktype, struct vnode **vpp)
{
struct vnode *vp;
vp = MOUNTTOLAYERMOUNT(mp)->layerm_rootvp;
if (vp == NULL) {
*vpp = NULL;
return EINVAL;
}
/*
* Return the root vnode locked and with a reference held.
*/
vref(vp);
vn_lock(vp, lktype | LK_RETRY);
*vpp = vp;
return 0;
}
int
layerfs_quotactl(struct mount *mp, struct quotactl_args *args)
{
int error;
error = vfs_busy(mp);
if (error == 0) {
error = VFS_QUOTACTL(mp->mnt_lower, args);
vfs_unbusy(mp);
}
return error;
}
int
layerfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
struct statvfs *sbuf;
int error;
sbuf = kmem_zalloc(sizeof(*sbuf), KM_SLEEP);
error = vfs_busy(mp);
if (error == 0) {
error = VFS_STATVFS(mp->mnt_lower, sbuf);
vfs_unbusy(mp);
}
if (error) {
goto done;
}
/* Copy across the relevant data and fake the rest. */
sbp->f_flag = sbuf->f_flag;
sbp->f_bsize = sbuf->f_bsize;
sbp->f_frsize = sbuf->f_frsize;
sbp->f_iosize = sbuf->f_iosize;
sbp->f_blocks = sbuf->f_blocks;
sbp->f_bfree = sbuf->f_bfree;
sbp->f_bavail = sbuf->f_bavail;
sbp->f_bresvd = sbuf->f_bresvd;
sbp->f_files = sbuf->f_files;
sbp->f_ffree = sbuf->f_ffree;
sbp->f_favail = sbuf->f_favail;
sbp->f_fresvd = sbuf->f_fresvd;
sbp->f_namemax = sbuf->f_namemax;
copy_statvfs_info(sbp, mp);
done:
kmem_free(sbuf, sizeof(*sbuf));
return error;
}
int
layerfs_sync(struct mount *mp, int waitfor,
kauth_cred_t cred)
{
/*
* XXX - Assumes no data cached at layer.
*/
return 0;
}
int
layerfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
struct layer_mount *lmp = MOUNTTOLAYERMOUNT(mp);
struct vnode *lowervp;
struct layer_node *xp;
KASSERT(key_len == sizeof(struct vnode *));
memcpy(&lowervp, key, key_len);
xp = kmem_alloc(lmp->layerm_size, KM_SLEEP);
/* Share the interlock, vmobjlock, and klist with the lower node. */
vshareilock(vp, lowervp);
rw_obj_hold(lowervp->v_uobj.vmobjlock);
uvm_obj_setlock(&vp->v_uobj, lowervp->v_uobj.vmobjlock);
vshareklist(vp, lowervp);
vp->v_tag = lmp->layerm_tag;
vp->v_type = lowervp->v_type;
vp->v_op = lmp->layerm_vnodeop_p;
if (vp->v_type == VBLK || vp->v_type == VCHR)
spec_node_init(vp, lowervp->v_rdev);
vp->v_data = xp;
xp->layer_vnode = vp;
xp->layer_lowervp = lowervp;
xp->layer_flags = 0;
uvm_vnp_setsize(vp, 0);
/* Add a reference to the lower node. */
vref(lowervp);
*new_key = &xp->layer_lowervp;
return 0;
}
int
layerfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp)
{
struct vnode *vp;
int error;
error = vfs_busy(mp);
if (error == 0) {
error = VFS_VGET(mp->mnt_lower, ino, lktype, &vp);
vfs_unbusy(mp);
}
if (error) {
*vpp = NULL;
return error;
}
VOP_UNLOCK(vp);
error = layer_node_create(mp, vp, vpp);
if (error) {
vrele(vp);
*vpp = NULL;
return error;
}
error = vn_lock(*vpp, lktype);
if (error) {
vrele(*vpp);
*vpp = NULL;
return error;
}
return 0;
}
int
layerfs_fhtovp(struct mount *mp, struct fid *fidp, int lktype,
struct vnode **vpp)
{
struct vnode *vp;
int error;
error = vfs_busy(mp);
if (error == 0) {
error = VFS_FHTOVP(mp->mnt_lower, fidp, lktype, &vp);
vfs_unbusy(mp);
}
if (error) {
*vpp = NULL;
return error;
}
VOP_UNLOCK(vp);
error = layer_node_create(mp, vp, vpp);
if (error) {
vput(vp);
*vpp = NULL;
return (error);
}
error = vn_lock(*vpp, lktype);
if (error) {
vrele(*vpp);
*vpp = NULL;
return error;
}
return 0;
}
int
layerfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
{
return VFS_VPTOFH(LAYERVPTOLOWERVP(vp), fhp, fh_size);
}
/*
* layerfs_snapshot - handle a snapshot through a layered file system
*
* At present, we do NOT support snapshotting through a layered file
* system as the ffs implementation changes v_vnlock of the snapshot
* vnodes to point to one common lock. As there is no way for us to
* absolutely pass this change up the stack, a layered file system
* would end up referencing the wrong lock.
*
* This routine serves as a central resource for this behavior; individual
* layered file systems don't need to worry about the above. Also, if
* things get fixed, all layers get the benefit.
*/
int
layerfs_snapshot(struct mount *mp, struct vnode *vp,
struct timespec *ts)
{
return EOPNOTSUPP;
}
/*
* layerfs_suspendctl - suspend a layered file system
*
* Here we should suspend the lower file system(s) too. At present
* this will deadlock as we don't know which to suspend first.
*
* This routine serves as a central resource for this behavior; individual
* layered file systems don't need to worry about the above. Also, if
* things get fixed, all layers get the benefit.
*/
int
layerfs_suspendctl(struct mount *mp, int cmd)
{
return genfs_suspendctl(mp, cmd);
}
SYSCTL_SETUP(sysctl_vfs_layerfs_setup, "sysctl vfs.layerfs subtree setup")
{
const struct sysctlnode *layerfs_node = NULL;
sysctl_createv(clog, 0, NULL, &layerfs_node,
#ifdef _MODULE
0,
#else
CTLFLAG_PERMANENT,
#endif
CTLTYPE_NODE, "layerfs",
SYSCTL_DESCR("Generic layered file system"),
NULL, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
#ifdef LAYERFS_DIAGNOSTIC
sysctl_createv(clog, 0, &layerfs_node, NULL,
#ifndef _MODULE
CTLFLAG_PERMANENT |
#endif
CTLFLAG_READWRITE,
CTLTYPE_INT,
"debug",
SYSCTL_DESCR("Verbose debugging messages"),
NULL, 0, &layerfs_debug, 0,
CTL_CREATE, CTL_EOL);
#endif
/*
* other subtrees should really be aliases to this, but since
* they can't tell if layerfs has been instantiated yet, they
* can't do that...not easily. not yet. :-)
*/
}
int
layerfs_renamelock_enter(struct mount *mp)
{
return VFS_RENAMELOCK_ENTER(mp->mnt_lower);
}
void
layerfs_renamelock_exit(struct mount *mp)
{
VFS_RENAMELOCK_EXIT(mp->mnt_lower);
}
/* $NetBSD: uipc_socket2.c,v 1.143 2024/01/03 18:10:42 andvar Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.143 2024/01/03 18:10:42 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_mbuftrace.h"
#include "opt_sb_max.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/buf.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/poll.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/kauth.h>
#include <sys/pool.h>
#include <sys/uidinfo.h>
#ifdef DDB
#include <sys/filedesc.h>
#include <ddb/db_active.h>
#endif
/*
* Primitive routines for operating on sockets and socket buffers.
*
* Connection life-cycle:
*
* Normal sequence from the active (originating) side:
*
* - soisconnecting() is called during processing of connect() call,
* - resulting in an eventual call to soisconnected() if/when the
* connection is established.
*
* When the connection is torn down during processing of disconnect():
*
* - soisdisconnecting() is called and,
* - soisdisconnected() is called when the connection to the peer
* is totally severed.
*
* The semantics of these routines are such that connectionless protocols
* can call soisconnected() and soisdisconnected() only, bypassing the
* in-progress calls when setting up a ``connection'' takes no time.
*
* From the passive side, a socket is created with two queues of sockets:
*
* - so_q0 (0) for partial connections (i.e. connections in progress)
* - so_q (1) for connections already made and awaiting user acceptance.
*
* As a protocol is preparing incoming connections, it creates a socket
* structure queued on so_q0 by calling sonewconn(). When the connection
* is established, soisconnected() is called, and transfers the
* socket structure to so_q, making it available to accept().
*
* If a socket is closed with sockets on either so_q0 or so_q, these
* sockets are dropped.
*
* Locking rules and assumptions:
*
* o socket::so_lock can change on the fly. The low level routines used
* to lock sockets are aware of this. When so_lock is acquired, the
* routine locking must check to see if so_lock still points to the
* lock that was acquired. If so_lock has changed in the meantime, the
* now irrelevant lock that was acquired must be dropped and the lock
* operation retried. Although not proven here, this is completely safe
* on a multiprocessor system, even with relaxed memory ordering, given
* the next two rules:
*
* o In order to mutate so_lock, the lock pointed to by the current value
* of so_lock must be held: i.e., the socket must be held locked by the
* changing thread. The thread must issue membar_release() to prevent
* memory accesses being reordered, and can set so_lock to the desired
* value. If the lock pointed to by the new value of so_lock is not
* held by the changing thread, the socket must then be considered
* unlocked.
*
* o If so_lock is mutated, and the previous lock referred to by so_lock
* could still be visible to other threads in the system (e.g. via file
* descriptor or protocol-internal reference), then the old lock must
* remain valid until the socket and/or protocol control block has been
* torn down.
*
* o If a socket has a non-NULL so_head value (i.e. is in the process of
* connecting), then locking the socket must also lock the socket pointed
* to by so_head: their lock pointers must match.
*
* o If a socket has connections in progress (so_q, so_q0 not empty) then
* locking the socket must also lock the sockets attached to both queues.
* Again, their lock pointers must match.
*
* o Beyond the initial lock assignment in socreate(), assigning locks to
* sockets is the responsibility of the individual protocols / protocol
* domains.
*/
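/*
 * Example (illustrative sketch only): how a protocol might switch a
 * socket to a new lock while observing the rules above.  "newlock" is a
 * hypothetical kmutex_t the protocol manages; the socket must be locked
 * via the old lock when so_lock is changed, and the old lock must stay
 * valid for as long as other threads may still reference it.
 */
#if 0	/* example only */
	KASSERT(solocked(so));		/* old lock held */
	mutex_obj_hold(newlock);	/* new lock must outlive the socket */
	membar_release();		/* order prior stores before publishing */
	solockreset(so, newlock);	/* publish new lock, wake waiters */
#endif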
static pool_cache_t socket_cache;
u_long sb_max = SB_MAX;/* maximum socket buffer size */
static u_long sb_max_adj; /* adjusted sb_max */
void
soisconnecting(struct socket *so)
{
KASSERT(solocked(so));
so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= SS_ISCONNECTING;
}
void
soisconnected(struct socket *so)
{
struct socket *head;
head = so->so_head;
KASSERT(solocked(so));
KASSERT(head == NULL || solocked2(so, head));
so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING);
so->so_state |= SS_ISCONNECTED;
if (head && so->so_onq == &head->so_q0) {
if ((so->so_options & SO_ACCEPTFILTER) == 0) {
/*
* Re-enqueue and wake up any waiters, e.g.
* processes blocking on accept().
*/
soqremque(so, 0);
soqinsque(head, so, 1);
sorwakeup(head);
cv_broadcast(&head->so_cv);
} else {
so->so_upcall =
head->so_accf->so_accept_filter->accf_callback;
so->so_upcallarg = head->so_accf->so_accept_filter_arg;
so->so_rcv.sb_flags |= SB_UPCALL;
so->so_options &= ~SO_ACCEPTFILTER;
(*so->so_upcall)(so, so->so_upcallarg,
POLLIN|POLLRDNORM, M_DONTWAIT);
}
} else {
cv_broadcast(&so->so_cv);
sorwakeup(so);
sowwakeup(so);
}
}
void
soisdisconnecting(struct socket *so)
{
KASSERT(solocked(so));
so->so_state &= ~SS_ISCONNECTING;
so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
cv_broadcast(&so->so_cv);
sowwakeup(so);
sorwakeup(so);
}
void
soisdisconnected(struct socket *so)
{
KASSERT(solocked(so));
so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
cv_broadcast(&so->so_cv);
sowwakeup(so);
sorwakeup(so);
}
void
soinit2(void)
{
socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
"socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
}
/*
* sonewconn: accept a new connection.
*
* When an attempt at a new connection is noted on a socket which accepts
* connections, sonewconn(9) is called. If the connection is possible
* (subject to space constraints, etc) then we allocate a new structure,
* properly linked into the data structure of the original socket.
*
* => If 'soready' is true, then socket will become ready for accept() i.e.
* inserted into the so_q queue, SS_ISCONNECTED set and waiters awoken.
* => May be called from soft-interrupt context.
* => Listening socket should be locked.
* => Returns the new socket locked.
*/
struct socket *
sonewconn(struct socket *head, bool soready)
{
struct socket *so;
int soqueue, error;
KASSERT(solocked(head));
if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) {
/*
* Listen queue overflow. If there is an accept filter
* active, pass through the oldest connection it's handling.
*/
if (head->so_accf == NULL) {
return NULL;
} else {
struct socket *so2, *next;
/* Pass the oldest connection waiting in the
accept filter */
for (so2 = TAILQ_FIRST(&head->so_q0);
so2 != NULL; so2 = next) {
next = TAILQ_NEXT(so2, so_qe);
if (so2->so_upcall == NULL) {
continue;
}
so2->so_upcall = NULL;
so2->so_upcallarg = NULL;
so2->so_options &= ~SO_ACCEPTFILTER;
so2->so_rcv.sb_flags &= ~SB_UPCALL;
soisconnected(so2);
break;
}
/* If nothing was nudged out of the accept filter, bail
* out; otherwise proceed allocating the socket. */
if (so2 == NULL) {
return NULL;
}
}
}
if ((head->so_options & SO_ACCEPTFILTER) != 0) {
soready = false;
}
soqueue = soready ? 1 : 0;
if ((so = soget(false)) == NULL) {
return NULL;
}
so->so_type = head->so_type;
so->so_options = head->so_options & ~SO_ACCEPTCONN;
so->so_linger = head->so_linger;
so->so_state = head->so_state | SS_NOFDREF;
so->so_proto = head->so_proto;
so->so_timeo = head->so_timeo;
so->so_pgid = head->so_pgid;
so->so_send = head->so_send;
so->so_receive = head->so_receive;
so->so_uidinfo = head->so_uidinfo;
so->so_egid = head->so_egid;
so->so_cpid = head->so_cpid;
/*
* Share the lock with the listening-socket, it may get unshared
* once the connection is complete.
*
* so_lock is stable while we hold the socket locked, so no
* need for atomic_load_* here.
*/
mutex_obj_hold(head->so_lock);
so->so_lock = head->so_lock;
/*
* Reserve the space for socket buffers.
*/
#ifdef MBUFTRACE
so->so_mowner = head->so_mowner;
so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
so->so_snd.sb_mowner = head->so_snd.sb_mowner;
#endif
if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
goto out;
}
so->so_snd.sb_lowat = head->so_snd.sb_lowat;
so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
so->so_snd.sb_timeo = head->so_snd.sb_timeo;
so->so_rcv.sb_flags |= head->so_rcv.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
so->so_snd.sb_flags |= head->so_snd.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
/*
* Finally, perform the protocol attach. Note: a new socket
* lock may be assigned at this point (if so, it will be held).
*/
error = (*so->so_proto->pr_usrreqs->pr_attach)(so, 0);
if (error) {
out:
KASSERT(solocked(so));
KASSERT(so->so_accf == NULL);
soput(so);
/* Note: the listening socket shall stay locked. */
KASSERT(solocked(head));
return NULL;
}
KASSERT(solocked2(head, so));
/*
* Insert into the queue. If ready, update the connection status
* and wake up any waiters, e.g. processes blocking on accept().
*/
soqinsque(head, so, soqueue);
if (soready) {
so->so_state |= SS_ISCONNECTED;
sorwakeup(head);
cv_broadcast(&head->so_cv);
}
return so;
}
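/*
 * Example (illustrative sketch only): a connection-oriented protocol
 * reacting to an incoming connection request on a listening socket
 * "head" (held locked).  With soready=false the new socket is queued on
 * so_q0; once the handshake completes, soisconnected() moves it to so_q
 * where accept() can pick it up.
 */
#if 0	/* example only */
	struct socket *so;

	KASSERT(solocked(head));
	so = sonewconn(head, false);
	if (so == NULL)
		return;		/* queue overflow or out of memory: drop */
	/* ... attach protocol state to so, continue the handshake ... */
	/* later, when the connection is fully established: */
	soisconnected(so);
#endif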
struct socket *
soget(bool waitok)
{
struct socket *so;
so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
if (__predict_false(so == NULL))
return (NULL);
memset(so, 0, sizeof(*so));
TAILQ_INIT(&so->so_q0);
TAILQ_INIT(&so->so_q);
cv_init(&so->so_cv, "socket");
cv_init(&so->so_rcv.sb_cv, "netio");
cv_init(&so->so_snd.sb_cv, "netio");
selinit(&so->so_rcv.sb_sel);
selinit(&so->so_snd.sb_sel);
so->so_rcv.sb_so = so;
so->so_snd.sb_so = so;
return so;
}
void
soput(struct socket *so)
{
KASSERT(!cv_has_waiters(&so->so_cv));
KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
seldestroy(&so->so_rcv.sb_sel);
seldestroy(&so->so_snd.sb_sel);
mutex_obj_free(so->so_lock);
cv_destroy(&so->so_cv);
cv_destroy(&so->so_rcv.sb_cv);
cv_destroy(&so->so_snd.sb_cv);
pool_cache_put(socket_cache, so);
}
/*
* soqinsque: insert socket of a new connection into the specified
* accept queue of the listening socket (head).
*
* q = 0: queue of partial connections
* q = 1: queue of incoming connections
*/
void
soqinsque(struct socket *head, struct socket *so, int q)
{
KASSERT(q == 0 || q == 1);
KASSERT(solocked2(head, so));
KASSERT(so->so_onq == NULL);
KASSERT(so->so_head == NULL);
so->so_head = head;
if (q == 0) {
head->so_q0len++;
so->so_onq = &head->so_q0;
} else {
head->so_qlen++;
so->so_onq = &head->so_q;
}
TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}
/*
* soqremque: remove socket from the specified queue.
*
* => Returns true if socket was removed from the specified queue.
* => False if socket was not removed (because it was in other queue).
*/
bool
soqremque(struct socket *so, int q)
{
struct socket *head = so->so_head;
KASSERT(q == 0 || q == 1);
KASSERT(solocked(so));
KASSERT(so->so_onq != NULL);
KASSERT(head != NULL);
if (q == 0) {
if (so->so_onq != &head->so_q0)
return false;
head->so_q0len--;
} else {
if (so->so_onq != &head->so_q)
return false;
head->so_qlen--;
}
KASSERT(solocked2(so, head));
TAILQ_REMOVE(so->so_onq, so, so_qe);
so->so_onq = NULL;
so->so_head = NULL;
return true;
}
/*
* socantsendmore: indicates that no more data will be sent on the
* socket; it is normally applied to a socket by the protocol code
* (e.g. in pr_shutdown()) when the user informs the system that no
* more data is to be sent.
*/
void
socantsendmore(struct socket *so)
{
KASSERT(solocked(so));
so->so_state |= SS_CANTSENDMORE;
sowwakeup(so);
}
/*
* socantrcvmore(): indicates that no more data will be received and
* will normally be applied to the socket by a protocol when it detects
* that the peer will send no more data. Data queued for reading in
* the socket may yet be read.
*/
void
socantrcvmore(struct socket *so)
{
KASSERT(solocked(so));
so->so_state |= SS_CANTRCVMORE;
sorwakeup(so);
}
/*
* soroverflow(): indicates that data was attempted to be sent
* but the receiving buffer overflowed.
*/
void
soroverflow(struct socket *so)
{
KASSERT(solocked(so));
so->so_rcv.sb_overflowed++;
if (so->so_options & SO_RERROR) {
so->so_rerror = ENOBUFS;
sorwakeup(so);
}
}
/*
* Wait for data to arrive at/drain from a socket buffer.
*/
int
sbwait(struct sockbuf *sb)
{
struct socket *so;
kmutex_t *lock;
int error;
so = sb->sb_so;
KASSERT(solocked(so));
sb->sb_flags |= SB_NOTIFY;
lock = so->so_lock;
if ((sb->sb_flags & SB_NOINTR) != 0)
error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
else
error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
solockretry(so, lock);
return error;
}
/*
* Wakeup processes waiting on a socket buffer.
* Do asynchronous notification via SIGIO
* if the socket buffer has the SB_ASYNC flag set.
*/
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
int band;
KASSERT(solocked(so));
KASSERT(sb->sb_so == so);
switch (code) {
case POLL_IN:
band = POLLIN|POLLRDNORM;
break;
case POLL_OUT:
band = POLLOUT|POLLWRNORM;
break;
case POLL_HUP:
band = POLLHUP;
break;
default:
band = 0;
#ifdef DIAGNOSTIC
printf("bad siginfo code %d in socket notification.\n", code);
#endif
break;
}
sb->sb_flags &= ~SB_NOTIFY;
selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
cv_broadcast(&sb->sb_cv);
if (sb->sb_flags & SB_ASYNC)
fownsignal(so->so_pgid, SIGIO, code, band, so);
if (sb->sb_flags & SB_UPCALL)
(*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT);
}
/*
* Reset a socket's lock pointer. Wake all threads waiting on the
* socket's condition variables so that they can restart their waits
* using the new lock. The existing lock must be held.
*
* Caller must have issued membar_release before this.
*/
void
solockreset(struct socket *so, kmutex_t *lock)
{
KASSERT(solocked(so));
so->so_lock = lock;
cv_broadcast(&so->so_snd.sb_cv);
cv_broadcast(&so->so_rcv.sb_cv);
cv_broadcast(&so->so_cv);
}
/*
* Socket buffer (struct sockbuf) utility routines.
*
* Each socket contains two socket buffers: one for sending data and
* one for receiving data. Each buffer contains a queue of mbufs,
* information about the number of mbufs and amount of data in the
* queue, and other fields allowing poll() statements and notification
* on data availability to be implemented.
*
* Data stored in a socket buffer is maintained as a list of records.
* Each record is a list of mbufs chained together with the m_next
* field. Records are chained together with the m_nextpkt field. The upper
* level routine soreceive() expects the following conventions to be
* observed when placing information in the receive buffer:
*
* 1. If the protocol requires each message be preceded by the sender's
* name, then a record containing that name must be present before
* any associated data (mbuf's must be of type MT_SONAME).
* 2. If the protocol supports the exchange of ``access rights'' (really
* just additional data associated with the message), and there are
* ``rights'' to be received, then a record containing this data
* should be present (mbuf's must be of type MT_CONTROL).
* 3. If a name or rights record exists, then it must be followed by
* a data record, perhaps of zero length.
*
* Before using a new socket structure it is first necessary to reserve
* buffer space to the socket, by calling sbreserve(). This should commit
* some of the available buffer space in the system buffer pool for the
* socket (currently, it does nothing but enforce limits). The space
* should be released by calling sbrelease() when the socket is destroyed.
*/
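/*
 * Example (illustrative sketch only): a protocol attach routine
 * reserving buffer space before the socket is used.  The send and
 * receive space values are hypothetical per-protocol defaults.
 */
#if 0	/* example only */
	int error;

	error = soreserve(so, 8192 /* send space */, 8192 /* receive space */);
	if (error != 0)
		return error;	/* ENOBUFS: per-user or global limits hit */
#endif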
int
sb_max_set(u_long new_sbmax)
{
int s;
if (new_sbmax < (16 * 1024))
return (EINVAL);
s = splsoftnet();
sb_max = new_sbmax;
sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
splx(s);
return (0);
}
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
KASSERT(so->so_pcb == NULL || solocked(so));
/*
* there's at least one application (the configure script of screen)
* which expects a fifo to be writable even if it already has "some"
* bytes in its buffer.
* so we want to make sure (hiwat - lowat) >= (some bytes).
*
* PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
* we expect it's large enough for such applications.
*/
u_long lowat = MAX(sock_loan_thresh, MCLBYTES);
u_long hiwat = lowat + PIPE_BUF;
if (sndcc < hiwat)
sndcc = hiwat;
if (sbreserve(&so->so_snd, sndcc, so) == 0)
goto bad;
if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
goto bad2;
if (so->so_rcv.sb_lowat == 0)
so->so_rcv.sb_lowat = 1;
if (so->so_snd.sb_lowat == 0)
so->so_snd.sb_lowat = lowat;
if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
return (0);
bad2:
sbrelease(&so->so_snd, so);
bad:
return (ENOBUFS);
}
/*
* Allot mbufs to a sockbuf.
* Attempt to scale mbmax so that mbcnt doesn't become limiting
* if buffering efficiency is near the normal case.
*/
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
struct lwp *l = curlwp; /* XXX */
rlim_t maxcc;
struct uidinfo *uidinfo;
KASSERT(so->so_pcb == NULL || solocked(so));
KASSERT(sb->sb_so == so);
KASSERT(sb_max_adj != 0);
if (cc == 0 || cc > sb_max_adj)
return (0);
maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
uidinfo = so->so_uidinfo;
if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
return 0;
sb->sb_mbmax = uimin(cc * 2, sb_max);
if (sb->sb_lowat > sb->sb_hiwat)
sb->sb_lowat = sb->sb_hiwat;
return (1);
}
/*
* Free mbufs held by a socket, and reserved mbuf space. We do not assert
* that the socket is held locked here: see sorflush().
*/
void
sbrelease(struct sockbuf *sb, struct socket *so)
{
KASSERT(sb->sb_so == so);
sbflush(sb);
(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
sb->sb_mbmax = 0;
}
/*
* Routines to add and remove
* data from an mbuf queue.
*
* The routines sbappend() or sbappendrecord() are normally called to
* append new mbufs to a socket buffer, after checking that adequate
* space is available, comparing the function sbspace() with the amount
* of data to be added. sbappendrecord() differs from sbappend() in
* that data supplied is treated as the beginning of a new record.
* To place a sender's address, optional access rights, and data in a
* socket receive buffer, sbappendaddr() should be used. To place
* access rights and data in a socket receive buffer, sbappendrights()
* should be used. In either case, the new data begins a new record.
* Note that unlike sbappend() and sbappendrecord(), these routines check
* for the caller that there will be enough space to store the data.
* Each fails if there is not enough space, or if it cannot find mbufs
* to store additional information in.
*
* Reliable protocols may use the socket send buffer to hold data
* awaiting acknowledgement. Data is normally copied from a socket
* send buffer in a protocol with m_copym for output to a peer,
* and then removing the data from the socket buffer with sbdrop()
* or sbdroprecord() when the data is acknowledged by the peer.
*/
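/*
 * Example (illustrative sketch only): typical protocol use of the
 * routines described above.  "from", "m" and "acked" are hypothetical;
 * a datagram protocol appends each packet as its own record, while a
 * reliable protocol drops acknowledged data from its send buffer.
 */
#if 0	/* example only */
	/* receive side: new record = sender address + data */
	if (sbappendaddr(&so->so_rcv, from, m, NULL) == 0)
		m_freem(m);	/* no space or no mbufs: drop the packet */
	else
		sorwakeup(so);	/* wake readers and poll()ers */

	/* send side: the peer acknowledged "acked" bytes */
	sbdrop(&so->so_snd, acked);
	sowwakeup(so);		/* writers may now have space again */
#endif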
#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
struct mbuf *m = sb->sb_mb;
KASSERT(solocked(sb->sb_so));
while (m && m->m_nextpkt)
m = m->m_nextpkt;
if (m != sb->sb_lastrecord) {
printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
sb->sb_mb, sb->sb_lastrecord, m);
printf("packet chain:\n");
for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
printf("\t%p\n", m);
panic("sblastrecordchk from %s", where);
}
}
void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
struct mbuf *m = sb->sb_mb;
struct mbuf *n;
KASSERT(solocked(sb->sb_so));
while (m && m->m_nextpkt)
m = m->m_nextpkt;
while (m && m->m_next)
m = m->m_next;
if (m != sb->sb_mbtail) {
printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
sb->sb_mb, sb->sb_mbtail, m);
printf("packet tree:\n");
for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
printf("\t");
for (n = m; n != NULL; n = n->m_next)
printf("%p ", n);
printf("\n");
}
panic("sblastmbufchk from %s", where);
}
}
#endif /* SOCKBUF_DEBUG */
/*
* Link a chain of records onto a socket buffer
*/
#define SBLINKRECORDCHAIN(sb, m0, mlast) \
do { \
if ((sb)->sb_lastrecord != NULL) \
(sb)->sb_lastrecord->m_nextpkt = (m0); \
else \
(sb)->sb_mb = (m0); \
(sb)->sb_lastrecord = (mlast); \
} while (/*CONSTCOND*/0)
#define SBLINKRECORD(sb, m0) \
SBLINKRECORDCHAIN(sb, m0, m0)
/*
* Append mbuf chain m to the last record in the
* socket buffer sb. The additional space associated
* the mbuf chain is recorded in sb. Empty mbufs are
* discarded and mbufs are compacted where possible.
*/
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
struct mbuf *n;
KASSERT(solocked(sb->sb_so));
if (m == NULL)
return;
#ifdef MBUFTRACE
m_claimm(m, sb->sb_mowner);
#endif
SBLASTRECORDCHK(sb, "sbappend 1");
if ((n = sb->sb_lastrecord) != NULL) {
/*
* XXX Would like to simply use sb_mbtail here, but
* XXX I need to verify that I won't miss an EOR that
* XXX way.
*/
do {
if (n->m_flags & M_EOR) {
sbappendrecord(sb, m); /* XXXXXX!!!! */
return;
}
} while (n->m_next && (n = n->m_next));
} else {
/*
* If this is the first record in the socket buffer, it's
* also the last record.
*/
sb->sb_lastrecord = m;
}
sbcompress(sb, m, n);
SBLASTRECORDCHK(sb, "sbappend 2");
}
/*
* This version of sbappend() should only be used when the caller
* absolutely knows that there will never be more than one record
* in the socket buffer, that is, a stream protocol (such as TCP).
*/
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{
KASSERT(solocked(sb->sb_so));
KDASSERT(m->m_nextpkt == NULL);
KASSERT(sb->sb_mb == sb->sb_lastrecord);
SBLASTMBUFCHK(sb, __func__);
#ifdef MBUFTRACE
m_claimm(m, sb->sb_mowner);
#endif
sbcompress(sb, m, sb->sb_mbtail);
sb->sb_lastrecord = sb->sb_mb;
SBLASTRECORDCHK(sb, __func__);
}
#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
struct mbuf *m, *m2;
u_long len, mbcnt;
KASSERT(solocked(sb->sb_so));
len = 0;
mbcnt = 0;
for (m = sb->sb_mb; m; m = m->m_nextpkt) {
for (m2 = m; m2 != NULL; m2 = m2->m_next) {
len += m2->m_len;
mbcnt += MSIZE;
if (m2->m_flags & M_EXT)
mbcnt += m2->m_ext.ext_size;
if (m2->m_nextpkt != NULL)
panic("sbcheck nextpkt");
}
}
if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
mbcnt, sb->sb_mbcnt);
panic("sbcheck");
}
}
#endif
/*
* As above, except the mbuf chain
* begins a new record.
*/
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
struct mbuf *m;
KASSERT(solocked(sb->sb_so));
if (m0 == NULL)
return;
#ifdef MBUFTRACE
m_claimm(m0, sb->sb_mowner);
#endif
/*
* Put the first mbuf on the queue.
* Note this permits zero length records.
*/
sballoc(sb, m0);
SBLASTRECORDCHK(sb, "sbappendrecord 1");
SBLINKRECORD(sb, m0);
m = m0->m_next;
m0->m_next = 0;
if (m && (m0->m_flags & M_EOR)) {
m0->m_flags &= ~M_EOR;
m->m_flags |= M_EOR;
}
sbcompress(sb, m, m0);
SBLASTRECORDCHK(sb, "sbappendrecord 2");
}
/*
* As above except that OOB data
* is inserted at the beginning of the sockbuf,
* but after any other OOB data.
*/
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
struct mbuf *m, **mp;
KASSERT(solocked(sb->sb_so));
if (m0 == NULL)
return;
SBLASTRECORDCHK(sb, "sbinsertoob 1");
for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
again:
switch (m->m_type) {
case MT_OOBDATA:
continue; /* WANT next train */
case MT_CONTROL:
if ((m = m->m_next) != NULL)
goto again; /* inspect THIS train further */
}
break;
}
/*
* Put the first mbuf on the queue.
* Note this permits zero length records.
*/
sballoc(sb, m0);
m0->m_nextpkt = *mp;
if (*mp == NULL) {
/* m0 is actually the new tail */
sb->sb_lastrecord = m0;
}
*mp = m0;
m = m0->m_next;
m0->m_next = 0;
if (m && (m0->m_flags & M_EOR)) {
m0->m_flags &= ~M_EOR;
m->m_flags |= M_EOR;
}
sbcompress(sb, m, m0);
SBLASTRECORDCHK(sb, "sbinsertoob 2");
}
/*
* Append address and data, and optionally, control (ancillary) data
* to the receive queue of a socket. If present,
* m0 must include a packet header with total length.
* Returns 0 if no space in sockbuf or insufficient mbufs.
*/
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
struct mbuf *control)
{
struct mbuf *m, *n, *nlast;
int space, len;
KASSERT(solocked(sb->sb_so));
space = asa->sa_len;
if (m0 != NULL) {
if ((m0->m_flags & M_PKTHDR) == 0)
panic("sbappendaddr");
space += m0->m_pkthdr.len;
#ifdef MBUFTRACE
m_claimm(m0, sb->sb_mowner);
#endif
}
for (n = control; n; n = n->m_next) {
space += n->m_len;
MCLAIM(n, sb->sb_mowner);
if (n->m_next == NULL) /* keep pointer to last control buf */
break;
}
if (space > sbspace(sb))
return (0);
m = m_get(M_DONTWAIT, MT_SONAME);
if (m == NULL)
return (0);
MCLAIM(m, sb->sb_mowner);
/*
* XXX avoid 'comparison always true' warning which isn't easily
* avoided.
*/
len = asa->sa_len;
if (len > MLEN) {
MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return (0);
}
}
m->m_len = asa->sa_len;
memcpy(mtod(m, void *), asa, asa->sa_len);
if (n)
n->m_next = m0; /* concatenate data to control */
else
control = m0;
m->m_next = control;
SBLASTRECORDCHK(sb, "sbappendaddr 1");
for (n = m; n->m_next != NULL; n = n->m_next)
sballoc(sb, n);
sballoc(sb, n);
nlast = n;
SBLINKRECORD(sb, m);
sb->sb_mbtail = nlast;
SBLASTMBUFCHK(sb, "sbappendaddr");
SBLASTRECORDCHK(sb, "sbappendaddr 2");
return (1);
}
/*
* Helper for sbappendchainaddr: prepend a struct sockaddr* to
* an mbuf chain.
*/
static inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
const struct sockaddr *asa)
{
struct mbuf *m;
const int salen = asa->sa_len;
KASSERT(solocked(sb->sb_so));
/* only the first in each chain need be a pkthdr */
m = m_gethdr(M_DONTWAIT, MT_SONAME);
if (m == NULL)
return NULL;
MCLAIM(m, sb->sb_mowner);
#ifdef notyet
if (salen > MHLEN) {
MEXTMALLOC(m, salen, M_NOWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return NULL;
}
}
#else
KASSERT(salen <= MHLEN);
#endif
m->m_len = salen;
memcpy(mtod(m, void *), asa, salen);
m->m_next = m0;
m->m_pkthdr.len = salen + m0->m_pkthdr.len;
return m;
}
int
sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
struct mbuf *m0, int sbprio)
{
struct mbuf *m, *n, *n0, *nlast;
int error;
KASSERT(solocked(sb->sb_so));
/*
* XXX sbprio reserved for encoding priority of this request:
* SB_PRIO_NONE --> honour normal sb limits
* SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
* take whole chain. Intended for large requests
* that should be delivered atomically (all, or none).
* SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
* over normal socket limits, for messages indicating
* buffer overflow in earlier normal/lower-priority messages
* SB_PRIO_BESTEFFORT --> ignore limits entirely.
* Intended for kernel-generated messages only.
* Up to generator to avoid total mbuf resource exhaustion.
*/
(void)sbprio;
if (m0 && (m0->m_flags & M_PKTHDR) == 0)
panic("sbappendaddrchain");
#ifdef notyet
space = sbspace(sb);
/*
* Enforce SB_PRIO_* limits as described above.
*/
#endif
n0 = NULL;
nlast = NULL;
for (m = m0; m; m = m->m_nextpkt) {
struct mbuf *np;
#ifdef MBUFTRACE
m_claimm(m, sb->sb_mowner);
#endif
/* Prepend sockaddr to this record (m) of input chain m0 */
n = m_prepend_sockaddr(sb, m, asa);
if (n == NULL) {
error = ENOBUFS;
goto bad;
}
/* Append record (asa+m) to end of new chain n0 */
if (n0 == NULL) {
n0 = n;
} else {
nlast->m_nextpkt = n;
}
/* Keep track of last record on new chain */
nlast = n;
for (np = n; np; np = np->m_next)
sballoc(sb, np);
}
SBLASTRECORDCHK(sb, "sbappendaddrchain 1");
/* Drop the entire chain of (asa+m) records onto the socket */
SBLINKRECORDCHAIN(sb, n0, nlast);
SBLASTRECORDCHK(sb, "sbappendaddrchain 2");
for (m = nlast; m->m_next; m = m->m_next)
;
sb->sb_mbtail = m;
SBLASTMBUFCHK(sb, "sbappendaddrchain");
return (1);
bad:
/*
* On error, free the prepended addresses. For consistency
* with sbappendaddr(), leave it to our caller to free
* the input record chain passed to us as m0.
*/
while ((n = n0) != NULL) {
struct mbuf *np;
/* Undo the sballoc() of this record */
for (np = n; np; np = np->m_next)
sbfree(sb, np);
n0 = n->m_nextpkt; /* iterate at next prepended address */
np = m_free(n); /* free prepended address (not data) */
}
return error;
}
int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
struct mbuf *m, *mlast, *n;
int space;
KASSERT(solocked(sb->sb_so));
space = 0;
if (control == NULL)
panic("sbappendcontrol");
for (m = control; ; m = m->m_next) {
space += m->m_len;
MCLAIM(m, sb->sb_mowner);
if (m->m_next == NULL)
break;
}
n = m; /* save pointer to last control buffer */
for (m = m0; m; m = m->m_next) {
MCLAIM(m, sb->sb_mowner);
space += m->m_len;
}
if (space > sbspace(sb))
return (0);
n->m_next = m0; /* concatenate data to control */
SBLASTRECORDCHK(sb, "sbappendcontrol 1");
for (m = control; m->m_next != NULL; m = m->m_next)
sballoc(sb, m);
sballoc(sb, m);
mlast = m;
SBLINKRECORD(sb, control);
sb->sb_mbtail = mlast;
SBLASTMBUFCHK(sb, "sbappendcontrol");
SBLASTRECORDCHK(sb, "sbappendcontrol 2");
return (1);
}
/*
* Compress mbuf chain m into the socket
* buffer sb following mbuf n. If n
* is null, the buffer is presumed empty.
*/
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
int eor;
struct mbuf *o;
KASSERT(solocked(sb->sb_so));
eor = 0;
while (m) {
eor |= m->m_flags & M_EOR;
if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) &&
o->m_type == m->m_type))) {
if (sb->sb_lastrecord == m)
sb->sb_lastrecord = m->m_next;
m = m_free(m);
continue;
}
if (n && (n->m_flags & M_EOR) == 0 &&
/* M_TRAILINGSPACE() checks buffer writeability */
m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
m->m_len <= M_TRAILINGSPACE(n) &&
n->m_type == m->m_type) {
memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
(unsigned)m->m_len);
n->m_len += m->m_len;
sb->sb_cc += m->m_len;
m = m_free(m);
continue;
}
if (n)
n->m_next = m;
else
sb->sb_mb = m;
sb->sb_mbtail = m;
sballoc(sb, m);
n = m;
m->m_flags &= ~M_EOR;
m = m->m_next;
n->m_next = 0;
}
if (eor) {
if (n)
n->m_flags |= eor;
else
printf("semi-panic: sbcompress\n");
}
SBLASTMBUFCHK(sb, __func__);
}
/*
* Free all mbufs in a sockbuf.
* Check that all resources are reclaimed.
*/
void
sbflush(struct sockbuf *sb)
{
KASSERT(solocked(sb->sb_so));
KASSERT((sb->sb_flags & SB_LOCK) == 0);
while (sb->sb_mbcnt)
sbdrop(sb, (int)sb->sb_cc);
KASSERT(sb->sb_cc == 0);
KASSERT(sb->sb_mb == NULL);
KASSERT(sb->sb_mbtail == NULL);
KASSERT(sb->sb_lastrecord == NULL);
}
/*
* Drop data from (the front of) a sockbuf.
*/
void
sbdrop(struct sockbuf *sb, int len)
{
struct mbuf *m, *next;
KASSERT(solocked(sb->sb_so));
next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
while (len > 0) {
if (m == NULL) {
if (next == NULL)
panic("sbdrop(%p,%d): cc=%lu",
sb, len, sb->sb_cc);
m = next;
next = m->m_nextpkt;
continue;
}
if (m->m_len > len) {
m->m_len -= len;
m->m_data += len;
sb->sb_cc -= len;
break;
}
len -= m->m_len;
sbfree(sb, m);
m = m_free(m);
}
while (m && m->m_len == 0) {
sbfree(sb, m);
m = m_free(m);
}
if (m) {
sb->sb_mb = m;
m->m_nextpkt = next;
} else
sb->sb_mb = next;
/*
* First part is an inline SB_EMPTY_FIXUP(). Second part
* makes sure sb_lastrecord is up-to-date if we dropped
* part of the last record.
*/
m = sb->sb_mb;
if (m == NULL) {
sb->sb_mbtail = NULL;
sb->sb_lastrecord = NULL;
} else if (m->m_nextpkt == NULL)
sb->sb_lastrecord = m;
}
/*
* Drop a record off the front of a sockbuf
* and move the next record to the front.
*/
void
sbdroprecord(struct sockbuf *sb)
{
struct mbuf *m, *mn;
KASSERT(solocked(sb->sb_so));
m = sb->sb_mb;
if (m) {
sb->sb_mb = m->m_nextpkt;
do {
sbfree(sb, m);
mn = m_free(m);
} while ((m = mn) != NULL);
}
SB_EMPTY_FIXUP(sb);
}
/*
* Create a "control" mbuf containing the specified data
* with the specified type for presentation on a socket buffer.
*/
struct mbuf *
sbcreatecontrol1(void **p, int size, int type, int level, int flags)
{
struct cmsghdr *cp;
struct mbuf *m;
int space = CMSG_SPACE(size);
if ((flags & M_DONTWAIT) && space > MCLBYTES) {
printf("%s: message too large %d\n", __func__, space);
return NULL;
}
if ((m = m_get(flags, MT_CONTROL)) == NULL)
return NULL;
if (space > MLEN) {
if (space > MCLBYTES)
MEXTMALLOC(m, space, M_WAITOK);
else
MCLGET(m, flags);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return NULL;
}
}
cp = mtod(m, struct cmsghdr *);
*p = CMSG_DATA(cp);
m->m_len = space;
cp->cmsg_len = CMSG_LEN(size);
cp->cmsg_level = level;
cp->cmsg_type = type;
memset(cp + 1, 0, CMSG_LEN(0) - sizeof(*cp));
memset((uint8_t *)*p + size, 0, CMSG_ALIGN(size) - size);
return m;
}
struct mbuf *
sbcreatecontrol(void *p, int size, int type, int level)
{
struct mbuf *m;
void *v;
m = sbcreatecontrol1(&v, size, type, level, M_DONTWAIT);
if (m == NULL)
return NULL;
memcpy(v, p, size);
return m;
}
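/*
 * Example (illustrative sketch only): building a control mbuf that
 * carries a timestamp, as a protocol might do when SO_TIMESTAMP is set
 * on the receiving socket.
 */
#if 0	/* example only */
	struct timeval tv;
	struct mbuf *control;

	microtime(&tv);
	control = sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET);
	if (control == NULL) {
		/* no mbuf available: deliver data without ancillary info */
	}
#endif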
void
solockretry(struct socket *so, kmutex_t *lock)
{
while (lock != atomic_load_relaxed(&so->so_lock)) {
mutex_exit(lock);
lock = atomic_load_consume(&so->so_lock);
mutex_enter(lock);
}
}
bool
solocked(const struct socket *so)
{
/*
* Used only for diagnostic assertions, so so_lock should be
* stable at this point, hence no need for atomic_load_*.
*/
return mutex_owned(so->so_lock);
}
bool
solocked2(const struct socket *so1, const struct socket *so2)
{
const kmutex_t *lock;
/*
* Used only for diagnostic assertions, so so_lock should be
* stable at this point, hence no need for atomic_load_*.
*/
lock = so1->so_lock;
if (lock != so2->so_lock)
return false;
return mutex_owned(lock);
}
/*
* sosetlock: assign a default lock to a new socket.
*/
void
sosetlock(struct socket *so)
{
if (so->so_lock == NULL) {
kmutex_t *lock = softnet_lock;
so->so_lock = lock;
mutex_obj_hold(lock);
mutex_enter(lock);
}
KASSERT(solocked(so));
}
/*
* Set lock on sockbuf sb; sleep if lock is already held.
* Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
* Returns error without lock if sleep is interrupted.
*/
int
sblock(struct sockbuf *sb, int wf)
{
struct socket *so;
kmutex_t *lock;
int error;
KASSERT(solocked(sb->sb_so));
for (;;) {
if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
sb->sb_flags |= SB_LOCK;
return 0;
}
if (wf != M_WAITOK)
return EWOULDBLOCK;
so = sb->sb_so;
lock = so->so_lock;
if ((sb->sb_flags & SB_NOINTR) != 0) {
cv_wait(&so->so_cv, lock);
error = 0;
} else
error = cv_wait_sig(&so->so_cv, lock);
if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
solockretry(so, lock);
if (error != 0)
return error;
}
}
void
sbunlock(struct sockbuf *sb)
{
struct socket *so;
so = sb->sb_so;
KASSERT(solocked(so));
KASSERT((sb->sb_flags & SB_LOCK) != 0);
sb->sb_flags &= ~SB_LOCK;
cv_broadcast(&so->so_cv);
}
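/*
 * Example (illustrative sketch only): serializing a long operation on a
 * socket buffer with sblock()/sbunlock().  The socket lock must be held;
 * sblock() may sleep and, per the rules above, re-validate so_lock.
 */
#if 0	/* example only */
	solock(so);
	error = sblock(&so->so_rcv, M_WAITOK);
	if (error != 0) {
		sounlock(so);
		return error;	/* interrupted by a signal */
	}
	/* ... consume data from so->so_rcv ... */
	sbunlock(&so->so_rcv);
	sounlock(so);
#endif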
int
sowait(struct socket *so, bool catch_p, int timo)
{
kmutex_t *lock;
int error;
KASSERT(solocked(so));
KASSERT(catch_p || timo != 0);
lock = so->so_lock;
if (catch_p)
error = cv_timedwait_sig(&so->so_cv, lock, timo);
else
error = cv_timedwait(&so->so_cv, lock, timo);
if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
solockretry(so, lock);
return error;
}
#ifdef DDB
/*
* Currently, sofindproc() is used only from DDB. It could be used from
* elsewhere by using db_mutex_enter().
*/
static inline int
db_mutex_enter(kmutex_t *mtx)
{
int rv;
if (!db_active) {
mutex_enter(mtx);
rv = 1;
} else
rv = mutex_tryenter(mtx);
return rv;
}
int
sofindproc(struct socket *so, int all, void (*pr)(const char *, ...))
{
proc_t *p;
filedesc_t *fdp;
fdtab_t *dt;
fdfile_t *ff;
file_t *fp = NULL;
int found = 0;
int i, t;
if (so == NULL)
return 0;
t = db_mutex_enter(&proc_lock);
if (!t) {
pr("could not acquire proc_lock mutex\n");
return 0;
}
PROCLIST_FOREACH(p, &allproc) {
if (p->p_stat == SIDL)
continue;
fdp = p->p_fd;
t = db_mutex_enter(&fdp->fd_lock);
if (!t) {
pr("could not acquire fd_lock mutex\n");
continue;
}
dt = atomic_load_consume(&fdp->fd_dt);
for (i = 0; i < dt->dt_nfiles; i++) {
ff = dt->dt_ff[i];
if (ff == NULL)
continue;
fp = atomic_load_consume(&ff->ff_file);
if (fp == NULL)
continue;
t = db_mutex_enter(&fp->f_lock);
if (!t) {
pr("could not acquire f_lock mutex\n");
continue;
}
if ((struct socket *)fp->f_data != so) {
mutex_exit(&fp->f_lock);
continue;
}
found++;
if (pr)
pr("socket %p: owner %s(pid=%d)\n",
so, p->p_comm, p->p_pid);
mutex_exit(&fp->f_lock);
if (all == 0)
break;
}
mutex_exit(&fdp->fd_lock);
if (all == 0 && found != 0)
break;
}
mutex_exit(&proc_lock);
return found;
}
void
socket_print(const char *modif, void (*pr)(const char *, ...))
{
file_t *fp;
struct socket *so;
struct sockbuf *sb_snd, *sb_rcv;
struct mbuf *m_rec, *m;
bool opt_v = false;
bool opt_m = false;
bool opt_a = false;
bool opt_p = false;
int nrecs, nmbufs;
char ch;
const char *family;
while ( (ch = *(modif++)) != '\0') {
switch (ch) {
case 'v':
opt_v = true;
break;
case 'm':
opt_m = true;
break;
case 'a':
opt_a = true;
break;
case 'p':
opt_p = true;
break;
}
}
if (opt_v == false && pr)
(pr)("Ignore empty sockets. use /v to print all.\n");
if (opt_p == true && pr)
(pr)("Don't search owner process.\n");
LIST_FOREACH(fp, &filehead, f_list) {
if (fp->f_type != DTYPE_SOCKET)
continue;
so = (struct socket *)fp->f_data;
if (so == NULL)
continue;
if (so->so_proto->pr_domain->dom_family == AF_INET)
family = "INET";
#ifdef INET6
else if (so->so_proto->pr_domain->dom_family == AF_INET6)
family = "INET6";
#endif
else if (so->so_proto->pr_domain->dom_family == pseudo_AF_KEY)
family = "KEY";
else if (so->so_proto->pr_domain->dom_family == AF_ROUTE)
family = "ROUTE";
else
continue;
sb_snd = &so->so_snd;
sb_rcv = &so->so_rcv;
if (opt_v != true &&
sb_snd->sb_cc == 0 && sb_rcv->sb_cc == 0)
continue;
pr("---SOCKET %p: type %s\n", so, family);
if (opt_p != true)
sofindproc(so, opt_a == true ? 1 : 0, pr);
pr("Send Buffer Bytes: %d [bytes]\n", sb_snd->sb_cc);
pr("Send Buffer mbufs:\n");
m_rec = m = sb_snd->sb_mb;
nrecs = 0;
nmbufs = 0;
while (m_rec) {
nrecs++;
if (opt_m == true)
pr(" mbuf chain %p\n", m_rec);
while (m) {
nmbufs++;
m = m->m_next;
}
m_rec = m = m_rec->m_nextpkt;
}
pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs);
pr("Recv Buffer Usage: %d [bytes]\n", sb_rcv->sb_cc);
pr("Recv Buffer mbufs:\n");
m_rec = m = sb_rcv->sb_mb;
nrecs = 0;
nmbufs = 0;
while (m_rec) {
nrecs++;
if (opt_m == true)
pr(" mbuf chain %p\n", m_rec);
while (m) {
nmbufs++;
m = m->m_next;
}
m_rec = m = m_rec->m_nextpkt;
}
pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs);
}
}
#endif /* DDB */
/* $NetBSD: ip6_var.h,v 1.94 2024/02/09 22:08:37 andvar Exp $ */
/* $KAME: ip6_var.h,v 1.33 2000/06/11 14:59:20 jinmei Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_var.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NETINET6_IP6_VAR_H_
#define _NETINET6_IP6_VAR_H_
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/socketvar.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
struct ip6_moptions {
if_index_t im6o_multicast_if_index; /* I/F for outgoing multicasts */
u_char im6o_multicast_hlim; /* hoplimit for outgoing multicasts */
u_char im6o_multicast_loop; /* 1 => hear sends if a member */
LIST_HEAD(, in6_multi_mship) im6o_memberships;
};
/*
* Control options for outgoing packets
*/
/* Routing header related info */
struct ip6po_rhinfo {
struct ip6_rthdr *ip6po_rhi_rthdr; /* Routing header */
struct route ip6po_rhi_route; /* Route to the 1st hop */
};
#define ip6po_rthdr ip6po_rhinfo.ip6po_rhi_rthdr
#define ip6po_route ip6po_rhinfo.ip6po_rhi_route
/* Nexthop related info */
struct ip6po_nhinfo {
struct sockaddr *ip6po_nhi_nexthop;
struct route ip6po_nhi_route; /* Route to the nexthop */
};
#define ip6po_nexthop ip6po_nhinfo.ip6po_nhi_nexthop
#define ip6po_nextroute ip6po_nhinfo.ip6po_nhi_route
struct ip6_pktopts {
int ip6po_hlim; /* Hoplimit for outgoing packets */
struct in6_pktinfo *ip6po_pktinfo; /* Outgoing IF/address information */
struct ip6po_nhinfo ip6po_nhinfo; /* Next-hop address information */
struct ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */
struct ip6_dest *ip6po_dest1; /* Destination options header(1st part) */
struct ip6po_rhinfo ip6po_rhinfo; /* Routing header related info. */
struct ip6_dest *ip6po_dest2; /* Destination options header(2nd part) */
int ip6po_tclass; /* traffic class */
int ip6po_minmtu; /* fragment vs PMTU discovery policy */
#define IP6PO_MINMTU_MCASTONLY -1 /* default; send at min MTU for multicast */
#define IP6PO_MINMTU_DISABLE 0 /* always perform pmtu disc */
#define IP6PO_MINMTU_ALL 1 /* always send at min MTU */
int ip6po_prefer_tempaddr; /* whether temporary addresses are
* preferred as source address */
#define IP6PO_TEMPADDR_SYSTEM -1 /* follow the system default */
#define IP6PO_TEMPADDR_NOTPREFER 0 /* not prefer temporary address */
#define IP6PO_TEMPADDR_PREFER 1 /* prefer temporary address */
int ip6po_flags;
#if 0 /* parameters in this block are obsolete; do not reuse the values. */
#define IP6PO_REACHCONF 0x01 /* upper-layer reachability confirmation. */
#define IP6PO_MINMTU 0x02 /* use minimum MTU (IPV6_USE_MIN_MTU) */
#endif
#define IP6PO_DONTFRAG 0x04 /* disable fragmentation (IPV6_DONTFRAG) */
};
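/*
 * A caller preparing per-packet options would typically initialize a
 * struct ip6_pktopts with ip6_initpktopts() (declared below, kernel
 * only) and override just the fields it needs before handing it to
 * ip6_output().  Hedged sketch; the surrounding variables (m, ro, inp,
 * error) are hypothetical:
 *
 *	struct ip6_pktopts opts;
 *
 *	ip6_initpktopts(&opts);
 *	opts.ip6po_hlim = 1;
 *	opts.ip6po_minmtu = IP6PO_MINMTU_ALL;
 *	error = ip6_output(m, &opts, &ro, 0, NULL, inp, NULL);
 */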
/*
* IPv6 statistics.
* Each counter is an unsigned 64-bit value.
*/
#define IP6_STAT_TOTAL 0 /* total packets received */
#define IP6_STAT_TOOSHORT 1 /* packet too short */
#define IP6_STAT_TOOSMALL 2 /* not enough data */
#define IP6_STAT_FRAGMENTS 3 /* fragments received */
#define IP6_STAT_FRAGDROPPED 4 /* frags dropped (dups, out of space) */
#define IP6_STAT_FRAGTIMEOUT 5 /* fragments timed out */
#define IP6_STAT_FRAGOVERFLOW 6 /* fragments that exceed limit */
#define IP6_STAT_FORWARD 7 /* packets forwarded */
#define IP6_STAT_CANTFORWARD 8 /* packets rcvd for unreachable dst */
#define IP6_STAT_REDIRECTSENT 9 /* packets forwarded on same net */
#define IP6_STAT_DELIVERED 10 /* datagrams delivered to upper level */
#define IP6_STAT_LOCALOUT 11 /* total IP packets generated here */
#define IP6_STAT_ODROPPED 12 /* lost packets due to nobufs, etc. */
#define IP6_STAT_REASSEMBLED 13 /* total packets reassembled ok */
#define IP6_STAT_FRAGMENTED 14 /* datagrams successfully fragmented */
#define IP6_STAT_OFRAGMENTS 15 /* output fragments created */
#define IP6_STAT_CANTFRAG 16 /* don't fragment flag was set, etc. */
#define IP6_STAT_BADOPTIONS 17 /* error in option processing */
#define IP6_STAT_NOROUTE 18 /* packets discarded due to no route */
#define IP6_STAT_BADVERS 19 /* ip6 version != 6 */
#define IP6_STAT_RAWOUT 20 /* total raw ip packets generated */
#define IP6_STAT_BADSCOPE 21 /* scope error */
#define IP6_STAT_NOTMEMBER 22 /* don't join this multicast group */
#define IP6_STAT_NXTHIST 23 /* next header histogram */
/* space for 256 counters */
#define IP6_STAT_M1 279 /* one mbuf */
#define IP6_STAT_M2M 280 /* two or more mbuf */
/* space for 32 counters */
#define IP6_STAT_MEXT1 312 /* one ext mbuf */
#define IP6_STAT_MEXT2M 313 /* two or more ext mbuf */
#define IP6_STAT_EXTHDRTOOLONG 314 /* ext hdrs are not contiguous */
#define IP6_STAT_NOGIF 315 /* no matching gif found */
#define IP6_STAT_TOOMANYHDR 316 /* discarded due to too many headers */
/*
* statistics for improvement of the source address selection
* algorithm:
* XXX: hardcoded 16 = # of ip6 multicast scope types + 1
*/
#define IP6_STAT_SOURCES_NONE 317 /* number of times that address
selection fails */
#define IP6_STAT_SOURCES_SAMEIF 318 /* number of times that an address
on the outgoing I/F is chosen */
/* space for 16 counters */
#define IP6_STAT_SOURCES_OTHERIF 334 /* number of times that an address on
a non-outgoing I/F is chosen */
/* space for 16 counters */
#define IP6_STAT_SOURCES_SAMESCOPE 350 /* number of times that an address that
has the same scope from the dest.
is chosen */
/* space for 16 counters */
#define IP6_STAT_SOURCES_OTHERSCOPE 366 /* number of times that an address that
has a different scope from the dest.
is chosen */
/* space for 16 counters */
#define IP6_STAT_SOURCES_DEPRECATED 382 /* number of times that a deprecated
address is chosen */
/* space for 16 counters */
#define IP6_STAT_FORWARD_CACHEHIT 398
#define IP6_STAT_FORWARD_CACHEMISS 399
#define IP6_STAT_FASTFORWARD 400 /* packets fast forwarded */
#define IP6_STAT_FASTFORWARDFLOWS 401 /* number of fast forward flows */
#define IP6_STAT_NOIPSEC 402 /* no matching ipsec(4) found */
#define IP6_STAT_PFILDROP_IN 403 /* dropped by pfil (PFIL_IN) */
#define IP6_STAT_PFILDROP_OUT 404 /* dropped by pfil (PFIL_OUT) */
#define IP6_STAT_IPSECDROP_IN 405 /* dropped by IPsec SP check */
#define IP6_STAT_IPSECDROP_OUT 406 /* dropped by IPsec SP check */
#define IP6_STAT_IFDROP 407 /* dropped due to interface state */
#define IP6_STAT_IDROPPED 408 /* lost packets due to nobufs, etc. */
#define IP6_STAT_TIMXCEED 409 /* hop limit exceeded */
#define IP6_STAT_TOOBIG 410 /* packet bigger than MTU */
#define IP6_STAT_RTREJECT 411 /* rejected by route */
#define IP6_NSTATS 412
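/*
 * Counters are bumped by index via ip6_statinc() (declared below,
 * kernel only).  Ranges such as IP6_STAT_NXTHIST reserve a block of
 * consecutive slots addressed by adding an offset.  Illustrative
 * sketch, not taken from this header:
 *
 *	ip6_statinc(IP6_STAT_TOOSHORT);
 *	ip6_statinc(IP6_STAT_NXTHIST + nxt);	(nxt = next header value)
 */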
#define IP6FLOW_HASHBITS 6 /* should not be a multiple of 8 */
/*
* Structure for an IPv6 flow (ip6_fastforward).
*/
struct ip6flow {
TAILQ_ENTRY(ip6flow) ip6f_list; /* next in active list */
TAILQ_ENTRY(ip6flow) ip6f_hash; /* next ip6flow in bucket */
size_t ip6f_hashidx; /* own hash index of ipflowtable[] */
struct in6_addr ip6f_dst; /* destination address */
struct in6_addr ip6f_src; /* source address */
struct route ip6f_ro; /* associated route entry */
u_int32_t ip6f_flow; /* flow (tos) */
u_quad_t ip6f_uses; /* number of uses in this period */
u_quad_t ip6f_last_uses; /* number of uses in last period */
u_quad_t ip6f_dropped; /* ENOBUFS returned by if_output */
u_quad_t ip6f_forwarded; /* packets forwarded */
u_int ip6f_timer; /* lifetime timer */
};
#ifdef _KERNEL
#include <sys/protosw.h>
#include <sys/cprng.h>
/*
 * Auxiliary attributes of incoming IPv6 packets, which are initialized when
 * we come into ip6_input().
* XXX do not make it a kitchen sink!
*/
struct ip6aux {
/* ip6.ip6_dst */
struct in6_addr ip6a_src;
uint32_t ip6a_scope_id;
int ip6a_flags;
};
/* flags passed to ip6_output as last parameter */
#define IPV6_UNSPECSRC 0x01 /* allow :: as the source address */
#define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */
#define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */
extern u_int32_t ip6_id; /* fragment identifier */
extern int ip6_defhlim; /* default hop limit */
extern int ip6_defmcasthlim; /* default multicast hop limit */
extern int ip6_forwarding; /* act as router? */
extern int ip6_sendredirect; /* send ICMPv6 redirect? */
extern int ip6_use_deprecated; /* allow deprecated addr as source */
extern int ip6_mcast_pmtu; /* enable pMTU discovery for multicast? */
extern int ip6_v6only;
extern int ip6_neighborgcthresh; /* Threshold # of NDP entries for GC */
extern int ip6_maxdynroutes; /* Max # of routes created via redirect */
extern int ip6_param_rt_msg; /* How to send parameter changing rtm */
extern struct socket *ip6_mrouter; /* multicast routing daemon */
extern int ip6_sendredirects; /* send IP redirects when forwarding? */
extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */
extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */
extern int ip6_keepfaith; /* Firewall Aided Internet Translator */
extern int ip6_log_interval;
extern time_t ip6_log_time;
extern int ip6_hdrnestlimit; /* upper limit of # of extension headers */
extern int ip6_dad_count; /* DupAddrDetectionTransmits */
extern int ip6_auto_flowlabel;
extern int ip6_auto_linklocal;
extern int ip6_anonportmin; /* minimum ephemeral port */
extern int ip6_anonportmax; /* maximum ephemeral port */
extern int ip6_lowportmin; /* minimum reserved port */
extern int ip6_lowportmax; /* maximum reserved port */
extern int ip6_prefer_tempaddr; /* whether to prefer temporary addresses
in the source address selection */
extern int ip6_use_defzone; /* whether to use the default scope zone
when unspecified */
#ifdef GATEWAY
extern int ip6_maxflows; /* maximum amount of flows for ip6ff */
extern int ip6_hashsize; /* size of hash table */
#endif
struct inpcb;
extern const struct pr_usrreqs rip6_usrreqs;
int icmp6_ctloutput(int, struct socket *, struct sockopt *);
struct mbuf;
void ip6_init(void);
const struct ip6aux *ip6_getdstifaddr(struct mbuf *);
void ip6_freepcbopts(struct ip6_pktopts *);
void ip6_freemoptions(struct ip6_moptions *);
int ip6_unknown_opt(u_int8_t *, struct mbuf *, int);
int ip6_get_prevhdr(struct mbuf *, int);
int ip6_nexthdr(struct mbuf *, int, int, int *);
int ip6_lasthdr(struct mbuf *, int, int, int *);
struct ip6_hdr;
int ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *);
int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
void ip6_savecontrol(struct inpcb *, struct mbuf **, struct ip6_hdr *,
struct mbuf *);
void ip6_notify_pmtu(struct inpcb *, const struct sockaddr_in6 *,
u_int32_t *);
int ip6_sysctl(int *, u_int, void *, size_t *, void *, size_t);
void ip6_forward(struct mbuf *, int, struct ifnet *);
void ip6_mloopback(struct ifnet *, struct mbuf *,
const struct sockaddr_in6 *);
int ip6_output(struct mbuf *, struct ip6_pktopts *, struct route *, int,
struct ip6_moptions *, struct inpcb *, struct ifnet **);
int ip6_if_output(struct ifnet * const, struct ifnet * const,
struct mbuf * const,
const struct sockaddr_in6 * const, const struct rtentry *);
int ip6_ctloutput(int, struct socket *, struct sockopt *);
int ip6_raw_ctloutput(int, struct socket *, struct sockopt *);
void ip6_initpktopts(struct ip6_pktopts *);
int ip6_setpktopts(struct mbuf *, struct ip6_pktopts *,
struct ip6_pktopts *, kauth_cred_t, int);
void ip6_clearpktopts(struct ip6_pktopts *, int);
struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int);
int ip6_optlen(struct inpcb *);
void ip6_statinc(u_int);
int route6_input(struct mbuf **, int *, int);
void frag6_init(void);
int frag6_input(struct mbuf **, int *, int);
int ip6_reass_packet(struct mbuf **, int);
void frag6_slowtimo(void);
void frag6_fasttimo(void);
void frag6_drain(void);
void frag6_drainstub(void);
int ip6flow_init(int);
void ip6flow_poolinit(void);
struct ip6flow *ip6flow_reap(int);
void ip6flow_create(struct route *, struct mbuf *);
void ip6flow_slowtimo(void);
int ip6flow_invalidate_all(int);
void rip6_init(void);
int rip6_input(struct mbuf **, int *, int);
void *rip6_ctlinput(int, const struct sockaddr *, void *);
int rip6_ctloutput(int, struct socket *, struct sockopt *);
int rip6_output(struct mbuf *, struct socket *, struct sockaddr_in6 *,
struct mbuf *);
int rip6_attach(struct socket *, int);
int rip6_usrreq(struct socket *,
int, struct mbuf *, struct mbuf *, struct mbuf *, struct lwp *);
int dest6_input(struct mbuf **, int *, int);
int none_input(struct mbuf **, int *, int);
struct route;
int in6_selectsrc(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route *, struct in6_addr *,
struct ifnet **, struct psref *, struct in6_addr *);
int in6_selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
struct route **, struct rtentry **, bool);
int ip6_get_membership(const struct sockopt *, struct ifnet **,
struct psref *, void *, size_t);
static __inline uint32_t
ip6_randomid(void)
{
return cprng_fast32();
}
static __inline uint32_t
ip6_randomflowlabel(void)
{
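	/* The IPv6 flow label field is 20 bits wide, hence the 0xfffff mask. */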
return cprng_fast32() & 0xfffff;
}
static __inline bool
ip6_dad_enabled(void)
{
return ip6_dad_count > 0;
}
#endif /* _KERNEL */
#endif /* !_NETINET6_IP6_VAR_H_ */
/* $NetBSD: bufq_impl.h,v 1.10 2016/11/16 00:46:46 pgoyette Exp $ */
/* NetBSD: bufq.h,v 1.3 2005/03/31 11:28:53 yamt Exp */
/* NetBSD: buf.h,v 1.75 2004/09/18 16:40:11 yamt Exp */
/*-
* Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
*/
#if !defined(_KERNEL)
#error not supposed to be exposed to userland.
#endif
struct bufq_strat;
/*
* Device driver buffer queue.
*/
struct bufq_state {
void (*bq_put)(struct bufq_state *, struct buf *);
struct buf *(*bq_get)(struct bufq_state *, int);
struct buf *(*bq_cancel)(struct bufq_state *, struct buf *);
void (*bq_fini)(struct bufq_state *);
void *bq_private;
int bq_flags; /* Flags from bufq_alloc() */
struct bufq_strat *bq_strat;
};
static __inline void *bufq_private(const struct bufq_state *) __unused;
static __inline bool buf_inorder(const struct buf *, const struct buf *, int)
__unused;
#include <sys/null.h> /* for NULL */
static __inline void *
bufq_private(const struct bufq_state *bufq)
{
return bufq->bq_private;
}
/*
 * Check if two bufs are in ascending order.
 *
 * This function considers a NULL buf to sort after any non-NULL buf.
 *
 * This function returns false if the two are "the same".
*/
static __inline bool
buf_inorder(const struct buf *bp, const struct buf *bq, int sortby)
{
	KASSERT(bp != NULL || bq != NULL);

	if (bp == NULL || bq == NULL)
		return (bq == NULL);
if (sortby == BUFQ_SORT_CYLINDER) {
if (bp->b_cylinder != bq->b_cylinder)
return bp->b_cylinder < bq->b_cylinder;
else
return bp->b_rawblkno < bq->b_rawblkno;
} else
return bp->b_rawblkno < bq->b_rawblkno;
}
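/*
 * A sorting strategy typically uses buf_inorder() to find the insertion
 * point in its queue.  Minimal sketch, assuming a strategy-private
 * TAILQ (q->q_list) and sort key (q->q_sortby), both hypothetical:
 *
 *	struct buf *nbp;
 *
 *	TAILQ_FOREACH(nbp, &q->q_list, b_actq) {
 *		if (buf_inorder(bp, nbp, q->q_sortby)) {
 *			TAILQ_INSERT_BEFORE(nbp, bp, b_actq);
 *			return;
 *		}
 *	}
 *	TAILQ_INSERT_TAIL(&q->q_list, bp, b_actq);
 */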
struct bufq_strat {
const char *bs_name;
void (*bs_initfn)(struct bufq_state *);
int bs_prio;
int bs_refcnt;
SLIST_ENTRY(bufq_strat) bs_next;
};
#define BUFQ_DEFINE(name, prio, initfn) \
static struct bufq_strat bufq_strat_##name = { \
.bs_name = #name, \
.bs_prio = prio, \
.bs_initfn = initfn, \
.bs_refcnt = 0 \
};
int bufq_register(struct bufq_strat *);
int bufq_unregister(struct bufq_strat *);
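/*
 * A strategy module would normally declare itself with BUFQ_DEFINE()
 * and register/unregister from its module command handler.  Hedged
 * sketch; the strategy name, priority, and init function are
 * hypothetical:
 *
 *	BUFQ_DEFINE(mystrat, 20, bufq_mystrat_init);
 *
 *	case MODULE_CMD_INIT:
 *		return bufq_register(&bufq_strat_mystrat);
 *	case MODULE_CMD_FINI:
 *		return bufq_unregister(&bufq_strat_mystrat);
 */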
/* $NetBSD: kern_entropy.c,v 1.66 2023/10/04 20:28:06 ad Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Entropy subsystem
*
* * Each CPU maintains a per-CPU entropy pool so that gathering
* entropy requires no interprocessor synchronization, except
* early at boot when we may be scrambling to gather entropy as
* soon as possible.
*
* - entropy_enter gathers entropy and never drops it on the
* floor, at the cost of sometimes having to do cryptography.
*
* - entropy_enter_intr gathers entropy or drops it on the
* floor, with low latency. Work to stir the pool or kick the
* housekeeping thread is scheduled in soft interrupts.
*
* * entropy_enter immediately enters into the global pool if it
* can transition to full entropy in one swell foop. Otherwise,
* it defers to a housekeeping thread that consolidates entropy,
* but only when the CPUs collectively have full entropy, in
* order to mitigate iterative-guessing attacks.
*
* * The entropy housekeeping thread continues to consolidate
* entropy even after we think we have full entropy, in case we
* are wrong, but is limited to one discretionary consolidation
* per minute, and only when new entropy is actually coming in,
* to limit performance impact.
*
* * The entropy epoch is the number that changes when we
* transition from partial entropy to full entropy, so that
* users can easily determine when to reseed. This also
* facilitates an operator explicitly causing everything to
* reseed by sysctl -w kern.entropy.consolidate=1.
*
* * Entropy depletion is available for testing (or if you're into
* that sort of thing), with sysctl -w kern.entropy.depletion=1;
* the logic to support it is small, to minimize chance of bugs.
*
* * While cold, a single global entropy pool is available for
* entering and extracting, serialized through splhigh/splx.
* The per-CPU entropy pool data structures are initialized in
* entropy_init and entropy_init_late (separated mainly for
* hysterical raisins at this point), but are not used until the
* system is warm, at which point access to the global entropy
* pool is limited to thread and softint context and serialized
* by E->lock.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_entropy.c,v 1.66 2023/10/04 20:28:06 ad Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/compat_stub.h>
#include <sys/condvar.h>
#include <sys/cpu.h>
#include <sys/entropy.h>
#include <sys/errno.h>
#include <sys/evcnt.h>
#include <sys/event.h>
#include <sys/file.h>
#include <sys/intr.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/lwp.h>
#include <sys/module_hook.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/reboot.h>
#include <sys/rnd.h> /* legacy kernel API */
#include <sys/rndio.h> /* userland ioctl interface */
#include <sys/rndsource.h> /* kernel rndsource driver API */
#include <sys/select.h>
#include <sys/selinfo.h>
#include <sys/sha1.h> /* for boot seed checksum */
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/xcall.h>
#include <lib/libkern/entpool.h>
#include <machine/limits.h>
#ifdef __HAVE_CPU_COUNTER
#include <machine/cpu_counter.h>
#endif
#define MINENTROPYBYTES ENTROPY_CAPACITY
#define MINENTROPYBITS (MINENTROPYBYTES*NBBY)
#define MINSAMPLES (2*MINENTROPYBITS)
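/*
 * For orientation: assuming the usual ENTROPY_CAPACITY of 32 bytes from
 * <sys/entropy.h>, MINENTROPYBYTES = 32, MINENTROPYBITS = 256, and
 * MINSAMPLES = 512.
 */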
/*
* struct entropy_cpu
*
* Per-CPU entropy state. The pool is allocated separately
* because percpu(9) sometimes moves per-CPU objects around
* without zeroing them, which would lead to unwanted copies of
* sensitive secrets. The evcnt is allocated separately because
* evcnt(9) assumes it stays put in memory.
*/
struct entropy_cpu {
struct entropy_cpu_evcnt {
struct evcnt softint;
struct evcnt intrdrop;
struct evcnt intrtrunc;
} *ec_evcnt;
struct entpool *ec_pool;
unsigned ec_bitspending;
unsigned ec_samplespending;
bool ec_locked;
};
/*
* struct entropy_cpu_lock
*
* State for locking the per-CPU entropy state.
*/
struct entropy_cpu_lock {
int ecl_s;
long ecl_pctr;
};
/*
* struct rndsource_cpu
*
* Per-CPU rndsource state.
*/
struct rndsource_cpu {
unsigned rc_entropybits;
unsigned rc_timesamples;
unsigned rc_datasamples;
rnd_delta_t rc_timedelta;
};
/*
* entropy_global (a.k.a. E for short in this file)
*
* Global entropy state. Writes protected by the global lock.
* Some fields, marked (A), can be read outside the lock, and are
* maintained with atomic_load/store_relaxed.
*/
struct {
kmutex_t lock; /* covers all global state */
struct entpool pool; /* global pool for extraction */
unsigned bitsneeded; /* (A) needed globally */
unsigned bitspending; /* pending in per-CPU pools */
unsigned samplesneeded; /* (A) needed globally */
unsigned samplespending; /* pending in per-CPU pools */
unsigned timestamp; /* (A) time of last consolidation */
unsigned epoch; /* (A) changes when needed -> 0 */
kcondvar_t cv; /* notifies state changes */
struct selinfo selq; /* notifies needed -> 0 */
struct lwp *sourcelock; /* lock on list of sources */
kcondvar_t sourcelock_cv; /* notifies sourcelock release */
LIST_HEAD(,krndsource) sources; /* list of entropy sources */
bool consolidate; /* kick thread to consolidate */
bool seed_rndsource; /* true if seed source is attached */
bool seeded; /* true if seed file already loaded */
} entropy_global __cacheline_aligned = {
/* Fields that must be initialized when the kernel is loaded. */
.bitsneeded = MINENTROPYBITS,
.samplesneeded = MINSAMPLES,
.epoch = (unsigned)-1, /* -1 means entropy never consolidated */
.sources = LIST_HEAD_INITIALIZER(entropy_global.sources),
};
#define E (&entropy_global) /* declutter */
/* Read-mostly globals */
static struct percpu *entropy_percpu __read_mostly; /* struct entropy_cpu */
static void *entropy_sih __read_mostly; /* softint handler */
static struct lwp *entropy_lwp __read_mostly; /* housekeeping thread */
static struct krndsource seed_rndsource __read_mostly;
/*
* Event counters
*
* Must be careful with adding these because they can serve as
* side channels.
*/
static struct evcnt entropy_discretionary_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "discretionary");
EVCNT_ATTACH_STATIC(entropy_discretionary_evcnt);
static struct evcnt entropy_immediate_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "immediate");
EVCNT_ATTACH_STATIC(entropy_immediate_evcnt);
static struct evcnt entropy_partial_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "partial");
EVCNT_ATTACH_STATIC(entropy_partial_evcnt);
static struct evcnt entropy_consolidate_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "consolidate");
EVCNT_ATTACH_STATIC(entropy_consolidate_evcnt);
static struct evcnt entropy_extract_fail_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "extract fail");
EVCNT_ATTACH_STATIC(entropy_extract_fail_evcnt);
static struct evcnt entropy_request_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "request");
EVCNT_ATTACH_STATIC(entropy_request_evcnt);
static struct evcnt entropy_deplete_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "deplete");
EVCNT_ATTACH_STATIC(entropy_deplete_evcnt);
static struct evcnt entropy_notify_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "notify");
EVCNT_ATTACH_STATIC(entropy_notify_evcnt);
/* Sysctl knobs */
static bool entropy_collection = 1;
static bool entropy_depletion = 0; /* Silly! */
static const struct sysctlnode *entropy_sysctlroot;
static struct sysctllog *entropy_sysctllog;
/* Forward declarations */
static void entropy_init_cpu(void *, void *, struct cpu_info *);
static void entropy_fini_cpu(void *, void *, struct cpu_info *);
static void entropy_account_cpu(struct entropy_cpu *);
static void entropy_enter(const void *, size_t, unsigned, bool);
static bool entropy_enter_intr(const void *, size_t, unsigned, bool);
static void entropy_softintr(void *);
static void entropy_thread(void *);
static bool entropy_pending(void);
static void entropy_pending_cpu(void *, void *, struct cpu_info *);
static void entropy_do_consolidate(void);
static void entropy_consolidate_xc(void *, void *);
static void entropy_notify(void);
static int sysctl_entropy_consolidate(SYSCTLFN_ARGS);
static int sysctl_entropy_gather(SYSCTLFN_ARGS);
static void filt_entropy_read_detach(struct knote *);
static int filt_entropy_read_event(struct knote *, long);
static int entropy_request(size_t, int);
static void rnd_add_data_internal(struct krndsource *, const void *,
uint32_t, uint32_t, bool);
static void rnd_add_data_1(struct krndsource *, const void *, uint32_t,
uint32_t, bool, uint32_t, bool);
static unsigned rndsource_entropybits(struct krndsource *);
static void rndsource_entropybits_cpu(void *, void *, struct cpu_info *);
static void rndsource_to_user(struct krndsource *, rndsource_t *);
static void rndsource_to_user_est(struct krndsource *, rndsource_est_t *);
static void rndsource_to_user_est_cpu(void *, void *, struct cpu_info *);
/*
* entropy_timer()
*
* Cycle counter, time counter, or anything that changes a wee bit
* unpredictably.
*/
static inline uint32_t
entropy_timer(void)
{
struct bintime bt;
uint32_t v;
/* If we have a CPU cycle counter, use the low 32 bits. */
#ifdef __HAVE_CPU_COUNTER
if (__predict_true(cpu_hascounter()))
return cpu_counter32();
#endif /* __HAVE_CPU_COUNTER */
/* If we're cold, tough. Can't binuptime while cold. */
if (__predict_false(cold))
return 0;
/* Fold the 128 bits of binuptime into 32 bits. */
binuptime(&bt);
v = bt.frac;
v ^= bt.frac >> 32;
v ^= bt.sec;
v ^= bt.sec >> 32;
return v;
}
static void
attach_seed_rndsource(void)
{
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
KASSERT(cold);
/*
* First called no later than entropy_init, while we are still
* single-threaded, so no need for RUN_ONCE.
*/
if (E->seed_rndsource)
return;
rnd_attach_source(&seed_rndsource, "seed", RND_TYPE_UNKNOWN,
RND_FLAG_COLLECT_VALUE);
E->seed_rndsource = true;
}
/*
* entropy_init()
*
* Initialize the entropy subsystem. Panic on failure.
*
* Requires percpu(9) and sysctl(9) to be initialized. Must run
* while cold.
*/
static void
entropy_init(void)
{
uint32_t extra[2];
struct krndsource *rs;
unsigned i = 0;
KASSERT(cold);
/* Grab some cycle counts early at boot. */
extra[i++] = entropy_timer();
/* Run the entropy pool cryptography self-test. */
if (entpool_selftest() == -1)
panic("entropy pool crypto self-test failed");
/* Create the sysctl directory. */
sysctl_createv(&entropy_sysctllog, 0, NULL, &entropy_sysctlroot,
CTLFLAG_PERMANENT, CTLTYPE_NODE, "entropy",
SYSCTL_DESCR("Entropy (random number sources) options"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
/* Create the sysctl knobs. */
/* XXX These shouldn't be writable at securelevel>0. */
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "collection",
SYSCTL_DESCR("Automatically collect entropy from hardware"),
NULL, 0, &entropy_collection, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "depletion",
SYSCTL_DESCR("`Deplete' entropy pool when observed"),
NULL, 0, &entropy_depletion, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "consolidate",
SYSCTL_DESCR("Trigger entropy consolidation now"),
sysctl_entropy_consolidate, 0, NULL, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "gather",
SYSCTL_DESCR("Trigger entropy gathering from sources now"),
sysctl_entropy_gather, 0, NULL, 0, CTL_CREATE, CTL_EOL);
/* XXX These should maybe not be readable at securelevel>0. */
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"needed",
SYSCTL_DESCR("Systemwide entropy deficit (bits of entropy)"),
NULL, 0, &E->bitsneeded, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"pending",
SYSCTL_DESCR("Number of bits of entropy pending on CPUs"),
NULL, 0, &E->bitspending, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"samplesneeded",
SYSCTL_DESCR("Systemwide entropy deficit (samples)"),
NULL, 0, &E->samplesneeded, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"samplespending",
SYSCTL_DESCR("Number of samples pending on CPUs"),
NULL, 0, &E->samplespending, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"epoch", SYSCTL_DESCR("Entropy epoch"),
NULL, 0, &E->epoch, 0, CTL_CREATE, CTL_EOL);
/* Initialize the global state for multithreaded operation. */
mutex_init(&E->lock, MUTEX_DEFAULT, IPL_SOFTSERIAL);
cv_init(&E->cv, "entropy");
selinit(&E->selq);
cv_init(&E->sourcelock_cv, "entsrclock");
/* Make sure the seed source is attached. */
attach_seed_rndsource();
/* Note if the bootloader didn't provide a seed. */
if (!E->seeded)
aprint_debug("entropy: no seed from bootloader\n");
/* Allocate the per-CPU records for all early entropy sources. */
LIST_FOREACH(rs, &E->sources, list)
rs->state = percpu_alloc(sizeof(struct rndsource_cpu));
/* Allocate and initialize the per-CPU state. */
entropy_percpu = percpu_create(sizeof(struct entropy_cpu),
entropy_init_cpu, entropy_fini_cpu, NULL);
/* Enter the boot cycle count to get started. */
extra[i++] = entropy_timer();
KASSERT(i == __arraycount(extra));
entropy_enter(extra, sizeof extra, /*nbits*/0, /*count*/false);
explicit_memset(extra, 0, sizeof extra);
}
/*
* entropy_init_late()
*
* Late initialization. Panic on failure.
*
* Requires CPUs to have been detected and LWPs to have started.
* Must run while cold.
*/
static void
entropy_init_late(void)
{
int error;
KASSERT(cold);
/*
* Establish the softint at the highest softint priority level.
* Must happen after CPU detection.
*/
entropy_sih = softint_establish(SOFTINT_SERIAL|SOFTINT_MPSAFE,
&entropy_softintr, NULL);
if (entropy_sih == NULL)
panic("unable to establish entropy softint");
/*
* Create the entropy housekeeping thread. Must happen after
* lwpinit.
*/
error = kthread_create(PRI_NONE, KTHREAD_MPSAFE|KTHREAD_TS, NULL,
entropy_thread, NULL, &entropy_lwp, "entbutler");
if (error)
panic("unable to create entropy housekeeping thread: %d",
error);
}
/*
* entropy_init_cpu(ptr, cookie, ci)
*
* percpu(9) constructor for per-CPU entropy pool.
*/
static void
entropy_init_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct entropy_cpu *ec = ptr;
const char *cpuname;
ec->ec_evcnt = kmem_alloc(sizeof(*ec->ec_evcnt), KM_SLEEP);
ec->ec_pool = kmem_zalloc(sizeof(*ec->ec_pool), KM_SLEEP);
ec->ec_bitspending = 0;
ec->ec_samplespending = 0;
ec->ec_locked = false;
/* XXX ci_cpuname may not be initialized early enough. */
cpuname = ci->ci_cpuname[0] == '\0' ? "cpu0" : ci->ci_cpuname;
evcnt_attach_dynamic(&ec->ec_evcnt->softint, EVCNT_TYPE_MISC, NULL,
cpuname, "entropy softint");
evcnt_attach_dynamic(&ec->ec_evcnt->intrdrop, EVCNT_TYPE_MISC, NULL,
cpuname, "entropy intrdrop");
evcnt_attach_dynamic(&ec->ec_evcnt->intrtrunc, EVCNT_TYPE_MISC, NULL,
cpuname, "entropy intrtrunc");
}
/*
* entropy_fini_cpu(ptr, cookie, ci)
*
* percpu(9) destructor for per-CPU entropy pool.
*/
static void
entropy_fini_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct entropy_cpu *ec = ptr;
/*
* Zero any lingering data. Disclosure of the per-CPU pool
* shouldn't retroactively affect the security of any keys
* generated, because entpool(9) erases whatever we have just
* drawn out of any pool, but better safe than sorry.
*/
explicit_memset(ec->ec_pool, 0, sizeof(*ec->ec_pool));
evcnt_detach(&ec->ec_evcnt->intrtrunc);
evcnt_detach(&ec->ec_evcnt->intrdrop);
evcnt_detach(&ec->ec_evcnt->softint);
kmem_free(ec->ec_pool, sizeof(*ec->ec_pool));
kmem_free(ec->ec_evcnt, sizeof(*ec->ec_evcnt));
}
/*
* ec = entropy_cpu_get(&lock)
* entropy_cpu_put(&lock, ec)
*
* Lock and unlock the per-CPU entropy state. This only prevents
* access on the same CPU -- by hard interrupts, by soft
* interrupts, or by other threads.
*
* Blocks soft interrupts and preemption altogether; doesn't block
* hard interrupts, but causes samples in hard interrupts to be
* dropped.
*/
static struct entropy_cpu *
entropy_cpu_get(struct entropy_cpu_lock *lock)
{
struct entropy_cpu *ec;
ec = percpu_getref(entropy_percpu);
lock->ecl_s = splsoftserial();
KASSERT(!ec->ec_locked);
ec->ec_locked = true;
lock->ecl_pctr = lwp_pctr();
__insn_barrier();
return ec;
}
static void
entropy_cpu_put(struct entropy_cpu_lock *lock, struct entropy_cpu *ec)
{

	KASSERT(ec == percpu_getptr_remote(entropy_percpu, curcpu()));
	KASSERT(ec->ec_locked);
__insn_barrier();
KASSERT(lock->ecl_pctr == lwp_pctr());
ec->ec_locked = false;
splx(lock->ecl_s);
percpu_putref(entropy_percpu);
}
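/*
 * Usage sketch for the pair above; the buffer and length are
 * illustrative only.  The caller must not block between get and put
 * (lwp_pctr() is asserted unchanged):
 *
 *	struct entropy_cpu_lock lock;
 *	struct entropy_cpu *ec;
 *
 *	ec = entropy_cpu_get(&lock);
 *	entpool_enter(ec->ec_pool, buf, len);
 *	entropy_cpu_put(&lock, ec);
 */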
/*
* entropy_seed(seed)
*
* Seed the entropy pool with seed. Meant to be called as early
* as possible by the bootloader; may be called before or after
* entropy_init. Must be called before system reaches userland.
* Must be called in thread or soft interrupt context, not in hard
* interrupt context. Must be called at most once.
*
* Overwrites the seed in place. Caller may then free the memory.
*/
static void
entropy_seed(rndsave_t *seed)
{
SHA1_CTX ctx;
uint8_t digest[SHA1_DIGEST_LENGTH];
bool seeded;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
KASSERT(cold);
/*
* Verify the checksum. If the checksum fails, take the data
* but ignore the entropy estimate -- the file may have been
* incompletely written with garbage, which is harmless to add
* but may not be as unpredictable as alleged.
*/
SHA1Init(&ctx);
SHA1Update(&ctx, (const void *)&seed->entropy, sizeof(seed->entropy));
SHA1Update(&ctx, seed->data, sizeof(seed->data));
SHA1Final(digest, &ctx);
CTASSERT(sizeof(seed->digest) == sizeof(digest));
if (!consttime_memequal(digest, seed->digest, sizeof(digest))) {
printf("entropy: invalid seed checksum\n");
seed->entropy = 0;
}
explicit_memset(&ctx, 0, sizeof ctx);
explicit_memset(digest, 0, sizeof digest);
/*
* If the entropy is insensibly large, try byte-swapping.
* Otherwise assume the file is corrupted and act as though it
* has zero entropy.
*/
if (howmany(seed->entropy, NBBY) > sizeof(seed->data)) {
seed->entropy = bswap32(seed->entropy);
if (howmany(seed->entropy, NBBY) > sizeof(seed->data))
seed->entropy = 0;
}
/* Make sure the seed source is attached. */
attach_seed_rndsource();
/* Test and set E->seeded. */
seeded = E->seeded;
E->seeded = (seed->entropy > 0);
/*
 * If we've already been seeded, we may be re-entering the same seed
* (e.g., bootloader vs module init, or something). No harm in
* entering it twice, but it contributes no additional entropy.
*/
if (seeded) {
printf("entropy: double-seeded by bootloader\n");
seed->entropy = 0;
} else {
printf("entropy: entering seed from bootloader"
" with %u bits of entropy\n", (unsigned)seed->entropy);
}
/* Enter it into the pool and promptly zero it. */
rnd_add_data(&seed_rndsource, seed->data, sizeof(seed->data),
seed->entropy);
explicit_memset(seed, 0, sizeof(*seed));
}
/*
* entropy_bootrequest()
*
* Request entropy from all sources at boot, once config is
* complete and interrupts are running but we are still cold.
*/
void
entropy_bootrequest(void)
{
int error;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
KASSERT(cold);
/*
* Request enough to satisfy the maximum entropy shortage.
* This is harmless overkill if the bootloader provided a seed.
*/
error = entropy_request(MINENTROPYBYTES, ENTROPY_WAIT);
KASSERTMSG(error == 0, "error=%d", error);
}
/*
* entropy_epoch()
*
* Returns the current entropy epoch. If this changes, you should
 * reseed.  A value of -1 means the system has not yet reached full
 * entropy or been explicitly consolidated, and it never reverts to
 * -1 after that.  Never zero, so you can always use zero as an uninitialized
* sentinel value meaning `reseed ASAP'.
*
* Usage model:
*
* struct foo {
* struct crypto_prng prng;
* unsigned epoch;
* } *foo;
*
* unsigned epoch = entropy_epoch();
* if (__predict_false(epoch != foo->epoch)) {
* uint8_t seed[32];
* if (entropy_extract(seed, sizeof seed, 0) != 0)
* warn("no entropy");
* crypto_prng_reseed(&foo->prng, seed, sizeof seed);
* foo->epoch = epoch;
* }
*/
unsigned
entropy_epoch(void)
{
/*
* Unsigned int, so no need for seqlock for an atomic read, but
* make sure we read it afresh each time.
*/
return atomic_load_relaxed(&E->epoch);
}
/*
* entropy_ready()
*
* True if the entropy pool has full entropy.
*/
bool
entropy_ready(void)
{
return atomic_load_relaxed(&E->bitsneeded) == 0;
}
/*
* entropy_account_cpu(ec)
*
* Consider whether to consolidate entropy into the global pool
* after we just added some into the current CPU's pending pool.
*
* - If this CPU can provide enough entropy now, do so.
*
* - If this and whatever else is available on other CPUs can
* provide enough entropy, kick the consolidation thread.
*
* - Otherwise, do as little as possible, except maybe consolidate
* entropy at most once a minute.
*
* Caller must be bound to a CPU and therefore have exclusive
* access to ec. Will acquire and release the global lock.
*/
static void
entropy_account_cpu(struct entropy_cpu *ec)
{
struct entropy_cpu_lock lock;
struct entropy_cpu *ec0;
unsigned bitsdiff, samplesdiff;
	KASSERT(!cpu_intr_p());
	KASSERT(!cold);
	KASSERT(curlwp->l_pflag & LP_BOUND);
/*
* If there's no entropy needed, and entropy has been
* consolidated in the last minute, do nothing.
*/
	if (__predict_true(atomic_load_relaxed(&E->bitsneeded) == 0) &&
	    __predict_true(!atomic_load_relaxed(&entropy_depletion)) &&
__predict_true((time_uptime - E->timestamp) <= 60))
return;
/*
* Consider consolidation, under the global lock and with the
* per-CPU state locked.
*/
mutex_enter(&E->lock);
	ec0 = entropy_cpu_get(&lock);
	KASSERT(ec0 == ec);

	if (ec->ec_bitspending == 0 && ec->ec_samplespending == 0) {
/* Raced with consolidation xcall. Nothing to do. */
} else if (E->bitsneeded != 0 && E->bitsneeded <= ec->ec_bitspending) {
/*
* If we have not yet attained full entropy but we can
* now, do so. This way we disseminate entropy
* promptly when it becomes available early at boot;
* otherwise we leave it to the entropy consolidation
* thread, which is rate-limited to mitigate side
* channels and abuse.
*/
uint8_t buf[ENTPOOL_CAPACITY];
/* Transfer from the local pool to the global pool. */
entpool_extract(ec->ec_pool, buf, sizeof buf);
entpool_enter(&E->pool, buf, sizeof buf);
		atomic_store_relaxed(&ec->ec_bitspending, 0);
		atomic_store_relaxed(&ec->ec_samplespending, 0);
atomic_store_relaxed(&E->bitsneeded, 0);
atomic_store_relaxed(&E->samplesneeded, 0);
/* Notify waiters that we now have full entropy. */
entropy_notify();
entropy_immediate_evcnt.ev_count++;
} else {
/* Determine how much we can add to the global pool. */
KASSERTMSG(E->bitspending <= MINENTROPYBITS,
"E->bitspending=%u", E->bitspending);
bitsdiff = MIN(ec->ec_bitspending,
MINENTROPYBITS - E->bitspending);
KASSERTMSG(E->samplespending <= MINSAMPLES,
"E->samplespending=%u", E->samplespending);
samplesdiff = MIN(ec->ec_samplespending,
MINSAMPLES - E->samplespending);
/*
* This should make a difference unless we are already
* saturated.
*/
KASSERTMSG((bitsdiff || samplesdiff ||
E->bitspending == MINENTROPYBITS ||
E->samplespending == MINSAMPLES),
"bitsdiff=%u E->bitspending=%u ec->ec_bitspending=%u"
"samplesdiff=%u E->samplespending=%u"
" ec->ec_samplespending=%u"
" minentropybits=%u minsamples=%u",
bitsdiff, E->bitspending, ec->ec_bitspending,
samplesdiff, E->samplespending, ec->ec_samplespending,
(unsigned)MINENTROPYBITS, (unsigned)MINSAMPLES);
/* Add to the global, subtract from the local. */
E->bitspending += bitsdiff;
KASSERTMSG(E->bitspending <= MINENTROPYBITS,
"E->bitspending=%u", E->bitspending);
atomic_store_relaxed(&ec->ec_bitspending,
ec->ec_bitspending - bitsdiff);
E->samplespending += samplesdiff;
KASSERTMSG(E->samplespending <= MINSAMPLES,
"E->samplespending=%u", E->samplespending);
atomic_store_relaxed(&ec->ec_samplespending,
ec->ec_samplespending - samplesdiff);
/* One or the other must have gone up from zero. */
		KASSERT(E->bitspending || E->samplespending);

		if (E->bitsneeded <= E->bitspending ||
E->samplesneeded <= E->samplespending) {
/*
* Enough bits or at least samples between all
* the per-CPU pools. Leave a note for the
* housekeeping thread to consolidate entropy
* next time it wakes up -- and wake it up if
* this is the first time, to speed things up.
*
* If we don't need any entropy, this doesn't
* mean much, but it is the only time we ever
* gather additional entropy in case the
* accounting has been overly optimistic. This
* happens at most once a minute, so there's
* negligible performance cost.
*/
E->consolidate = true;
			if (E->epoch == (unsigned)-1)
				cv_broadcast(&E->cv);
			if (E->bitsneeded == 0)
				entropy_discretionary_evcnt.ev_count++;
} else {
/* Can't get full entropy. Keep gathering. */
entropy_partial_evcnt.ev_count++;
}
}
entropy_cpu_put(&lock, ec);
mutex_exit(&E->lock);
}
/*
* entropy_enter_early(buf, len, nbits)
*
* Do entropy bookkeeping globally, before we have established
* per-CPU pools. Enter directly into the global pool in the hope
* that we enter enough before the first entropy_extract to thwart
* iterative-guessing attacks; entropy_extract will warn if not.
*/
static void
entropy_enter_early(const void *buf, size_t len, unsigned nbits)
{
bool notify = false;
int s;
KASSERT(cold);
/*
* We're early at boot before multithreading and multi-CPU
* operation, and we don't have softints yet to defer
* processing from interrupt context, so we have to enter the
* samples directly into the global pool. But interrupts may
* be enabled, and we enter this path from interrupt context,
* so block interrupts until we're done.
*/
s = splhigh();
/* Enter it into the pool. */
entpool_enter(&E->pool, buf, len);
/*
* Decide whether to notify reseed -- we will do so if either:
* (a) we transition from partial entropy to full entropy, or
* (b) we get a batch of full entropy all at once.
* We don't count timing samples because we assume, while cold,
* there's not likely to be much jitter yet.
*/
notify |= (E->bitsneeded && E->bitsneeded <= nbits);
notify |= (nbits >= MINENTROPYBITS);
/*
* Subtract from the needed count and notify if appropriate.
* We don't count samples here because entropy_timer might
* still be returning zero at this point if there's no CPU
* cycle counter.
*/
E->bitsneeded -= MIN(E->bitsneeded, nbits);
if (notify) {
entropy_notify();
entropy_immediate_evcnt.ev_count++;
}
splx(s);
}
/*
* entropy_enter(buf, len, nbits, count)
*
* Enter len bytes of data from buf into the system's entropy
* pool, stirring as necessary when the internal buffer fills up.
* nbits is a lower bound on the number of bits of entropy in the
* process that led to this sample.
*/
static void
entropy_enter(const void *buf, size_t len, unsigned nbits, bool count)
{
struct entropy_cpu_lock lock;
struct entropy_cpu *ec;
unsigned bitspending, samplespending;
int bound;
KASSERTMSG(!cpu_intr_p(),
"use entropy_enter_intr from interrupt context");
KASSERTMSG(howmany(nbits, NBBY) <= len,
"impossible entropy rate: %u bits in %zu-byte string", nbits, len);
/*
* If we're still cold, just use entropy_enter_early to put
* samples directly into the global pool.
*/
if (__predict_false(cold)) {
entropy_enter_early(buf, len, nbits);
return;
}
/*
* Bind ourselves to the current CPU so we don't switch CPUs
* between entering data into the current CPU's pool (and
* updating the pending count) and transferring it to the
* global pool in entropy_account_cpu.
*/
bound = curlwp_bind();
/*
* With the per-CPU state locked, enter into the per-CPU pool
* and count up what we can add.
*
* We don't count samples while cold because entropy_timer
* might still be returning zero if there's no CPU cycle
* counter.
*/
ec = entropy_cpu_get(&lock);
entpool_enter(ec->ec_pool, buf, len);
bitspending = ec->ec_bitspending;
bitspending += MIN(MINENTROPYBITS - bitspending, nbits);
atomic_store_relaxed(&ec->ec_bitspending, bitspending);
samplespending = ec->ec_samplespending;
if (__predict_true(count)) {
samplespending += MIN(MINSAMPLES - samplespending, 1);
atomic_store_relaxed(&ec->ec_samplespending, samplespending);
}
entropy_cpu_put(&lock, ec);
/* Consolidate globally if appropriate based on what we added. */
	if (bitspending > 0 || samplespending >= MINSAMPLES)
		entropy_account_cpu(ec);

	curlwp_bindx(bound);
}
/*
* entropy_enter_intr(buf, len, nbits, count)
*
* Enter up to len bytes of data from buf into the system's
* entropy pool without stirring. nbits is a lower bound on the
* number of bits of entropy in the process that led to this
* sample. If the sample could be entered completely, assume
* nbits of entropy pending; otherwise assume none, since we don't
* know whether some parts of the sample are constant, for
* instance. Schedule a softint to stir the entropy pool if
* needed. Return true if used fully, false if truncated at all.
*
* Using this in thread or softint context with no spin locks held
* will work, but you might as well use entropy_enter in that
* case.
*/
static bool
entropy_enter_intr(const void *buf, size_t len, unsigned nbits, bool count)
{
struct entropy_cpu *ec;
bool fullyused = false;
uint32_t bitspending, samplespending;
int s;
KASSERTMSG(howmany(nbits, NBBY) <= len,
"impossible entropy rate: %u bits in %zu-byte string", nbits, len);
/*
* If we're still cold, just use entropy_enter_early to put
* samples directly into the global pool.
*/
if (__predict_false(cold)) {
entropy_enter_early(buf, len, nbits);
return true;
}
/*
* In case we were called in thread or interrupt context with
* interrupts unblocked, block soft interrupts up to
* IPL_SOFTSERIAL. This way logic that is safe in interrupt
* context or under a spin lock is also safe in less
* restrictive contexts.
*/
s = splsoftserial();
/*
* Acquire the per-CPU state. If someone is in the middle of
* using it, drop the sample. Otherwise, take the lock so that
* higher-priority interrupts will drop their samples.
*/
ec = percpu_getref(entropy_percpu);
if (ec->ec_locked) {
ec->ec_evcnt->intrdrop.ev_count++;
goto out0;
}
ec->ec_locked = true;
__insn_barrier();
/*
* Enter as much as we can into the per-CPU pool. If it was
* truncated, schedule a softint to stir the pool and stop.
*/
if (!entpool_enter_nostir(ec->ec_pool, buf, len)) {
		if (__predict_true(!cold))
			softint_schedule(entropy_sih);
ec->ec_evcnt->intrtrunc.ev_count++;
goto out1;
}
fullyused = true;
/*
* Count up what we can contribute.
*
* We don't count samples while cold because entropy_timer
* might still be returning zero if there's no CPU cycle
* counter.
*/
bitspending = ec->ec_bitspending;
bitspending += MIN(MINENTROPYBITS - bitspending, nbits);
	atomic_store_relaxed(&ec->ec_bitspending, bitspending);
	if (__predict_true(count)) {
samplespending = ec->ec_samplespending;
samplespending += MIN(MINSAMPLES - samplespending, 1);
atomic_store_relaxed(&ec->ec_samplespending, samplespending);
}
/* Schedule a softint if we added anything and it matters. */
	if (__predict_false(atomic_load_relaxed(&E->bitsneeded) ||
		atomic_load_relaxed(&entropy_depletion)) &&
	    (nbits != 0 || count) &&
__predict_true(!cold))
softint_schedule(entropy_sih);
out1: /* Release the per-CPU state. */
KASSERT(ec->ec_locked);
__insn_barrier();
ec->ec_locked = false;
out0: percpu_putref(entropy_percpu);
splx(s);
return fullyused;
}
/*
* entropy_softintr(cookie)
*
* Soft interrupt handler for entering entropy. Takes care of
* stirring the local CPU's entropy pool if it filled up during
* hard interrupts, and promptly crediting entropy from the local
* CPU's entropy pool to the global entropy pool if needed.
*/
static void
entropy_softintr(void *cookie)
{
struct entropy_cpu_lock lock;
struct entropy_cpu *ec;
unsigned bitspending, samplespending;
/*
* With the per-CPU state locked, stir the pool if necessary
* and determine if there's any pending entropy on this CPU to
* account globally.
*/
ec = entropy_cpu_get(&lock);
ec->ec_evcnt->softint.ev_count++;
entpool_stir(ec->ec_pool);
bitspending = ec->ec_bitspending;
samplespending = ec->ec_samplespending;
entropy_cpu_put(&lock, ec);
/* Consolidate globally if appropriate based on what we added. */
if (bitspending > 0 || samplespending >= MINSAMPLES)
entropy_account_cpu(ec);
}
/*
* entropy_thread(cookie)
*
* Handle any asynchronous entropy housekeeping.
*/
static void
entropy_thread(void *cookie)
{
bool consolidate;
#ifndef _RUMPKERNEL /* XXX rump starts threads before cold */
KASSERT(!cold);
#endif
for (;;) {
/*
* Wait until there's full entropy somewhere among the
* CPUs, as confirmed at most once per minute, or
* someone wants to consolidate.
*/
if (entropy_pending()) {
consolidate = true;
} else {
mutex_enter(&E->lock);
if (!E->consolidate)
cv_timedwait(&E->cv, &E->lock, 60*hz);
consolidate = E->consolidate;
E->consolidate = false;
mutex_exit(&E->lock);
}
if (consolidate) {
/* Do it. */
entropy_do_consolidate();
/* Mitigate abuse. */
kpause("entropy", false, hz, NULL);
}
}
}
struct entropy_pending_count {
uint32_t bitspending;
uint32_t samplespending;
};
/*
* entropy_pending()
*
* True if enough bits or samples are pending on other CPUs to
* warrant consolidation.
*/
static bool
entropy_pending(void)
{
struct entropy_pending_count count = { 0, 0 }, *C = &count;
percpu_foreach(entropy_percpu, &entropy_pending_cpu, C);
return C->bitspending >= MINENTROPYBITS ||
C->samplespending >= MINSAMPLES;
}
static void
entropy_pending_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct entropy_cpu *ec = ptr;
struct entropy_pending_count *C = cookie;
uint32_t cpu_bitspending;
uint32_t cpu_samplespending;
cpu_bitspending = atomic_load_relaxed(&ec->ec_bitspending);
cpu_samplespending = atomic_load_relaxed(&ec->ec_samplespending);
C->bitspending += MIN(MINENTROPYBITS - C->bitspending,
cpu_bitspending);
C->samplespending += MIN(MINSAMPLES - C->samplespending,
cpu_samplespending);
}
/*
* entropy_do_consolidate()
*
* Issue a cross-call to gather entropy on all CPUs and advance
* the entropy epoch.
*/
static void
entropy_do_consolidate(void)
{
static const struct timeval interval = {.tv_sec = 60, .tv_usec = 0};
static struct timeval lasttime; /* serialized by E->lock */
struct entpool pool;
uint8_t buf[ENTPOOL_CAPACITY];
unsigned bitsdiff, samplesdiff;
uint64_t ticket;
KASSERT(!cold);
ASSERT_SLEEPABLE();
/* Gather entropy on all CPUs into a temporary pool. */
memset(&pool, 0, sizeof pool);
ticket = xc_broadcast(0, &entropy_consolidate_xc, &pool, NULL);
xc_wait(ticket);
/* Acquire the lock to notify waiters. */
mutex_enter(&E->lock);
/* Count another consolidation. */
entropy_consolidate_evcnt.ev_count++;
/* Note when we last consolidated, i.e. now. */
E->timestamp = time_uptime;
/* Mix what we gathered into the global pool. */
entpool_extract(&pool, buf, sizeof buf);
entpool_enter(&E->pool, buf, sizeof buf);
explicit_memset(&pool, 0, sizeof pool);
/* Count the entropy that was gathered. */
bitsdiff = MIN(E->bitsneeded, E->bitspending);
atomic_store_relaxed(&E->bitsneeded, E->bitsneeded - bitsdiff);
E->bitspending -= bitsdiff;
if (__predict_false(E->bitsneeded > 0) && bitsdiff != 0) {
if ((boothowto & AB_DEBUG) != 0 &&
ratecheck(&lasttime, &interval)) {
printf("WARNING:"
" consolidating less than full entropy\n");
}
}
samplesdiff = MIN(E->samplesneeded, E->samplespending);
atomic_store_relaxed(&E->samplesneeded,
E->samplesneeded - samplesdiff);
E->samplespending -= samplesdiff;
/* Advance the epoch and notify waiters. */
entropy_notify();
/* Release the lock. */
mutex_exit(&E->lock);
}
/*
* entropy_consolidate_xc(vpool, arg2)
*
* Extract output from the local CPU's input pool and enter it
* into a temporary pool passed as vpool.
*/
static void
entropy_consolidate_xc(void *vpool, void *arg2 __unused)
{
struct entpool *pool = vpool;
struct entropy_cpu_lock lock;
struct entropy_cpu *ec;
uint8_t buf[ENTPOOL_CAPACITY];
uint32_t extra[7];
unsigned i = 0;
/* Grab CPU number and cycle counter to mix extra into the pool. */
extra[i++] = cpu_number();
extra[i++] = entropy_timer();
/*
* With the per-CPU state locked, extract from the per-CPU pool
* and count it as no longer pending.
*/
ec = entropy_cpu_get(&lock);
extra[i++] = entropy_timer();
entpool_extract(ec->ec_pool, buf, sizeof buf);
atomic_store_relaxed(&ec->ec_bitspending, 0);
atomic_store_relaxed(&ec->ec_samplespending, 0);
extra[i++] = entropy_timer();
entropy_cpu_put(&lock, ec);
extra[i++] = entropy_timer();
/*
* Copy over statistics, and enter the per-CPU extract and the
* extra timing into the temporary pool, under the global lock.
*/
mutex_enter(&E->lock);
extra[i++] = entropy_timer();
entpool_enter(pool, buf, sizeof buf);
explicit_memset(buf, 0, sizeof buf);
extra[i++] = entropy_timer();
KASSERT(i == __arraycount(extra));
entpool_enter(pool, extra, sizeof extra);
explicit_memset(extra, 0, sizeof extra);
mutex_exit(&E->lock);
}
/*
* entropy_notify()
*
* Caller just contributed entropy to the global pool. Advance
* the entropy epoch and notify waiters.
*
* Caller must hold the global entropy lock.
*/
static void
entropy_notify(void)
{
static const struct timeval interval = {.tv_sec = 60, .tv_usec = 0};
static struct timeval lasttime; /* serialized by E->lock */
static bool ready = false, besteffort = false;
unsigned epoch;
KASSERT(__predict_false(cold) || mutex_owned(&E->lock));
/*
* If this is the first time, print a message to the console
* that we're ready so operators can compare it to the timing
* of other events.
*
* If we didn't get full entropy from reliable sources, report
* instead that we are running on fumes with best effort. (If
* we ever do get full entropy after that, print the ready
* message once.)
*/
if (__predict_false(!ready)) {
if (E->bitsneeded == 0) {
printf("entropy: ready\n");
ready = true;
} else if (E->samplesneeded == 0 && !besteffort) {
printf("entropy: best effort\n");
besteffort = true;
}
}
/* Set the epoch; roll over from UINTMAX-1 to 1. */
if (__predict_true(!atomic_load_relaxed(&entropy_depletion)) ||
ratecheck(&lasttime, &interval)) {
epoch = E->epoch + 1;
if (epoch == 0 || epoch == (unsigned)-1)
epoch = 1;
atomic_store_relaxed(&E->epoch, epoch);
}
KASSERT(E->epoch != (unsigned)-1);
/* Notify waiters. */
if (__predict_true(!cold)) {
cv_broadcast(&E->cv);
selnotify(&E->selq, POLLIN|POLLRDNORM, NOTE_SUBMIT);
}
/* Count another notification. */
entropy_notify_evcnt.ev_count++;
}
/*
* entropy_consolidate()
*
* Trigger entropy consolidation and wait for it to complete.
*
* This should be used sparingly, not periodically -- requiring
* conscious intervention by the operator or a clear policy
* decision. Otherwise, the kernel will automatically consolidate
* when enough entropy has been gathered into per-CPU pools to
* transition to full entropy.
*/
void
entropy_consolidate(void)
{
uint64_t ticket;
int error;
KASSERT(!cold);
ASSERT_SLEEPABLE();
mutex_enter(&E->lock);
ticket = entropy_consolidate_evcnt.ev_count;
E->consolidate = true;
cv_broadcast(&E->cv);
while (ticket == entropy_consolidate_evcnt.ev_count) {
error = cv_wait_sig(&E->cv, &E->lock);
if (error)
break;
}
mutex_exit(&E->lock);
}
/*
* sysctl -w kern.entropy.consolidate=1
*
* Trigger entropy consolidation and wait for it to complete.
* Writable only by superuser. This, writing to /dev/random, and
* ioctl(RNDADDDATA) are the only ways for the system to
* consolidate entropy if the operator knows something the kernel
* doesn't about how unpredictable the pending entropy pools are.
*/
static int
sysctl_entropy_consolidate(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
int arg = 0;
int error;
node.sysctl_data = &arg;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (arg)
entropy_consolidate();
return error;
}
/*
* sysctl -w kern.entropy.gather=1
*
* Trigger gathering entropy from all on-demand sources, and wait
* for synchronous sources (but not asynchronous sources) to
* complete. Writable only by superuser.
*/
static int
sysctl_entropy_gather(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
int arg = 0;
int error;
node.sysctl_data = &arg;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (arg) {
mutex_enter(&E->lock);
error = entropy_request(ENTROPY_CAPACITY,
ENTROPY_WAIT|ENTROPY_SIG);
mutex_exit(&E->lock);
}
return 0;
}
/*
* entropy_extract(buf, len, flags)
*
* Extract len bytes from the global entropy pool into buf.
*
* Caller MUST NOT expose these bytes directly -- must use them
* ONLY to seed a cryptographic pseudorandom number generator
* (`CPRNG'), a.k.a. deterministic random bit generator (`DRBG'),
* and then erase them. entropy_extract does not, on its own,
* provide backtracking resistance -- it must be combined with a
* PRNG/DRBG that does.
*
* This may be used very early at boot, before even entropy_init
* has been called.
*
* You generally shouldn't use this directly -- use cprng(9)
* instead.
*
* Flags may have:
*
* ENTROPY_WAIT Wait for entropy if not available yet.
* ENTROPY_SIG Allow interruption by a signal during wait.
* ENTROPY_HARDFAIL Either fill the buffer with full entropy,
* or fail without filling it at all.
*
* Return zero on success, or error on failure:
*
* EWOULDBLOCK No entropy and ENTROPY_WAIT not set.
* EINTR/ERESTART No entropy, ENTROPY_SIG set, and interrupted.
*
* If ENTROPY_WAIT is set, allowed only in thread context. If
* ENTROPY_WAIT is not set, allowed also in softint context -- may
* sleep on an adaptive lock up to IPL_SOFTSERIAL. Forbidden in
* hard interrupt context.
*/
int
entropy_extract(void *buf, size_t len, int flags)
{
static const struct timeval interval = {.tv_sec = 60, .tv_usec = 0};
static struct timeval lasttime; /* serialized by E->lock */
bool printed = false;
int s = -1/*XXXGCC*/, error;
if (ISSET(flags, ENTROPY_WAIT)) {
ASSERT_SLEEPABLE();
KASSERT(!cold);
}
/* Refuse to operate in interrupt context. */
KASSERT(!cpu_intr_p());
/*
* If we're cold, we are only contending with interrupts on the
* current CPU, so block them. Otherwise, we are _not_
* contending with interrupts on the current CPU, but we are
* contending with other threads, to exclude them with a mutex.
*/
if (__predict_false(cold))
s = splhigh();
else
mutex_enter(&E->lock);
/* Wait until there is enough entropy in the system. */
error = 0;
if (E->bitsneeded > 0 && E->samplesneeded == 0) {
/*
* We don't have full entropy from reliable sources,
* but we gathered a plausible number of samples from
* other sources such as timers. Try asking for more
* from any sources we can, but don't worry if it
* fails -- best effort.
*/
(void)entropy_request(ENTROPY_CAPACITY, flags);
} else while (E->bitsneeded > 0 && E->samplesneeded > 0) {
/* Ask for more, synchronously if possible. */
error = entropy_request(len, flags);
if (error)
break;
/* If we got enough, we're done. */
if (E->bitsneeded == 0 || E->samplesneeded == 0) {
KASSERT(error == 0);
break;
}
/* If not waiting, stop here. */
if (!ISSET(flags, ENTROPY_WAIT)) {
error = EWOULDBLOCK;
break;
}
/* Wait for some entropy to come in and try again. */
KASSERT(!cold);
if (!printed) {
printf("entropy: pid %d (%s) waiting for entropy(7)\n",
curproc->p_pid, curproc->p_comm);
printed = true;
}
if (ISSET(flags, ENTROPY_SIG)) {
error = cv_timedwait_sig(&E->cv, &E->lock, hz);
if (error && error != EWOULDBLOCK)
break;
} else {
cv_timedwait(&E->cv, &E->lock, hz);
}
}
/*
* Count failure -- but fill the buffer nevertheless, unless
* the caller specified ENTROPY_HARDFAIL.
*/
if (error) {
if (ISSET(flags, ENTROPY_HARDFAIL))
goto out;
entropy_extract_fail_evcnt.ev_count++;
}
/*
* Report a warning if we haven't yet reached full entropy.
* This is the only case where we consider entropy to be
* `depleted' without kern.entropy.depletion enabled -- when we
* only have partial entropy, an adversary may be able to
* narrow the state of the pool down to a small number of
* possibilities; the output then enables them to confirm a
* guess, reducing its entropy from the adversary's perspective
* to zero.
*
* This should only happen if the operator has chosen to
* consolidate, either through sysctl kern.entropy.consolidate
* or by writing less than full entropy to /dev/random as root
* (which /dev/random promises will immediately affect
* subsequent output, for better or worse).
*/
if (E->bitsneeded > 0 && E->samplesneeded > 0) {
if (__predict_false(E->epoch == (unsigned)-1) &&
ratecheck(&lasttime, &interval)) {
printf("WARNING:"
" system needs entropy for security;"
" see entropy(7)\n");
}
atomic_store_relaxed(&E->bitsneeded, MINENTROPYBITS);
atomic_store_relaxed(&E->samplesneeded, MINSAMPLES);
}
/* Extract data from the pool, and `deplete' if we're doing that. */
entpool_extract(&E->pool, buf, len);
if (__predict_false(atomic_load_relaxed(&entropy_depletion)) &&
error == 0) {
unsigned cost = MIN(len, ENTROPY_CAPACITY)*NBBY;
unsigned bitsneeded = E->bitsneeded;
unsigned samplesneeded = E->samplesneeded;
bitsneeded += MIN(MINENTROPYBITS - bitsneeded, cost);
samplesneeded += MIN(MINSAMPLES - samplesneeded, cost);
atomic_store_relaxed(&E->bitsneeded, bitsneeded);
atomic_store_relaxed(&E->samplesneeded, samplesneeded);
entropy_deplete_evcnt.ev_count++;
}
out: /* Release the global lock and return the error. */
if (__predict_false(cold))
splx(s);
else
mutex_exit(&E->lock);
return error;
}
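/*
* Illustrative sketch, not part of the original source: a typical
* caller seeds a DRBG key from entropy_extract and promptly erases
* the seed. The names example_drbg, example_drbg_reseed, and
* example_seed_drbg are hypothetical stand-ins for a real CPRNG.
*/
#if 0 /* example only -- not compiled */
static int
example_seed_drbg(struct example_drbg *drbg)
{
uint8_t seed[32]; /* 256-bit seed */
int error;
/* With ENTROPY_WAIT|ENTROPY_SIG this may sleep; EINTR/ERESTART on signal. */
error = entropy_extract(seed, sizeof seed, ENTROPY_WAIT|ENTROPY_SIG);
if (error)
return error;
/* Use the bytes only to key the DRBG, then erase them. */
example_drbg_reseed(drbg, seed, sizeof seed);
explicit_memset(seed, 0, sizeof seed);
return 0;
}
#endif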
/*
* entropy_poll(events)
*
* Return the subset of events ready, and if it is not all of
* events, record curlwp as waiting for entropy.
*/
int
entropy_poll(int events)
{
int revents = 0;
KASSERT(!cold);
/* Always ready for writing. */
revents |= events & (POLLOUT|POLLWRNORM);
/* Narrow it down to reads. */
events &= POLLIN|POLLRDNORM;
if (events == 0)
return revents;
/*
* If we have reached full entropy and we're not depleting
* entropy, we are forever ready.
*/
if (__predict_true(atomic_load_relaxed(&E->bitsneeded) == 0 ||
atomic_load_relaxed(&E->samplesneeded) == 0) &&
__predict_true(!atomic_load_relaxed(&entropy_depletion)))
return revents | events;
/*
* Otherwise, check whether we need entropy under the lock. If
* we don't, we're ready; if we do, add ourselves to the queue.
*/
mutex_enter(&E->lock);
if (E->bitsneeded == 0 || E->samplesneeded == 0)
revents |= events;
else
selrecord(curlwp, &E->selq);
mutex_exit(&E->lock);
return revents;
}
/*
* filt_entropy_read_detach(kn)
*
* struct filterops::f_detach callback for entropy read events:
* remove kn from the list of waiters.
*/
static void
filt_entropy_read_detach(struct knote *kn)
{
KASSERT(!cold);
mutex_enter(&E->lock);
selremove_knote(&E->selq, kn);
mutex_exit(&E->lock);
}
/*
* filt_entropy_read_event(kn, hint)
*
* struct filterops::f_event callback for entropy read events:
* poll for entropy. Caller must hold the global entropy lock if
* hint is NOTE_SUBMIT, and must not if hint is not NOTE_SUBMIT.
*/
static int
filt_entropy_read_event(struct knote *kn, long hint)
{
int ret;
KASSERT(!cold);
/* Acquire the lock, if caller is outside entropy subsystem. */
if (hint == NOTE_SUBMIT)
KASSERT(mutex_owned(&E->lock));
else
mutex_enter(&E->lock);
/*
* If we still need entropy, can't read anything; if not, can
* read arbitrarily much.
*/
if (E->bitsneeded != 0 && E->samplesneeded != 0) {
ret = 0;
} else {
if (atomic_load_relaxed(&entropy_depletion))
kn->kn_data = ENTROPY_CAPACITY; /* bytes */
else
kn->kn_data = MIN(INT64_MAX, SSIZE_MAX);
ret = 1;
}
/* Release the lock, if caller is outside entropy subsystem. */
if (hint == NOTE_SUBMIT)
KASSERT(mutex_owned(&E->lock));
else
mutex_exit(&E->lock);
return ret;
}
/* XXX Makes sense only for /dev/u?random. */
static const struct filterops entropy_read_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_entropy_read_detach,
.f_event = filt_entropy_read_event,
};
/*
* entropy_kqfilter(kn)
*
* Register kn to receive entropy event notifications. May be
* EVFILT_READ or EVFILT_WRITE; anything else yields EINVAL.
*/
int
entropy_kqfilter(struct knote *kn)
{
KASSERT(!cold);
switch (kn->kn_filter) {
case EVFILT_READ:
/* Enter into the global select queue. */
mutex_enter(&E->lock);
kn->kn_fop = &entropy_read_filtops;
selrecord_knote(&E->selq, kn);
mutex_exit(&E->lock);
return 0;
case EVFILT_WRITE:
/* Can always dump entropy into the system. */
kn->kn_fop = &seltrue_filtops;
return 0;
default:
return EINVAL;
}
}
/*
* rndsource_setcb(rs, get, getarg)
*
* Set the request callback for the entropy source rs, if it can
* provide entropy on demand. Must precede rnd_attach_source.
*/
void
rndsource_setcb(struct krndsource *rs, void (*get)(size_t, void *),
void *getarg)
{
rs->get = get;
rs->getarg = getarg;
}
/*
* rnd_attach_source(rs, name, type, flags)
*
* Attach the entropy source rs. Must be done after
* rndsource_setcb, if any, and before any calls to rnd_add_data.
*/
void
rnd_attach_source(struct krndsource *rs, const char *name, uint32_t type,
uint32_t flags)
{
uint32_t extra[4];
unsigned i = 0;
KASSERTMSG(name[0] != '\0', "rndsource must have nonempty name");
/* Grab cycle counter to mix extra into the pool. */
extra[i++] = entropy_timer();
/*
* Apply some standard flags:
*
* - We do not bother with network devices by default, for
* hysterical raisins (perhaps: because it is often the case
* that an adversary can influence network packet timings).
*/
switch (type) {
case RND_TYPE_NET:
flags |= RND_FLAG_NO_COLLECT;
break;
}
/* Sanity-check the callback if RND_FLAG_HASCB is set. */
KASSERT(!ISSET(flags, RND_FLAG_HASCB) || rs->get != NULL);
/* Initialize the random source. */
memset(rs->name, 0, sizeof(rs->name)); /* paranoia */
strlcpy(rs->name, name, sizeof(rs->name));
memset(&rs->time_delta, 0, sizeof(rs->time_delta));
memset(&rs->value_delta, 0, sizeof(rs->value_delta));
rs->total = 0;
rs->type = type;
rs->flags = flags;
if (entropy_percpu != NULL)
rs->state = percpu_alloc(sizeof(struct rndsource_cpu));
extra[i++] = entropy_timer();
/* Wire it into the global list of random sources. */
if (__predict_true(!cold))
mutex_enter(&E->lock);
LIST_INSERT_HEAD(&E->sources, rs, list);
if (__predict_true(!cold))
mutex_exit(&E->lock);
extra[i++] = entropy_timer();
/* Request that it provide entropy ASAP, if we can. */
if (ISSET(flags, RND_FLAG_HASCB))
(*rs->get)(ENTROPY_CAPACITY, rs->getarg);
extra[i++] = entropy_timer();
/* Mix the extra into the pool. */
KASSERT(i == __arraycount(extra));
entropy_enter(extra, sizeof extra, 0, /*count*/__predict_true(!cold));
explicit_memset(extra, 0, sizeof extra);
}
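/*
* Illustrative sketch, not part of the original source: a hardware
* RNG driver sets its on-demand callback with rndsource_setcb and
* then attaches the source, in that order. The names example_softc,
* example_get, example_read_hwrng, and example_attach_rndsource are
* hypothetical; the one-bit-per-bit entropy claim is a driver policy
* choice, not a requirement.
*/
#if 0 /* example only -- not compiled */
static void
example_get(size_t nbytes, void *cookie)
{
struct example_softc *sc = cookie;
uint8_t buf[32];
size_t n;
while (nbytes) {
n = MIN(nbytes, sizeof buf);
if (example_read_hwrng(sc, buf, n) != 0)
break;
/* Claim one bit of entropy per bit of data from the HWRNG. */
rnd_add_data(&sc->sc_rndsource, buf, n, n*NBBY);
nbytes -= n;
}
explicit_memset(buf, 0, sizeof buf);
}
static void
example_attach_rndsource(struct example_softc *sc)
{
rndsource_setcb(&sc->sc_rndsource, example_get, sc);
rnd_attach_source(&sc->sc_rndsource, "example", RND_TYPE_RNG,
RND_FLAG_COLLECT_VALUE|RND_FLAG_HASCB);
}
#endif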
/*
* rnd_detach_source(rs)
*
* Detach the entropy source rs. May sleep waiting for users to
* drain. Further use is not allowed.
*/
void
rnd_detach_source(struct krndsource *rs)
{
/*
* If we're cold (shouldn't happen, but hey), just remove it
* from the list -- there's nothing allocated.
*/
if (__predict_false(cold) && entropy_percpu == NULL) {
LIST_REMOVE(rs, list);
return;
}
/* We may have to wait for entropy_request. */
ASSERT_SLEEPABLE();
/* Wait until the source list is not in use, and remove it. */
mutex_enter(&E->lock);
while (E->sourcelock)
cv_wait(&E->sourcelock_cv, &E->lock);
LIST_REMOVE(rs, list);
mutex_exit(&E->lock);
/* Free the per-CPU data. */
percpu_free(rs->state, sizeof(struct rndsource_cpu));
}
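/*
* Illustrative sketch, not part of the original source: the matching
* detach path for a driver that attached an rndsource as sketched
* above. The names example_softc and example_detach are hypothetical.
*/
#if 0 /* example only -- not compiled */
static int
example_detach(struct example_softc *sc)
{
/* May sleep while a concurrent entropy_request drains. */
rnd_detach_source(&sc->sc_rndsource);
return 0;
}
#endif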
/*
* rnd_lock_sources(flags)
*
* Lock the list of entropy sources. Caller must hold the global
* entropy lock. If successful, no rndsource will go away until
* rnd_unlock_sources even while the caller releases the global
* entropy lock.
*
* May be called very early at boot, before entropy_init.
*
* If flags & ENTROPY_WAIT, wait for concurrent access to finish.
* If flags & ENTROPY_SIG, allow interruption by signal.
*/
static int __attribute__((warn_unused_result))
rnd_lock_sources(int flags)
{
int error;
KASSERT(__predict_false(cold) || mutex_owned(&E->lock));
KASSERT(!cpu_intr_p());
while (E->sourcelock) {
KASSERT(!cold);
if (!ISSET(flags, ENTROPY_WAIT))
return EWOULDBLOCK;
if (ISSET(flags, ENTROPY_SIG)) {
error = cv_wait_sig(&E->sourcelock_cv, &E->lock);
if (error)
return error;
} else {
cv_wait(&E->sourcelock_cv, &E->lock);
}
}
E->sourcelock = curlwp;
return 0;
}
/*
* rnd_unlock_sources()
*
* Unlock the list of sources after rnd_lock_sources. Caller must
* hold the global entropy lock.
*
* May be called very early at boot, before entropy_init.
*/
static void
rnd_unlock_sources(void)
{
KASSERT(__predict_false(cold) || mutex_owned(&E->lock));
KASSERT(!cpu_intr_p());
KASSERTMSG(E->sourcelock == curlwp, "lwp %p releasing lock held by %p",
curlwp, E->sourcelock);
E->sourcelock = NULL;
if (__predict_true(!cold))
cv_signal(&E->sourcelock_cv);
}
/*
* rnd_sources_locked()
*
* True if we hold the list of rndsources locked, for diagnostic
* assertions.
*
* May be called very early at boot, before entropy_init.
*/
static bool __diagused
rnd_sources_locked(void)
{
return E->sourcelock == curlwp;
}
/*
* entropy_request(nbytes, flags)
*
* Request nbytes bytes of entropy from all sources in the system.
* OK if we overdo it. Caller must hold the global entropy lock;
* will release and re-acquire it.
*
* May be called very early at boot, before entropy_init.
*
* If flags & ENTROPY_WAIT, wait for concurrent access to finish.
* If flags & ENTROPY_SIG, allow interruption by signal.
*/
static int
entropy_request(size_t nbytes, int flags)
{
struct krndsource *rs;
int error;
KASSERT(__predict_false(cold) || mutex_owned(&E->lock));
KASSERT(!cpu_intr_p());
if ((flags & ENTROPY_WAIT) != 0 && __predict_false(!cold))
ASSERT_SLEEPABLE();
/*
* Lock the list of entropy sources to block rnd_detach_source
* until we're done, and to serialize calls to the entropy
* callbacks as guaranteed to drivers.
*/
error = rnd_lock_sources(flags);
if (error)
return error;
entropy_request_evcnt.ev_count++;
/* Clamp to the maximum reasonable request. */
nbytes = MIN(nbytes, ENTROPY_CAPACITY);
/* Walk the list of sources. */
LIST_FOREACH(rs, &E->sources, list) {
/* Skip sources without callbacks. */
if (!ISSET(rs->flags, RND_FLAG_HASCB))
continue;
/*
* Skip sources that are disabled altogether -- we
* would just ignore their samples anyway.
*/
if (ISSET(rs->flags, RND_FLAG_NO_COLLECT))
continue;
/* Drop the lock while we call the callback. */
if (__predict_true(!cold))
mutex_exit(&E->lock);
(*rs->get)(nbytes, rs->getarg);
if (__predict_true(!cold))
mutex_enter(&E->lock);
}
/* Request done; unlock the list of entropy sources. */
rnd_unlock_sources();
return 0;
}
static inline uint32_t
rnd_delta_estimate(rnd_delta_t *d, uint32_t v, int32_t delta)
{
int32_t delta2, delta3;
/*
* Calculate the second and third order differentials
*/
delta2 = d->dx - delta;
if (delta2 < 0)
delta2 = -delta2; /* XXX arithmetic overflow */
delta3 = d->d2x - delta2;
if (delta3 < 0)
delta3 = -delta3; /* XXX arithmetic overflow */
d->x = v;
d->dx = delta;
d->d2x = delta2;
/*
* If any delta is 0, we got no entropy. If all are non-zero, we
* might have something.
*/
if (delta == 0 || delta2 == 0 || delta3 == 0)
return 0;
return 1;
}
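/*
* Illustrative worked example, not part of the original source: with
* previous state x=100, dx=10, d2x=2, a new sample v=115 arriving
* with delta=15 gives delta2=|10-15|=5 and delta3=|2-5|=3. All three
* differentials are nonzero, so the sample is counted (returns 1).
* A source whose samples arrive at a perfectly constant interval
* yields delta2=0 and is never counted.
*/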
static inline uint32_t
rnd_dt_estimate(struct krndsource *rs, uint32_t t)
{
int32_t delta;
uint32_t ret;
rnd_delta_t *d;
struct rndsource_cpu *rc;
rc = percpu_getref(rs->state);
d = &rc->rc_timedelta;
if (t < d->x) {
delta = UINT32_MAX - d->x + t;
} else {
delta = d->x - t;
}
if (delta < 0) {
delta = -delta; /* XXX arithmetic overflow */
}
ret = rnd_delta_estimate(d, t, delta);
KASSERT(d->x == t);
KASSERT(d->dx == delta);
percpu_putref(rs->state);
return ret;
}
/*
* rnd_add_uint32(rs, value)
*
* Enter 32 bits of data from an entropy source into the pool.
*
* May be called from any context or with spin locks held, but may
* drop data.
*
* This is meant for cheaply taking samples from devices that
* aren't designed to be hardware random number generators.
*/
void
rnd_add_uint32(struct krndsource *rs, uint32_t value)
{
bool intr_p = true;
rnd_add_data_internal(rs, &value, sizeof value, 0, intr_p);
}
void
_rnd_add_uint32(struct krndsource *rs, uint32_t value)
{
bool intr_p = true;
rnd_add_data_internal(rs, &value, sizeof value, 0, intr_p);
}
void
_rnd_add_uint64(struct krndsource *rs, uint64_t value)
{
bool intr_p = true;
rnd_add_data_internal(rs, &value, sizeof value, 0, intr_p);
}
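/*
* Illustrative sketch, not part of the original source: a device
* driver may cheaply sample completion events from its interrupt
* handler; only the timing of the call carries any entropy, and no
* entropy is claimed. The names example_softc and example_intr are
* hypothetical.
*/
#if 0 /* example only -- not compiled */
static int
example_intr(void *cookie)
{
struct example_softc *sc = cookie;
/* ... service the device ... */
rnd_add_uint32(&sc->sc_rndsource, sc->sc_last_status);
return 1;
}
#endif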
/*
* rnd_add_data(rs, buf, len, entropybits)
*
* Enter data from an entropy source into the pool, with a
* driver's estimate of how much entropy the physical source of
* the data has. If RND_FLAG_NO_ESTIMATE, we ignore the driver's
* estimate and treat it as zero.
*
* rs MAY but SHOULD NOT be NULL. If rs is NULL, MUST NOT be
* called from interrupt context or with spin locks held.
*
* If rs is non-NULL, MAY but SHOULD NOT be called from interrupt
* context, in which case act like rnd_add_data_intr -- if the
* sample buffer is full, schedule a softint and drop any
* additional data on the floor. (This may change later once we
* fix drivers that still call this from interrupt context to use
* rnd_add_data_intr instead.) MUST NOT be called with spin locks
* held if not in hard interrupt context -- i.e., MUST NOT be
* called in thread context or softint context with spin locks
* held.
*/
void
rnd_add_data(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits)
{
bool intr_p = cpu_intr_p(); /* XXX make this unconditionally false */
/*
* Weird legacy exception that we should rip out and replace by
* creating new rndsources to attribute entropy to the callers:
* If there's no rndsource, just enter the data and time now.
*/
if (rs == NULL) {
uint32_t extra;
KASSERT(!intr_p);
KASSERTMSG(howmany(entropybits, NBBY) <= len,
"%s: impossible entropy rate:"
" %"PRIu32" bits in %"PRIu32"-byte string",
rs ? rs->name : "(anonymous)", entropybits, len);
entropy_enter(buf, len, entropybits, /*count*/false);
extra = entropy_timer();
entropy_enter(&extra, sizeof extra, 0, /*count*/false);
explicit_memset(&extra, 0, sizeof extra);
return;
}
rnd_add_data_internal(rs, buf, len, entropybits, intr_p);
}
/*
* rnd_add_data_intr(rs, buf, len, entropybits)
*
* Try to enter data from an entropy source into the pool, with a
* driver's estimate of how much entropy the physical source of
* the data has. If RND_FLAG_NO_ESTIMATE, we ignore the driver's
* estimate and treat it as zero. If the sample buffer is full,
* schedule a softint and drop any additional data on the floor.
*/
void
rnd_add_data_intr(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits)
{
bool intr_p = true;
rnd_add_data_internal(rs, buf, len, entropybits, intr_p);
}
/*
* rnd_add_data_internal(rs, buf, len, entropybits, intr_p)
*
* Internal subroutine to decide whether or not to enter data or
* timing for a particular rndsource, and if so, to enter it.
*
* intr_p is true for callers from interrupt context or spin locks
* held, and false for callers from thread or soft interrupt
* context and no spin locks held.
*/
static void
rnd_add_data_internal(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits, bool intr_p)
{
uint32_t flags;
KASSERTMSG(howmany(entropybits, NBBY) <= len,
"%s: impossible entropy rate:"
" %"PRIu32" bits in %"PRIu32"-byte string",
rs ? rs->name : "(anonymous)", entropybits, len);
/*
* Hold up the reset xcall before it zeroes the entropy counts
* on this CPU or globally. Otherwise, we might leave some
* nonzero entropy attributed to an untrusted source in the
* event of a race with a change to flags.
*/
kpreempt_disable();
/* Load a snapshot of the flags. Ioctl may change them under us. */
flags = atomic_load_relaxed(&rs->flags);
/*
* Skip if:
* - we're not collecting entropy, or
* - the operator doesn't want to collect entropy from this, or
* - neither data nor timings are being collected from this.
*/
if (!atomic_load_relaxed(&entropy_collection) ||
ISSET(flags, RND_FLAG_NO_COLLECT) ||
!ISSET(flags, RND_FLAG_COLLECT_VALUE|RND_FLAG_COLLECT_TIME))
goto out;
/* If asked, ignore the estimate. */
if (ISSET(flags, RND_FLAG_NO_ESTIMATE))
entropybits = 0;
/* If we are collecting data, enter them. */
if (ISSET(flags, RND_FLAG_COLLECT_VALUE)) {
rnd_add_data_1(rs, buf, len, entropybits, /*count*/false,
RND_FLAG_COLLECT_VALUE, intr_p);
}
/* If we are collecting timings, enter one. */
if (ISSET(flags, RND_FLAG_COLLECT_TIME)) {
uint32_t extra;
bool count;
/* Sample a timer. */
extra = entropy_timer();
/* If asked, do entropy estimation on the time. */
if ((flags & (RND_FLAG_ESTIMATE_TIME|RND_FLAG_NO_ESTIMATE)) ==
RND_FLAG_ESTIMATE_TIME && __predict_true(!cold))
count = rnd_dt_estimate(rs, extra);
else
count = false;
rnd_add_data_1(rs, &extra, sizeof extra, 0, count,
RND_FLAG_COLLECT_TIME, intr_p);
}
out: /* Allow concurrent changes to flags to finish. */
kpreempt_enable();
}
static unsigned
add_sat(unsigned a, unsigned b)
{
unsigned c = a + b;
return (c < a ? UINT_MAX : c);
}
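/*
* Illustrative note, not part of the original source: add_sat
* saturates instead of wrapping, e.g. add_sat(UINT_MAX - 3, 10)
* returns UINT_MAX, so per-source sample and bit counters stick at
* the maximum rather than rolling over to small values.
*/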
/*
* rnd_add_data_1(rs, buf, len, entropybits, count, flag)
*
* Internal subroutine to call either entropy_enter_intr, if we're
* in interrupt context, or entropy_enter if not, and to count the
* entropy in an rndsource.
*/
static void
rnd_add_data_1(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits, bool count, uint32_t flag, bool intr_p)
{
bool fullyused;
/*
* For the interrupt-like path, use entropy_enter_intr and take
* note of whether it consumed the full sample; otherwise, use
* entropy_enter, which always consumes the full sample.
*/
if (intr_p) {
fullyused = entropy_enter_intr(buf, len, entropybits, count);
} else {
entropy_enter(buf, len, entropybits, count);
fullyused = true;
}
/*
* If we used the full sample, note how many bits were
* contributed from this source.
*/
if (fullyused) {
if (__predict_false(cold)) {
const int s = splhigh();
rs->total = add_sat(rs->total, entropybits);
switch (flag) {
case RND_FLAG_COLLECT_TIME:
rs->time_delta.insamples =
add_sat(rs->time_delta.insamples, 1);
break;
case RND_FLAG_COLLECT_VALUE:
rs->value_delta.insamples =
add_sat(rs->value_delta.insamples, 1);
break;
}
splx(s);
} else {
struct rndsource_cpu *rc = percpu_getref(rs->state);
atomic_store_relaxed(&rc->rc_entropybits,
add_sat(rc->rc_entropybits, entropybits));
switch (flag) {
case RND_FLAG_COLLECT_TIME:
atomic_store_relaxed(&rc->rc_timesamples,
add_sat(rc->rc_timesamples, 1));
break;
case RND_FLAG_COLLECT_VALUE:
atomic_store_relaxed(&rc->rc_datasamples,
add_sat(rc->rc_datasamples, 1));
break;
}
percpu_putref(rs->state);
}
}
}
/*
* rnd_add_data_sync(rs, buf, len, entropybits)
*
* Same as rnd_add_data. Originally used in rndsource callbacks,
* to break an unnecessary cycle; no longer really needed.
*/
void
rnd_add_data_sync(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits)
{
rnd_add_data(rs, buf, len, entropybits);
}
/*
* rndsource_entropybits(rs)
*
* Return approximately the number of bits of entropy that have
* been contributed via rs so far. Approximate if other CPUs may
* be calling rnd_add_data concurrently.
*/
static unsigned
rndsource_entropybits(struct krndsource *rs)
{
unsigned nbits = rs->total;
KASSERT(!cold);
KASSERT(rnd_sources_locked());
percpu_foreach(rs->state, rndsource_entropybits_cpu, &nbits);
return nbits;
}
static void
rndsource_entropybits_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct rndsource_cpu *rc = ptr;
unsigned *nbitsp = cookie;
unsigned cpu_nbits;
cpu_nbits = atomic_load_relaxed(&rc->rc_entropybits);
*nbitsp += MIN(UINT_MAX - *nbitsp, cpu_nbits);
}
/*
* rndsource_to_user(rs, urs)
*
* Copy a description of rs out to urs for userland.
*/
static void
rndsource_to_user(struct krndsource *rs, rndsource_t *urs)
{
KASSERT(!cold);
KASSERT(rnd_sources_locked());
/* Avoid kernel memory disclosure. */
memset(urs, 0, sizeof(*urs));
CTASSERT(sizeof(urs->name) == sizeof(rs->name));
strlcpy(urs->name, rs->name, sizeof(urs->name));
urs->total = rndsource_entropybits(rs);
urs->type = rs->type;
urs->flags = atomic_load_relaxed(&rs->flags);
}
/*
* rndsource_to_user_est(rs, urse)
*
* Copy a description of rs and estimation statistics out to urse
* for userland.
*/
static void
rndsource_to_user_est(struct krndsource *rs, rndsource_est_t *urse)
{
KASSERT(!cold);
KASSERT(rnd_sources_locked());
/* Avoid kernel memory disclosure. */
memset(urse, 0, sizeof(*urse));
/* Copy out the rndsource description. */
rndsource_to_user(rs, &urse->rt);
/* Gather the statistics. */
urse->dt_samples = rs->time_delta.insamples;
urse->dt_total = 0;
urse->dv_samples = rs->value_delta.insamples;
urse->dv_total = urse->rt.total;
percpu_foreach(rs->state, rndsource_to_user_est_cpu, urse);
}
static void
rndsource_to_user_est_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct rndsource_cpu *rc = ptr;
rndsource_est_t *urse = cookie;
urse->dt_samples = add_sat(urse->dt_samples,
atomic_load_relaxed(&rc->rc_timesamples));
urse->dv_samples = add_sat(urse->dv_samples,
atomic_load_relaxed(&rc->rc_datasamples));
}
/*
* entropy_reset_xc(arg1, arg2)
*
* Reset the current CPU's pending entropy to zero.
*/
static void
entropy_reset_xc(void *arg1 __unused, void *arg2 __unused)
{
uint32_t extra = entropy_timer();
struct entropy_cpu_lock lock;
struct entropy_cpu *ec;
/*
* With the per-CPU state locked, zero the pending count and
* enter a cycle count for fun.
*/
ec = entropy_cpu_get(&lock);
ec->ec_bitspending = 0;
ec->ec_samplespending = 0;
entpool_enter(ec->ec_pool, &extra, sizeof extra);
entropy_cpu_put(&lock, ec);
}
/*
* entropy_ioctl(cmd, data)
*
* Handle various /dev/random ioctl queries.
*/
int
entropy_ioctl(unsigned long cmd, void *data)
{
struct krndsource *rs;
bool privileged;
int error;
KASSERT(!cold);
/* Verify user's authorization to perform the ioctl. */
switch (cmd) {
case RNDGETENTCNT:
case RNDGETPOOLSTAT:
case RNDGETSRCNUM:
case RNDGETSRCNAME:
case RNDGETESTNUM:
case RNDGETESTNAME:
error = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_RND_GETPRIV, NULL, NULL, NULL, NULL);
break;
case RNDCTL:
error = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_RND_SETPRIV, NULL, NULL, NULL, NULL);
break;
case RNDADDDATA:
error = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_RND_ADDDATA, NULL, NULL, NULL, NULL);
/* Ascertain whether the user's inputs should be counted. */
if (kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_RND_ADDDATA_ESTIMATE,
NULL, NULL, NULL, NULL) == 0)
privileged = true;
break;
default: {
/*
* XXX Hack to avoid changing module ABI so this can be
* pulled up. Later, we can just remove the argument.
*/
static const struct fileops fops = {
.fo_ioctl = rnd_system_ioctl,
};
struct file f = {
.f_ops = &fops,
};
MODULE_HOOK_CALL(rnd_ioctl_50_hook, (&f, cmd, data),
enosys(), error);
#if defined(_LP64)
if (error == ENOSYS)
MODULE_HOOK_CALL(rnd_ioctl32_50_hook, (&f, cmd, data),
enosys(), error);
#endif
if (error == ENOSYS)
error = ENOTTY;
break;
}
}
/* If anything went wrong with authorization, stop here. */
if (error)
return error;
/* Dispatch on the command. */
switch (cmd) {
case RNDGETENTCNT: { /* Get current entropy count in bits. */
uint32_t *countp = data;
mutex_enter(&E->lock);
*countp = MINENTROPYBITS - E->bitsneeded;
mutex_exit(&E->lock);
break;
}
case RNDGETPOOLSTAT: { /* Get entropy pool statistics. */
rndpoolstat_t *pstat = data;
mutex_enter(&E->lock);
/* parameters */
pstat->poolsize = ENTPOOL_SIZE/sizeof(uint32_t); /* words */
pstat->threshold = MINENTROPYBITS/NBBY; /* bytes */
pstat->maxentropy = ENTROPY_CAPACITY*NBBY; /* bits */
/* state */
pstat->added = 0; /* XXX total entropy_enter count */
pstat->curentropy = MINENTROPYBITS - E->bitsneeded; /* bits */
pstat->removed = 0; /* XXX total entropy_extract count */
pstat->discarded = 0; /* XXX bits of entropy beyond capacity */
/*
* This used to be bits of data fabricated in some
* sense; we'll take it to mean number of samples,
* excluding the bits of entropy from HWRNG or seed.
*/
pstat->generated = MINSAMPLES - E->samplesneeded;
pstat->generated -= MIN(pstat->generated, pstat->curentropy);
mutex_exit(&E->lock);
break;
}
case RNDGETSRCNUM: { /* Get entropy sources by number. */
rndstat_t *stat = data;
uint32_t start = 0, i = 0;
/* Skip if none requested; fail if too many requested. */
if (stat->count == 0)
break;
if (stat->count > RND_MAXSTATCOUNT)
return EINVAL;
/*
* Under the lock, find the first one, copy out as many
* as requested, and report how many we copied out.
*/
mutex_enter(&E->lock);
error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG);
if (error) {
mutex_exit(&E->lock);
return error;
}
LIST_FOREACH(rs, &E->sources, list) {
if (start++ == stat->start)
break;
}
while (i < stat->count && rs != NULL) {
mutex_exit(&E->lock);
rndsource_to_user(rs, &stat->source[i++]);
mutex_enter(&E->lock);
rs = LIST_NEXT(rs, list);
}
KASSERT(i <= stat->count);
stat->count = i;
rnd_unlock_sources();
mutex_exit(&E->lock);
break;
}
case RNDGETESTNUM: { /* Get sources and estimates by number. */
rndstat_est_t *estat = data;
uint32_t start = 0, i = 0;
/* Skip if none requested; fail if too many requested. */
if (estat->count == 0)
break;
if (estat->count > RND_MAXSTATCOUNT)
return EINVAL;
/*
* Under the lock, find the first one, copy out as many
* as requested, and report how many we copied out.
*/
mutex_enter(&E->lock);
error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG);
if (error) {
mutex_exit(&E->lock);
return error;
}
LIST_FOREACH(rs, &E->sources, list) {
if (start++ == estat->start)
break;
}
while (i < estat->count && rs != NULL) {
mutex_exit(&E->lock);
rndsource_to_user_est(rs, &estat->source[i++]);
mutex_enter(&E->lock);
rs = LIST_NEXT(rs, list);
}
KASSERT(i <= estat->count);
estat->count = i;
rnd_unlock_sources();
mutex_exit(&E->lock);
break;
}
case RNDGETSRCNAME: { /* Get entropy sources by name. */
rndstat_name_t *nstat = data;
const size_t n = sizeof(rs->name);
CTASSERT(sizeof(rs->name) == sizeof(nstat->name));
/*
* Under the lock, search by name. If found, copy it
* out; if not found, fail with ENOENT.
*/
mutex_enter(&E->lock);
error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG);
if (error) {
mutex_exit(&E->lock);
return error;
}
LIST_FOREACH(rs, &E->sources, list) {
if (strncmp(rs->name, nstat->name, n) == 0)
break;
}
if (rs != NULL) {
mutex_exit(&E->lock);
rndsource_to_user(rs, &nstat->source);
mutex_enter(&E->lock);
} else {
error = ENOENT;
}
rnd_unlock_sources();
mutex_exit(&E->lock);
break;
}
case RNDGETESTNAME: { /* Get sources and estimates by name. */
rndstat_est_name_t *enstat = data;
const size_t n = sizeof(rs->name);
CTASSERT(sizeof(rs->name) == sizeof(enstat->name));
/*
* Under the lock, search by name. If found, copy it
* out; if not found, fail with ENOENT.
*/
mutex_enter(&E->lock);
error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG);
if (error) {
mutex_exit(&E->lock);
return error;
}
LIST_FOREACH(rs, &E->sources, list) {
if (strncmp(rs->name, enstat->name, n) == 0)
break;
}
if (rs != NULL) {
mutex_exit(&E->lock);
rndsource_to_user_est(rs, &enstat->source);
mutex_enter(&E->lock);
} else {
error = ENOENT;
}
rnd_unlock_sources();
mutex_exit(&E->lock);
break;
}
case RNDCTL: { /* Modify entropy source flags. */
rndctl_t *rndctl = data;
const size_t n = sizeof(rs->name);
uint32_t resetflags = RND_FLAG_NO_ESTIMATE|RND_FLAG_NO_COLLECT;
uint32_t flags;
bool reset = false, request = false;
CTASSERT(sizeof(rs->name) == sizeof(rndctl->name));
/* Whitelist the flags that the user can change. */
rndctl->mask &= RND_FLAG_NO_ESTIMATE|RND_FLAG_NO_COLLECT;
/*
* For each matching rndsource, either by type if
* specified or by name if not, set the masked flags.
*/
mutex_enter(&E->lock);
LIST_FOREACH(rs, &E->sources, list) {
if (rndctl->type != 0xff) {
if (rs->type != rndctl->type)
continue;
} else if (rndctl->name[0] != '\0') {
if (strncmp(rs->name, rndctl->name, n) != 0)
continue;
}
flags = rs->flags & ~rndctl->mask;
flags |= rndctl->flags & rndctl->mask;
if ((rs->flags & resetflags) == 0 &&
(flags & resetflags) != 0)
reset = true;
if ((rs->flags ^ flags) & resetflags)
request = true;
atomic_store_relaxed(&rs->flags, flags);
}
mutex_exit(&E->lock);
/*
* If we disabled estimation or collection, nix all the
* pending entropy and set needed to the maximum.
*/
if (reset) {
xc_broadcast(0, &entropy_reset_xc, NULL, NULL);
mutex_enter(&E->lock);
E->bitspending = 0;
E->samplespending = 0;
atomic_store_relaxed(&E->bitsneeded, MINENTROPYBITS);
atomic_store_relaxed(&E->samplesneeded, MINSAMPLES);
E->consolidate = false;
mutex_exit(&E->lock);
}
/*
* If we changed any of the estimation or collection
* flags, request new samples from everyone -- either
* to make up for what we just lost, or to get new
* samples from what we just added.
*
* Failing on signal, while waiting for another process
* to finish requesting entropy, is OK here even though
* we have committed side effects, because this ioctl
* command is idempotent, so repeating it is safe.
*/
if (request) {
mutex_enter(&E->lock);
error = entropy_request(ENTROPY_CAPACITY,
ENTROPY_WAIT|ENTROPY_SIG);
mutex_exit(&E->lock);
}
break;
}
case RNDADDDATA: { /* Enter seed into entropy pool. */
rnddata_t *rdata = data;
unsigned entropybits = 0;
if (!atomic_load_relaxed(&entropy_collection))
break; /* thanks but no thanks */
if (rdata->len > MIN(sizeof(rdata->data), UINT32_MAX/NBBY))
return EINVAL;
/*
* This ioctl serves as the userland alternative to a
* bootloader-provided seed -- typically furnished by
* /etc/rc.d/random_seed. We accept the user's entropy
* claim only if
*
* (a) the user is privileged, and
* (b) we have not entered a bootloader seed.
*
* under the assumption that the user may use this to
* load a seed from disk that we have already loaded
* from the bootloader, so we don't double-count it.
*/
if (privileged && rdata->entropy && rdata->len) {
mutex_enter(&E->lock);
if (!E->seeded) {
entropybits = MIN(rdata->entropy,
MIN(rdata->len, ENTROPY_CAPACITY)*NBBY);
E->seeded = true;
}
mutex_exit(&E->lock);
}
/* Enter the data and consolidate entropy. */
rnd_add_data(&seed_rndsource, rdata->data, rdata->len,
entropybits);
entropy_consolidate();
break;
}
default:
error = ENOTTY;
}
/* Return any error that may have come up. */
return error;
}
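/*
* Illustrative userland sketch, not part of the original source: the
* RNDADDDATA path above roughly corresponds to what rndctl(8) does
* when /etc/rc.d/random_seed loads a saved seed at boot:
*
*	rnddata_t rd;
*	rd.len = read(seedfd, rd.data, sizeof(rd.data));
*	rd.entropy = nbits;	(entropy claimed for the saved seed)
*	ioctl(randomfd, RNDADDDATA, &rd);
*
* The variables seedfd, randomfd, and nbits are hypothetical.
*/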
/* Legacy entry points */
void
rnd_seed(void *seed, size_t len)
{
if (len != sizeof(rndsave_t)) {
printf("entropy: invalid seed length: %zu,"
" expected sizeof(rndsave_t) = %zu\n",
len, sizeof(rndsave_t));
return;
}
entropy_seed(seed);
}
void
rnd_init(void)
{
entropy_init();
}
void
rnd_init_softint(void)
{
entropy_init_late();
entropy_bootrequest();
}
int
rnd_system_ioctl(struct file *fp, unsigned long cmd, void *data)
{
return entropy_ioctl(cmd, data);
}
/* $NetBSD: kern_rwlock_obj.c,v 1.13 2023/10/02 21:03:55 ad Exp $ */
/*-
* Copyright (c) 2008, 2009, 2019, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rwlock_obj.c,v 1.13 2023/10/02 21:03:55 ad Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/rwlock.h>
/* Rwlock object cache */
#define RW_OBJ_MAGIC 0x85d3c85d
struct krwobj {
krwlock_t ro_lock;
u_int ro_magic;
u_int ro_refcnt;
uint8_t mo_pad[COHERENCY_UNIT - sizeof(krwlock_t) -
sizeof(u_int) * 2];
};
/*
* rw_obj_alloc:
*
* Allocate a single lock object, waiting for memory if needed.
*/
krwlock_t *
rw_obj_alloc(void)
{
struct krwobj *ro;
ro = kmem_intr_alloc(sizeof(*ro), KM_SLEEP);
KASSERT(ALIGNED_POINTER(ro, coherency_unit));
_rw_init(&ro->ro_lock, (uintptr_t)__builtin_return_address(0));
ro->ro_magic = RW_OBJ_MAGIC;
ro->ro_refcnt = 1;
return (krwlock_t *)ro;
}
/*
* rw_obj_tryalloc:
*
* Allocate a single lock object, but fail if no memory is available.
*/
krwlock_t *
rw_obj_tryalloc(void)
{
struct krwobj *ro;
ro = kmem_intr_alloc(sizeof(*ro), KM_NOSLEEP);
KASSERT(ALIGNED_POINTER(ro, coherency_unit));
if (__predict_true(ro != NULL)) {
_rw_init(&ro->ro_lock, (uintptr_t)__builtin_return_address(0));
ro->ro_magic = RW_OBJ_MAGIC;
ro->ro_refcnt = 1;
}
return (krwlock_t *)ro;
}
/*
* rw_obj_hold:
*
* Add a single reference to a lock object. A reference to the object
* must already be held, and must be held across this call.
*/
void
rw_obj_hold(krwlock_t *lock)
{
struct krwobj *ro = (struct krwobj *)lock;
KASSERT(ro->ro_magic == RW_OBJ_MAGIC);
KASSERT(ro->ro_refcnt > 0);
atomic_inc_uint(&ro->ro_refcnt);
}
/*
* rw_obj_free:
*
* Drop a reference from a lock object. If the last reference is being
* dropped, free the object and return true. Otherwise, return false.
*/
bool
rw_obj_free(krwlock_t *lock)
{
struct krwobj *ro = (struct krwobj *)lock;
KASSERT(ro->ro_magic == RW_OBJ_MAGIC);
KASSERT(ro->ro_refcnt > 0);
membar_release();
if (atomic_dec_uint_nv(&ro->ro_refcnt) > 0) {
return false;
}
membar_acquire();
rw_destroy(&ro->ro_lock);
kmem_intr_free(ro, sizeof(*ro));
return true;
}
/*
* rw_obj_refcnt:
*
* Return the reference count for a lock object.
*/
u_int
rw_obj_refcnt(krwlock_t *lock)
{
struct krwobj *ro = (struct krwobj *)lock;
return ro->ro_refcnt;
}
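/*
* Illustrative sketch, not part of the original source: the intended
* life cycle of a shared, reference-counted lock object.
*/
#if 0 /* example only -- not compiled */
static void
example_rw_obj_lifecycle(void)
{
krwlock_t *lock;
lock = rw_obj_alloc(); /* refcnt == 1 */
rw_obj_hold(lock); /* refcnt == 2: a second user shares it */
rw_obj_free(lock); /* refcnt == 1: the object survives */
rw_obj_free(lock); /* refcnt == 0: the object is destroyed */
}
#endif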
/* $NetBSD: uipc_syscalls_50.c,v 1.12 2022/09/28 15:32:09 msaitoh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/msg.h>
#include <sys/sysctl.h>
#include <sys/syscallargs.h>
#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/compat_stub.h>
#include <net/if.h>
#include <compat/net/if.h>
#include <compat/sys/time.h>
#include <compat/sys/socket.h>
#include <compat/sys/sockio.h>
#include <compat/common/compat_mod.h>
/*ARGSUSED*/
static int
compat_ifdatareq(struct lwp *l, u_long cmd, void *data)
{
struct if_data ifi;
struct ifdatareq50 *ifdr = data;
struct ifnet *ifp;
int error;
/* Validate arguments. */
switch (cmd) {
case OSIOCGIFDATA:
case OSIOCZIFDATA:
break;
default:
return ENOSYS;
}
ifp = ifunit(ifdr->ifdr_name);
if (ifp == NULL)
return ENXIO;
/* Do work. */
switch (cmd) {
case OSIOCGIFDATA:
if_export_if_data(ifp, &ifi, false);
ifdatan2o(&ifdr->ifdr_data, &ifi);
return 0;
case OSIOCZIFDATA:
if (l != NULL) {
error = kauth_authorize_network(l->l_cred,
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp,
(void *)cmd, NULL);
if (error != 0)
return error;
}
if_export_if_data(ifp, &ifi, true);
ifdatan2o(&ifdr->ifdr_data, &ifi);
/* XXX if_lastchange? */
return 0;
default:
/* Impossible due to above validation, but makes gcc happy. */
return ENOSYS;
}
}
void
uipc_syscalls_50_init(void)
{
MODULE_HOOK_SET(uipc_syscalls_50_hook, compat_ifdatareq);
}
void
uipc_syscalls_50_fini(void)
{
MODULE_HOOK_UNSET(uipc_syscalls_50_hook);
}
/* $NetBSD: pmap.h,v 1.134 2022/08/20 23:49:31 riastradh Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2001 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Frank van der Linden for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* pmap.h: see pmap.c for the history of this pmap module.
*/
#ifndef _X86_PMAP_H_
#define _X86_PMAP_H_
#if defined(_KERNEL)
#include <x86/pmap_pv.h>
#include <uvm/pmap/pmap_pvt.h>
/*
* MD flags that we use for pmap_enter and pmap_kenter_pa:
*/
/*
* macros
*/
#define pmap_clear_modify(pg) pmap_clear_attrs(pg, PP_ATTRS_D)
#define pmap_clear_reference(pg) pmap_clear_attrs(pg, PP_ATTRS_A)
#define pmap_copy(DP,SP,D,L,S) __USE(L)
#define pmap_is_modified(pg) pmap_test_attrs(pg, PP_ATTRS_D)
#define pmap_is_referenced(pg) pmap_test_attrs(pg, PP_ATTRS_A)
#define pmap_move(DP,SP,D,L,S)
#define pmap_phys_address(ppn) (x86_ptob(ppn) & ~X86_MMAP_FLAG_MASK)
#define pmap_mmap_flags(ppn) x86_mmap_flags(ppn)
#if defined(__x86_64__) || defined(PAE)
#define X86_MMAP_FLAG_SHIFT (64 - PGSHIFT)
#else
#define X86_MMAP_FLAG_SHIFT (32 - PGSHIFT)
#endif
#define X86_MMAP_FLAG_MASK 0xf
#define X86_MMAP_FLAG_PREFETCH 0x1
/*
* prototypes
*/
void pmap_activate(struct lwp *);
void pmap_bootstrap(vaddr_t);
bool pmap_clear_attrs(struct vm_page *, unsigned);
bool pmap_pv_clear_attrs(paddr_t, unsigned);
void pmap_deactivate(struct lwp *);
void pmap_page_remove(struct vm_page *);
void pmap_pv_remove(paddr_t);
void pmap_remove(struct pmap *, vaddr_t, vaddr_t);
bool pmap_test_attrs(struct vm_page *, unsigned);
void pmap_write_protect(struct pmap *, vaddr_t, vaddr_t, vm_prot_t);
void pmap_load(void);
paddr_t pmap_init_tmp_pgtbl(paddr_t);
bool pmap_remove_all(struct pmap *);
void pmap_ldt_cleanup(struct lwp *);
void pmap_ldt_sync(struct pmap *);
void pmap_kremove_local(vaddr_t, vsize_t);
#define __HAVE_PMAP_PV_TRACK 1
void pmap_pv_init(void);
void pmap_pv_track(paddr_t, psize_t);
void pmap_pv_untrack(paddr_t, psize_t);
u_int x86_mmap_flags(paddr_t);
#define PMAP_GROWKERNEL /* turn on pmap_growkernel interface */
#define PMAP_FORK /* turn on pmap_fork interface */
/*
* inline functions
*/
/*
* pmap_page_protect: change the protection of all recorded mappings
* of a managed page
*
* => this function is a frontend for pmap_page_remove/pmap_clear_attrs
* => we only have to worry about making the page more protected.
* unprotecting a page is done on-demand at fault time.
*/
__inline static void __unused
pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
{
if ((prot & VM_PROT_WRITE) == 0) {
if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
(void)pmap_clear_attrs(pg, PP_ATTRS_W);
} else {
pmap_page_remove(pg);
}
}
}
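/*
* Illustrative sketch, not part of the original source: typical uses
* of pmap_page_protect, e.g. write-protecting a page before cleaning
* it versus revoking all access. The name example_page_protect is
* hypothetical.
*/
#if 0 /* example only -- not compiled */
static void
example_page_protect(struct vm_page *pg)
{
pmap_page_protect(pg, VM_PROT_READ); /* clear write access everywhere */
pmap_page_protect(pg, VM_PROT_NONE); /* then remove all mappings of pg */
}
#endif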
/*
* pmap_pv_protect: change the protection of all recorded mappings
* of an unmanaged page
*/
__inline static void __unused
pmap_pv_protect(paddr_t pa, vm_prot_t prot)
{
if ((prot & VM_PROT_WRITE) == 0) {
if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
(void)pmap_pv_clear_attrs(pa, PP_ATTRS_W);
} else {
pmap_pv_remove(pa);
}
}
}
/*
* pmap_protect: change the protection of pages in a pmap
*
* => this function is a frontend for pmap_remove/pmap_write_protect
* => we only have to worry about making the page more protected.
* unprotecting a page is done on-demand at fault time.
*/
__inline static void __unused
pmap_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
if ((prot & VM_PROT_WRITE) == 0) {
if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
pmap_write_protect(pmap, sva, eva, prot);
} else {
pmap_remove(pmap, sva, eva);
}
}
}
paddr_t vtophys(vaddr_t);
vaddr_t pmap_map(vaddr_t, paddr_t, paddr_t, vm_prot_t);
void pmap_cpu_init_late(struct cpu_info *);
/* pmap functions with machine addresses */
void pmap_kenter_ma(vaddr_t, paddr_t, vm_prot_t, u_int);
int pmap_enter_ma(struct pmap *, vaddr_t, paddr_t, paddr_t,
vm_prot_t, u_int, int);
bool pmap_extract_ma(pmap_t, vaddr_t, paddr_t *);
paddr_t pmap_get_physpage(void);
/*
* Hooks for the pool allocator.
*/
#define POOL_VTOPHYS(va) vtophys((vaddr_t) (va))
#ifdef __HAVE_DIRECT_MAP
extern vaddr_t pmap_direct_base;
extern vaddr_t pmap_direct_end;
#define PMAP_DIRECT_BASE pmap_direct_base
#define PMAP_DIRECT_END pmap_direct_end
#define PMAP_DIRECT_MAP(pa) ((vaddr_t)PMAP_DIRECT_BASE + (pa))
#define PMAP_DIRECT_UNMAP(va) ((paddr_t)(va) - PMAP_DIRECT_BASE)
/*
* Alternate mapping hooks for pool pages.
*/
#define PMAP_MAP_POOLPAGE(pa) PMAP_DIRECT_MAP((pa))
#define PMAP_UNMAP_POOLPAGE(va) PMAP_DIRECT_UNMAP((va))
#endif /* __HAVE_DIRECT_MAP */
#define __HAVE_VM_PAGE_MD
#define VM_MDPAGE_INIT(pg) \
memset(&(pg)->mdpage, 0, sizeof((pg)->mdpage)); \
PMAP_PAGE_INIT(&(pg)->mdpage.mp_pp)
struct vm_page_md {
struct pmap_page mp_pp;
};
#endif /* _KERNEL */
#endif /* _X86_PMAP_H_ */
/* $NetBSD: uvm_map.c,v 1.411 2024/02/09 22:08:38 andvar Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_map.c 8.3 (Berkeley) 1/12/94
* from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_map.c: uvm map operations
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_map.c,v 1.411 2024/02/09 22:08:38 andvar Exp $");
#include "opt_ddb.h"
#include "opt_pax.h"
#include "opt_uvmhist.h"
#include "opt_uvm.h"
#include "opt_sysv.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/pax.h>
#include <sys/vnode.h>
#include <sys/filedesc.h>
#include <sys/lockdebug.h>
#include <sys/atomic.h>
#include <sys/sysctl.h>
#ifndef __USER_VA0_IS_SAFE
#include <sys/kauth.h>
#include "opt_user_va0_disable_default.h"
#endif
#include <sys/shm.h>
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#if defined(DDB) || defined(DEBUGPRINT)
#include <uvm/uvm_ddb.h>
#endif
#ifdef UVMHIST
#ifndef UVMHIST_MAPHIST_SIZE
#define UVMHIST_MAPHIST_SIZE 100
#endif
static struct kern_history_ent maphistbuf[UVMHIST_MAPHIST_SIZE];
UVMHIST_DEFINE(maphist) = UVMHIST_INITIALIZER(maphist, maphistbuf);
#endif
#if !defined(UVMMAP_COUNTERS)
#define UVMMAP_EVCNT_DEFINE(name) /* nothing */
#define UVMMAP_EVCNT_INCR(ev) /* nothing */
#define UVMMAP_EVCNT_DECR(ev) /* nothing */
#else /* defined(UVMMAP_COUNTERS) */
#include <sys/evcnt.h>
#define UVMMAP_EVCNT_DEFINE(name) \
struct evcnt uvmmap_evcnt_##name = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, \
"uvmmap", #name); \
EVCNT_ATTACH_STATIC(uvmmap_evcnt_##name);
#define UVMMAP_EVCNT_INCR(ev) uvmmap_evcnt_##ev.ev_count++
#define UVMMAP_EVCNT_DECR(ev) uvmmap_evcnt_##ev.ev_count--
#endif /* !defined(UVMMAP_COUNTERS) */
UVMMAP_EVCNT_DEFINE(ubackmerge)
UVMMAP_EVCNT_DEFINE(uforwmerge)
UVMMAP_EVCNT_DEFINE(ubimerge)
UVMMAP_EVCNT_DEFINE(unomerge)
UVMMAP_EVCNT_DEFINE(kbackmerge)
UVMMAP_EVCNT_DEFINE(kforwmerge)
UVMMAP_EVCNT_DEFINE(kbimerge)
UVMMAP_EVCNT_DEFINE(knomerge)
UVMMAP_EVCNT_DEFINE(map_call)
UVMMAP_EVCNT_DEFINE(mlk_call)
UVMMAP_EVCNT_DEFINE(mlk_hint)
UVMMAP_EVCNT_DEFINE(mlk_tree)
UVMMAP_EVCNT_DEFINE(mlk_treeloop)
const char vmmapbsy[] = "vmmapbsy";
/*
* cache for dynamically-allocated map entries.
*/
static struct pool_cache uvm_map_entry_cache;
#ifdef PMAP_GROWKERNEL
/*
* This global represents the end of the kernel virtual address
* space. If we want to exceed this, we must grow the kernel
* virtual address space dynamically.
*
* Note, this variable is locked by kernel_map's lock.
*/
vaddr_t uvm_maxkaddr;
#endif
#ifndef __USER_VA0_IS_SAFE
#ifndef __USER_VA0_DISABLE_DEFAULT
#define __USER_VA0_DISABLE_DEFAULT 1
#endif
#ifdef USER_VA0_DISABLE_DEFAULT /* kernel config option overrides */
#undef __USER_VA0_DISABLE_DEFAULT
#define __USER_VA0_DISABLE_DEFAULT USER_VA0_DISABLE_DEFAULT
#endif
int user_va0_disable = __USER_VA0_DISABLE_DEFAULT;
#endif
/*
* macros
*/
/*
* uvm_map_align_va: round down or up virtual address
*/
static __inline void
uvm_map_align_va(vaddr_t *vap, vsize_t align, int topdown)
{
KASSERT(powerof2(align));
if (align != 0 && (*vap & (align - 1)) != 0) {
if (topdown)
*vap = rounddown2(*vap, align);
else
*vap = roundup2(*vap, align);
}
}
/*
* UVM_ET_ISCOMPATIBLE: check some requirements for map entry merging
*/
extern struct vm_map *pager_map;
#define UVM_ET_ISCOMPATIBLE(ent, type, uobj, meflags, \
prot, maxprot, inh, adv, wire) \
((ent)->etype == (type) && \
(((ent)->flags ^ (meflags)) & (UVM_MAP_NOMERGE)) == 0 && \
(ent)->object.uvm_obj == (uobj) && \
(ent)->protection == (prot) && \
(ent)->max_protection == (maxprot) && \
(ent)->inheritance == (inh) && \
(ent)->advice == (adv) && \
(ent)->wired_count == (wire))
/*
* uvm_map_entry_link: insert entry into a map
*
* => map must be locked
*/
#define uvm_map_entry_link(map, after_where, entry) do { \
uvm_mapent_check(entry); \
(map)->nentries++; \
(entry)->prev = (after_where); \
(entry)->next = (after_where)->next; \
(entry)->prev->next = (entry); \
(entry)->next->prev = (entry); \
uvm_rb_insert((map), (entry)); \
} while (/*CONSTCOND*/ 0)
/*
* uvm_map_entry_unlink: remove entry from a map
*
* => map must be locked
*/
#define uvm_map_entry_unlink(map, entry) do { \
KASSERT((entry) != (map)->first_free); \
KASSERT((entry) != (map)->hint); \
uvm_mapent_check(entry); \
(map)->nentries--; \
(entry)->next->prev = (entry)->prev; \
(entry)->prev->next = (entry)->next; \
uvm_rb_remove((map), (entry)); \
} while (/*CONSTCOND*/ 0)
/*
* SAVE_HINT: saves the specified entry as the hint for future lookups.
*
* => map need not be locked.
*/
#define SAVE_HINT(map, check, value) do { \
if ((map)->hint == (check)) \
(map)->hint = (value); \
} while (/*CONSTCOND*/ 0)
/*
* clear_hints: ensure that hints don't point to the entry.
*
* => map must be write-locked.
*/
static void
clear_hints(struct vm_map *map, struct vm_map_entry *ent)
{
SAVE_HINT(map, ent, ent->prev);
if (map->first_free == ent) {
map->first_free = ent->prev;
}
}
/*
* VM_MAP_RANGE_CHECK: check and correct range
*
* => map must at least be read locked
*/
#define VM_MAP_RANGE_CHECK(map, start, end) do { \
if (start < vm_map_min(map)) \
start = vm_map_min(map); \
if (end > vm_map_max(map)) \
end = vm_map_max(map); \
if (start > end) \
start = end; \
} while (/*CONSTCOND*/ 0)
/*
* local prototypes
*/
static struct vm_map_entry *
uvm_mapent_alloc(struct vm_map *, int);
static void uvm_mapent_copy(struct vm_map_entry *, struct vm_map_entry *);
static void uvm_mapent_free(struct vm_map_entry *);
#if defined(DEBUG)
static void _uvm_mapent_check(const struct vm_map_entry *, int);
#define uvm_mapent_check(map) _uvm_mapent_check(map, __LINE__)
#else /* defined(DEBUG) */
#define uvm_mapent_check(e) /* nothing */
#endif /* defined(DEBUG) */
static void uvm_map_entry_unwire(struct vm_map *, struct vm_map_entry *);
static void uvm_map_reference_amap(struct vm_map_entry *, int);
static int uvm_map_space_avail(vaddr_t *, vsize_t, voff_t, vsize_t, int,
int, struct vm_map_entry *);
static void uvm_map_unreference_amap(struct vm_map_entry *, int);
int _uvm_map_sanity(struct vm_map *);
int _uvm_tree_sanity(struct vm_map *);
static vsize_t uvm_rb_maxgap(const struct vm_map_entry *);
#define ROOT_ENTRY(map) ((struct vm_map_entry *)(map)->rb_tree.rbt_root)
#define LEFT_ENTRY(entry) ((struct vm_map_entry *)(entry)->rb_node.rb_left)
#define RIGHT_ENTRY(entry) ((struct vm_map_entry *)(entry)->rb_node.rb_right)
#define PARENT_ENTRY(map, entry) \
(ROOT_ENTRY(map) == (entry) \
? NULL : (struct vm_map_entry *)RB_FATHER(&(entry)->rb_node))
/*
* These get filled in if/when SYSVSHM shared memory code is loaded
*
* We do this with function pointers rather than #ifdef SYSVSHM so the
* SYSVSHM code can be loaded and unloaded
*/
void (*uvm_shmexit)(struct vmspace *) = NULL;
void (*uvm_shmfork)(struct vmspace *, struct vmspace *) = NULL;
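/*
* Red-black tree comparison callbacks. Map entries are keyed by their
* start address; entries within one map must not overlap, which the
* KASSERTs in uvm_map_compare_nodes() check.
*/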
static int
uvm_map_compare_nodes(void *ctx, const void *nparent, const void *nkey)
{
const struct vm_map_entry *eparent = nparent;
const struct vm_map_entry *ekey = nkey;
KASSERT(eparent->start < ekey->start || eparent->start >= ekey->end);
KASSERT(ekey->start < eparent->start || ekey->start >= eparent->end);
if (eparent->start < ekey->start)
return -1;
if (eparent->end >= ekey->start)
return 1;
return 0;
}
static int
uvm_map_compare_key(void *ctx, const void *nparent, const void *vkey)
{
const struct vm_map_entry *eparent = nparent;
const vaddr_t va = *(const vaddr_t *) vkey;
if (eparent->start < va)
return -1;
if (eparent->end >= va)
return 1;
return 0;
}
static const rb_tree_ops_t uvm_map_tree_ops = {
.rbto_compare_nodes = uvm_map_compare_nodes,
.rbto_compare_key = uvm_map_compare_key,
.rbto_node_offset = offsetof(struct vm_map_entry, rb_node),
.rbto_context = NULL
};
/*
* uvm_rb_gap: return the gap size between our entry and next entry.
*/
static inline vsize_t
uvm_rb_gap(const struct vm_map_entry *entry)
{
KASSERT(entry->next != NULL);
return entry->next->start - entry->end;
}
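/*
* Worked example of the gap/maxgap caching (hypothetical addresses):
* with entries [0x1000,0x2000), [0x3000,0x4000) and [0x9000,0xa000),
* the first entry's gap is 0x1000 and the second entry's is 0x5000.
* Each node's maxgap caches the largest gap of itself and its subtree,
* which lets uvm_map_findspace() skip whole subtrees that cannot hold
* a request.
*/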
static vsize_t
uvm_rb_maxgap(const struct vm_map_entry *entry)
{
struct vm_map_entry *child;
vsize_t maxgap = entry->gap;
/*
* We need maxgap to be the largest gap of us or any of our
* descendants. Since each of our children's maxgap is the
* cached value of their largest gap of themselves or their
* descendants, we can just use that value and avoid recursing
* down the tree to calculate it.
*/
if ((child = LEFT_ENTRY(entry)) != NULL && maxgap < child->maxgap)
maxgap = child->maxgap;
if ((child = RIGHT_ENTRY(entry)) != NULL && maxgap < child->maxgap)
maxgap = child->maxgap;
return maxgap;
}
static void
uvm_rb_fixup(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_map_entry *parent;
KASSERT(entry->gap == uvm_rb_gap(entry));
entry->maxgap = uvm_rb_maxgap(entry);
while ((parent = PARENT_ENTRY(map, entry)) != NULL) {
struct vm_map_entry *brother;
vsize_t maxgap = parent->gap;
unsigned int which;
KDASSERT(parent->gap == uvm_rb_gap(parent));
if (maxgap < entry->maxgap)
maxgap = entry->maxgap;
/*
* Since we work towards the root, we know entry's maxgap
* value is OK, but its brothers may now be out-of-date due
* to rebalancing. So refresh it.
*/
which = RB_POSITION(&entry->rb_node) ^ RB_DIR_OTHER;
brother = (struct vm_map_entry *)parent->rb_node.rb_nodes[which];
if (brother != NULL) {
KDASSERT(brother->gap == uvm_rb_gap(brother));
brother->maxgap = uvm_rb_maxgap(brother);
if (maxgap < brother->maxgap)
maxgap = brother->maxgap;
}
parent->maxgap = maxgap;
entry = parent;
}
}
static void
uvm_rb_insert(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_map_entry *ret __diagused;
entry->gap = entry->maxgap = uvm_rb_gap(entry);
if (entry->prev != &map->header)
entry->prev->gap = uvm_rb_gap(entry->prev);
ret = rb_tree_insert_node(&map->rb_tree, entry);
KASSERTMSG(ret == entry,
"uvm_rb_insert: map %p: duplicate entry %p", map, ret);
/*
* If the previous entry is not our immediate left child, then it's an
* ancestor and will be fixed up on the way to the root. We don't
* have to check entry->prev against &map->header since &map->header
* will never be in the tree.
*/
uvm_rb_fixup(map,
LEFT_ENTRY(entry) == entry->prev ? entry->prev : entry);
}
static void
uvm_rb_remove(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_map_entry *prev_parent = NULL, *next_parent = NULL;
/*
* If we are removing an interior node, then an adjacent node will
* be used to replace its position in the tree. Therefore we will
* need to fixup the tree starting at the parent of the replacement
* node. So record their parents for later use.
*/
if (entry->prev != &map->header)
prev_parent = PARENT_ENTRY(map, entry->prev);
if (entry->next != &map->header)
next_parent = PARENT_ENTRY(map, entry->next);
rb_tree_remove_node(&map->rb_tree, entry);
/*
* If the previous node has a new parent, fixup the tree starting
* at the previous node's old parent.
*/
if (entry->prev != &map->header) {
/*
* Update the previous entry's gap due to our absence.
*/
entry->prev->gap = uvm_rb_gap(entry->prev);
uvm_rb_fixup(map, entry->prev);
if (prev_parent != NULL && prev_parent != entry &&
prev_parent != PARENT_ENTRY(map, entry->prev))
uvm_rb_fixup(map, prev_parent);
}
/*
* If the next node has a new parent, fixup the tree starting
* at the next node's old parent.
*/
if (entry->next != &map->header) {
uvm_rb_fixup(map, entry->next);
if (next_parent != NULL && next_parent != entry &&
next_parent != PARENT_ENTRY(map, entry->next))
uvm_rb_fixup(map, next_parent);
}
}
#if defined(DEBUG)
int uvm_debug_check_map = 0;
int uvm_debug_check_rbtree = 0;
#define uvm_map_check(map, name) \
_uvm_map_check((map), (name), __FILE__, __LINE__)
static void
_uvm_map_check(struct vm_map *map, const char *name,
const char *file, int line)
{
if ((uvm_debug_check_map && _uvm_map_sanity(map)) ||
(uvm_debug_check_rbtree && _uvm_tree_sanity(map))) {
panic("uvm_map_check failed: \"%s\" map=%p (%s:%d)",
name, map, file, line);
}
}
#else /* defined(DEBUG) */
#define uvm_map_check(map, name) /* nothing */
#endif /* defined(DEBUG) */
#if defined(DEBUG) || defined(DDB)
int
_uvm_map_sanity(struct vm_map *map)
{
bool first_free_found = false;
bool hint_found = false;
const struct vm_map_entry *e;
struct vm_map_entry *hint = map->hint;
e = &map->header;
for (;;) {
if (map->first_free == e) {
first_free_found = true;
} else if (!first_free_found && e->next->start > e->end) {
printf("first_free %p should be %p\n",
map->first_free, e);
return -1;
}
if (hint == e) {
hint_found = true;
}
e = e->next;
if (e == &map->header) {
break;
}
}
if (!first_free_found) {
printf("stale first_free\n");
return -1;
}
if (!hint_found) {
printf("stale hint\n");
return -1;
}
return 0;
}
int
_uvm_tree_sanity(struct vm_map *map)
{
struct vm_map_entry *tmp, *trtmp;
int n = 0, i = 1;
for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) {
if (tmp->gap != uvm_rb_gap(tmp)) {
printf("%d/%d gap %#lx != %#lx %s\n",
n + 1, map->nentries,
(ulong)tmp->gap, (ulong)uvm_rb_gap(tmp),
tmp->next == &map->header ? "(last)" : "");
goto error;
}
/*
* If any entries are out of order, the unsigned subtraction in
* uvm_rb_gap() wraps around and tmp->gap will likely exceed the
* size of the map.
*/
if (tmp->gap >= vm_map_max(map) - vm_map_min(map)) {
printf("too large gap %zu\n", (size_t)tmp->gap);
goto error;
}
n++;
}
if (n != map->nentries) {
printf("nentries: %d vs %d\n", n, map->nentries);
goto error;
}
trtmp = NULL;
for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) {
if (tmp->maxgap != uvm_rb_maxgap(tmp)) {
printf("maxgap %#lx != %#lx\n",
(ulong)tmp->maxgap,
(ulong)uvm_rb_maxgap(tmp));
goto error;
}
if (trtmp != NULL && trtmp->start >= tmp->start) {
printf("corrupt: 0x%"PRIxVADDR"x >= 0x%"PRIxVADDR"x\n",
trtmp->start, tmp->start);
goto error;
}
trtmp = tmp;
}
for (tmp = map->header.next; tmp != &map->header;
tmp = tmp->next, i++) {
trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_LEFT);
if (trtmp == NULL)
trtmp = &map->header;
if (tmp->prev != trtmp) {
printf("lookup: %d: %p->prev=%p: %p\n",
i, tmp, tmp->prev, trtmp);
goto error;
}
trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_RIGHT);
if (trtmp == NULL)
trtmp = &map->header;
if (tmp->next != trtmp) {
printf("lookup: %d: %p->next=%p: %p\n",
i, tmp, tmp->next, trtmp);
goto error;
}
trtmp = rb_tree_find_node(&map->rb_tree, &tmp->start);
if (trtmp != tmp) {
printf("lookup: %d: %p - %p: %p\n", i, tmp, trtmp,
PARENT_ENTRY(map, tmp));
goto error;
}
}
return (0);
error:
return (-1);
}
#endif /* defined(DEBUG) || defined(DDB) */
/*
* vm_map_lock: acquire an exclusive (write) lock on a map.
*
* => The locking protocol provides for guaranteed upgrade from shared ->
* exclusive by whichever thread currently has the map marked busy.
* See "LOCKING PROTOCOL NOTES" in uvm_map.h. This is horrible; among
* other problems, it defeats any fairness guarantees provided by RW
* locks.
*/
void
vm_map_lock(struct vm_map *map)
{
for (;;) {
rw_enter(&map->lock, RW_WRITER);
if (map->busy == NULL || map->busy == curlwp) {
break;
}
mutex_enter(&map->misc_lock);
rw_exit(&map->lock);
if (map->busy != NULL) {
cv_wait(&map->cv, &map->misc_lock);
}
mutex_exit(&map->misc_lock);
}
map->timestamp++;
}
/*
* vm_map_lock_try: try to lock a map, failing if it is already locked.
*/
bool
vm_map_lock_try(struct vm_map *map)
{
if (!rw_tryenter(&map->lock, RW_WRITER)) {
return false;
}
if (map->busy != NULL) {
rw_exit(&map->lock);
return false;
}
map->timestamp++;
return true;
}
/*
* vm_map_unlock: release an exclusive lock on a map.
*/
void
vm_map_unlock(struct vm_map *map)
{
KASSERT(rw_write_held(&map->lock));
KASSERT(map->busy == NULL || map->busy == curlwp);
rw_exit(&map->lock);
}
/*
* vm_map_unbusy: mark the map as unbusy, and wake any waiters that
* want an exclusive lock.
*/
void
vm_map_unbusy(struct vm_map *map)
{
KASSERT(map->busy == curlwp);
/*
* Safe to clear 'busy' and 'waiters' with only a read lock held:
*
* o they can only be set with a write lock held
* o writers are blocked out with a read or write hold
* o at any time, only one thread owns the set of values
*/
mutex_enter(&map->misc_lock);
map->busy = NULL;
cv_broadcast(&map->cv);
mutex_exit(&map->misc_lock);
}
/*
* vm_map_lock_read: acquire a shared (read) lock on a map.
*/
void
vm_map_lock_read(struct vm_map *map)
{
rw_enter(&map->lock, RW_READER);
}
/*
* vm_map_unlock_read: release a shared lock on a map.
*/
void
vm_map_unlock_read(struct vm_map *map)
{
rw_exit(&map->lock);
}
/*
* vm_map_busy: mark a map as busy.
*
* => the caller must hold the map write locked
*/
void
vm_map_busy(struct vm_map *map)
{
KASSERT(rw_write_held(&map->lock));
KASSERT(map->busy == NULL);
map->busy = curlwp;
}
/*
* vm_map_locked_p: return true if the map is write locked.
*
* => only for debug purposes like KASSERTs.
* => should not be used to verify that a map is not locked.
*/
bool
vm_map_locked_p(struct vm_map *map)
{
return rw_write_held(&map->lock);
}
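/*
* Sketch of the busy protocol (hypothetical caller, not taken from this
* file): a thread that must drop the map lock while keeping other
* writers out marks the map busy first:
*
*	vm_map_lock(map);
*	vm_map_busy(map);
*	vm_map_unlock(map);
*	... work with the map unlocked ...
*	vm_map_lock(map);	(only the busy owner gets back in)
*	vm_map_unbusy(map);
*	vm_map_unlock(map);
*/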
/*
* uvm_mapent_alloc: allocate a map entry
*/
static struct vm_map_entry *
uvm_mapent_alloc(struct vm_map *map, int flags)
{
struct vm_map_entry *me;
int pflags = (flags & UVM_FLAG_NOWAIT) ? PR_NOWAIT : PR_WAITOK;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
me = pool_cache_get(&uvm_map_entry_cache, pflags);
if (__predict_false(me == NULL)) {
return NULL;
}
me->flags = 0;
UVMHIST_LOG(maphist, "<- new entry=%#jx [kentry=%jd]", (uintptr_t)me,
(map == kernel_map), 0, 0);
return me;
}
/*
* uvm_mapent_free: free map entry
*/
static void
uvm_mapent_free(struct vm_map_entry *me)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"<- freeing map entry=%#jx [flags=%#jx]",
(uintptr_t)me, me->flags, 0, 0);
pool_cache_put(&uvm_map_entry_cache, me);
}
/*
* uvm_mapent_copy: copy a map entry, preserving flags
*/
static inline void
uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
{
memcpy(dst, src, sizeof(*dst));
dst->flags = 0;
}
#if defined(DEBUG)
static void
_uvm_mapent_check(const struct vm_map_entry *entry, int line)
{
if (entry->start >= entry->end) {
goto bad;
}
if (UVM_ET_ISOBJ(entry)) {
if (entry->object.uvm_obj == NULL) {
goto bad;
}
} else if (UVM_ET_ISSUBMAP(entry)) {
if (entry->object.sub_map == NULL) {
goto bad;
}
} else {
if (entry->object.uvm_obj != NULL ||
entry->object.sub_map != NULL) {
goto bad;
}
}
if (!UVM_ET_ISOBJ(entry)) {
if (entry->offset != 0) {
goto bad;
}
}
return;
bad:
panic("%s: bad entry %p, line %d", __func__, entry, line);
}
#endif /* defined(DEBUG) */
/*
* uvm_map_entry_unwire: unwire a map entry
*
* => map should be locked by caller
*/
static inline void
uvm_map_entry_unwire(struct vm_map *map, struct vm_map_entry *entry)
{
entry->wired_count = 0;
uvm_fault_unwire_locked(map, entry->start, entry->end);
}
/*
* wrapper for calling amap_ref()
*/
static inline void
uvm_map_reference_amap(struct vm_map_entry *entry, int flags)
{
amap_ref(entry->aref.ar_amap, entry->aref.ar_pageoff,
(entry->end - entry->start) >> PAGE_SHIFT, flags);
}
/*
* wrapper for calling amap_unref()
*/
static inline void
uvm_map_unreference_amap(struct vm_map_entry *entry, int flags)
{
amap_unref(entry->aref.ar_amap, entry->aref.ar_pageoff,
(entry->end - entry->start) >> PAGE_SHIFT, flags);
}
/*
* uvm_map_init: init mapping system at boot time.
*/
void
uvm_map_init(void)
{
/*
* first, init logging system.
*/
UVMHIST_FUNC(__func__);
UVMHIST_LINK_STATIC(maphist);
UVMHIST_LINK_STATIC(pdhist);
UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist,"<starting uvm map system>", 0, 0, 0, 0);
/*
* initialize the global lock for kernel map entry.
*/
mutex_init(&uvm_kentry_lock, MUTEX_DRIVER, IPL_VM);
}
/*
* uvm_map_init_caches: init mapping system caches.
*/
void
uvm_map_init_caches(void)
{
/*
* initialize caches.
*/
pool_cache_bootstrap(&uvm_map_entry_cache, sizeof(struct vm_map_entry),
coherency_unit, 0, PR_LARGECACHE, "vmmpepl", NULL, IPL_NONE, NULL,
NULL, NULL);
}
/*
* clippers
*/
/*
* uvm_mapent_splitadj: adjust map entries for splitting, after uvm_mapent_copy.
*/
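/*
* For example (hypothetical values), splitting an entry covering
* [0x1000, 0x5000) at 0x3000 leaves entry1 as [0x1000, 0x3000) and
* entry2 as [0x3000, 0x5000), with entry2's object offset advanced by
* adj = 0x2000 when the entry is object-backed.
*/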
static void
uvm_mapent_splitadj(struct vm_map_entry *entry1, struct vm_map_entry *entry2,
vaddr_t splitat)
{
vaddr_t adj;
KASSERT(entry1->start < splitat);
KASSERT(splitat < entry1->end);
adj = splitat - entry1->start;
entry1->end = entry2->start = splitat;
if (entry1->aref.ar_amap) {
amap_splitref(&entry1->aref, &entry2->aref, adj);
}
if (UVM_ET_ISSUBMAP(entry1)) {
/* ... unlikely to happen, but play it safe */
uvm_map_reference(entry1->object.sub_map);
} else if (UVM_ET_ISOBJ(entry1)) {
KASSERT(entry1->object.uvm_obj != NULL); /* suppress coverity */
entry2->offset += adj;
if (entry1->object.uvm_obj->pgops &&
entry1->object.uvm_obj->pgops->pgo_reference)
entry1->object.uvm_obj->pgops->pgo_reference(
entry1->object.uvm_obj);
}
}
/*
* uvm_map_clip_start: ensure that the entry begins at or after
* the starting address, if it doesn't we split the entry.
*
* => caller should use UVM_MAP_CLIP_START macro rather than calling
* this directly
* => map must be locked by caller
*/
void
uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry,
vaddr_t start)
{
struct vm_map_entry *new_entry;
/* uvm_map_simplify_entry(map, entry); */ /* XXX */
uvm_map_check(map, "clip_start entry");
uvm_mapent_check(entry);
/*
* Split off the front portion. note that we must insert the new
* entry BEFORE this one, so that this entry has the specified
* starting address.
*/
new_entry = uvm_mapent_alloc(map, 0);
uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
uvm_mapent_splitadj(new_entry, entry, start);
uvm_map_entry_link(map, entry->prev, new_entry);
uvm_map_check(map, "clip_start leave");
}
/*
* uvm_map_clip_end: ensure that the entry ends at or before
* the ending address, if it doesn't we split the entry
*
* => caller should use UVM_MAP_CLIP_END macro rather than calling
* this directly
* => map must be locked by caller
*/
void
uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t end)
{
struct vm_map_entry *new_entry;
uvm_map_check(map, "clip_end entry");
uvm_mapent_check(entry);
/*
* Create a new entry and insert it
* AFTER the specified entry
*/
new_entry = uvm_mapent_alloc(map, 0);
uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
uvm_mapent_splitadj(entry, new_entry, end);
uvm_map_entry_link(map, entry, new_entry);
uvm_map_check(map, "clip_end leave");
}
/*
* M A P - m a i n e n t r y p o i n t
*/
/*
* uvm_map: establish a valid mapping in a map
*
* => assume startp is page aligned.
* => assume size is a multiple of PAGE_SIZE.
* => assume sys_mmap provides enough of a "hint" to have us skip
* over text/data/bss area.
* => map must be unlocked (we will lock it)
* => <uobj,uoffset> value meanings (4 cases):
* [1] <NULL,uoffset> == uoffset is a hint for PMAP_PREFER
* [2] <NULL,UVM_UNKNOWN_OFFSET> == don't PMAP_PREFER
* [3] <uobj,uoffset> == normal mapping
* [4] <uobj,UVM_UNKNOWN_OFFSET> == uvm_map finds offset based on VA
*
* case [4] is for kernel mappings where we don't know the offset until
* we've found a virtual address. note that kernel object offsets are
* always relative to vm_map_min(kernel_map).
*
* => if `align' is non-zero, we align the virtual address to the specified
* alignment.
* this is provided as a mechanism for large pages.
*
* => XXXCDC: need way to map in external amap?
*/
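/*
* Example call (hypothetical caller): establish an anonymous,
* copy-on-write, read/write mapping at a kernel-chosen address:
*
*	vaddr_t va = 0;
*	error = uvm_map(map, &va, size, NULL, UVM_UNKNOWN_OFFSET, 0,
*	    UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_COPY,
*		UVM_ADV_NORMAL, UVM_FLAG_COPYONW));
*/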
int
uvm_map(struct vm_map *map, vaddr_t *startp /* IN/OUT */, vsize_t size,
struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags)
{
struct uvm_map_args args;
struct vm_map_entry *new_entry;
int error;
KASSERT((size & PAGE_MASK) == 0);
KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);
/*
* for pager_map, allocate the new entry first to avoid sleeping
* for memory while we have the map locked.
*/
new_entry = NULL;
if (map == pager_map) {
new_entry = uvm_mapent_alloc(map, (flags & UVM_FLAG_NOWAIT));
if (__predict_false(new_entry == NULL))
return ENOMEM;
}
if (map == pager_map)
flags |= UVM_FLAG_NOMERGE;
error = uvm_map_prepare(map, *startp, size, uobj, uoffset, align,
flags, &args);
if (!error) {
error = uvm_map_enter(map, &args, new_entry);
*startp = args.uma_start;
} else if (new_entry) {
uvm_mapent_free(new_entry);
}
#if defined(DEBUG)
if (!error && VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) {
uvm_km_check_empty(map, *startp, *startp + size);
}
#endif /* defined(DEBUG) */
return error;
}
/*
* uvm_map_prepare:
*
* called with map unlocked.
* on success, returns the map locked.
*/
int
uvm_map_prepare(struct vm_map *map, vaddr_t start, vsize_t size,
struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags,
struct uvm_map_args *args)
{
struct vm_map_entry *prev_entry;
vm_prot_t prot = UVM_PROTECTION(flags);
vm_prot_t maxprot = UVM_MAXPROTECTION(flags);
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, start=%#jx, size=%jx, flags=%#jx)",
(uintptr_t)map, start, size, flags);
UVMHIST_LOG(maphist, " uobj/offset %#jx/%jd", (uintptr_t)uobj,
uoffset,0,0);
/*
* detect a popular device driver bug.
*/
KASSERT(doing_shutdown || curlwp != NULL);
/*
* zero-sized mapping doesn't make any sense.
*/
KASSERT(size > 0);
KASSERT((~flags & (UVM_FLAG_NOWAIT | UVM_FLAG_WAITVA)) != 0);
uvm_map_check(map, "map entry");
/*
* check sanity of protection code
*/
if ((prot & maxprot) != prot) {
UVMHIST_LOG(maphist, "<- prot. failure: prot=%#jx, max=%#jx",
prot, maxprot,0,0);
return EACCES;
}
/*
* figure out where to put new VM range
*/
retry:
if (vm_map_lock_try(map) == false) {
if ((flags & UVM_FLAG_TRYLOCK) != 0) {
return EAGAIN;
}
vm_map_lock(map); /* could sleep here */
}
if (flags & UVM_FLAG_UNMAP) {
KASSERT(flags & UVM_FLAG_FIXED);
KASSERT((flags & UVM_FLAG_NOWAIT) == 0);
/*
* Set prev_entry to what it will need to be after any existing
* entries are removed later in uvm_map_enter().
*/
if (uvm_map_lookup_entry(map, start, &prev_entry)) {
if (start == prev_entry->start)
prev_entry = prev_entry->prev;
else
UVM_MAP_CLIP_END(map, prev_entry, start);
SAVE_HINT(map, map->hint, prev_entry);
}
} else {
prev_entry = uvm_map_findspace(map, start, size, &start,
uobj, uoffset, align, flags);
}
if (prev_entry == NULL) {
unsigned int timestamp;
timestamp = map->timestamp;
UVMHIST_LOG(maphist,"waiting va timestamp=%#jx",
timestamp,0,0,0);
map->flags |= VM_MAP_WANTVA;
vm_map_unlock(map);
/*
* try to reclaim kva and wait until someone does unmap.
* fragile locking here, so we awaken every second to
* recheck the condition.
*/
mutex_enter(&map->misc_lock);
while ((map->flags & VM_MAP_WANTVA) != 0 &&
map->timestamp == timestamp) {
if ((flags & UVM_FLAG_WAITVA) == 0) {
mutex_exit(&map->misc_lock);
UVMHIST_LOG(maphist,
"<- uvm_map_findspace failed!", 0,0,0,0);
return ENOMEM;
} else {
cv_timedwait(&map->cv, &map->misc_lock, hz);
}
}
mutex_exit(&map->misc_lock);
goto retry;
}
#ifdef PMAP_GROWKERNEL
/*
* If the kernel pmap can't map the requested space,
* then allocate more resources for it.
*/
if (map == kernel_map && uvm_maxkaddr < (start + size))
uvm_maxkaddr = pmap_growkernel(start + size);
#endif
UVMMAP_EVCNT_INCR(map_call);
/*
* if uobj is null, then uoffset is either a VAC hint for PMAP_PREFER
* [typically from uvm_map_reserve] or it is UVM_UNKNOWN_OFFSET. in
* either case we want to zero it before storing it in the map entry
* (because it looks strange and confusing when debugging...)
*
* if uobj is not null
* if uoffset is not UVM_UNKNOWN_OFFSET then we have a normal mapping
* and we do not need to change uoffset.
* if uoffset is UVM_UNKNOWN_OFFSET then we need to find the offset
* now (based on the starting address of the map). this case is
* for kernel object mappings where we don't know the offset until
* the virtual address is found (with uvm_map_findspace). the
* offset is the distance we are from the start of the map.
*/
if (uobj == NULL) {
uoffset = 0;
} else {
if (uoffset == UVM_UNKNOWN_OFFSET) {
KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
uoffset = start - vm_map_min(kernel_map);
}
}
args->uma_flags = flags;
args->uma_prev = prev_entry;
args->uma_start = start;
args->uma_size = size;
args->uma_uobj = uobj;
args->uma_uoffset = uoffset;
UVMHIST_LOG(maphist, "<- done!", 0,0,0,0);
return 0;
}
/*
* uvm_map_enter:
*
* called with map locked.
* unlock the map before returning.
*/
int
uvm_map_enter(struct vm_map *map, const struct uvm_map_args *args,
struct vm_map_entry *new_entry)
{
struct vm_map_entry *prev_entry = args->uma_prev;
struct vm_map_entry *dead = NULL, *dead_entries = NULL;
const uvm_flag_t flags = args->uma_flags;
const vm_prot_t prot = UVM_PROTECTION(flags);
const vm_prot_t maxprot = UVM_MAXPROTECTION(flags);
const vm_inherit_t inherit = UVM_INHERIT(flags);
const int amapwaitflag = (flags & UVM_FLAG_NOWAIT) ?
AMAP_EXTEND_NOWAIT : 0;
const int advice = UVM_ADVICE(flags);
vaddr_t start = args->uma_start;
vsize_t size = args->uma_size;
struct uvm_object *uobj = args->uma_uobj;
voff_t uoffset = args->uma_uoffset;
const int kmap = (vm_map_pmap(map) == pmap_kernel());
int merged = 0;
int error;
int newetype;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, start=%#jx, size=%ju, flags=%#jx)",
(uintptr_t)map, start, size, flags);
UVMHIST_LOG(maphist, " uobj/offset %#jx/%jd", (uintptr_t)uobj,
uoffset,0,0);
KASSERT(map->hint == prev_entry); /* bimerge case assumes this */
KASSERT(vm_map_locked_p(map));
KASSERT((flags & (UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP)) !=
(UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP));
if (uobj)
newetype = UVM_ET_OBJ;
else
newetype = 0;
if (flags & UVM_FLAG_COPYONW) {
newetype |= UVM_ET_COPYONWRITE;
if ((flags & UVM_FLAG_OVERLAY) == 0)
newetype |= UVM_ET_NEEDSCOPY;
}
/*
* For mappings with unmap, remove any old entries now. Adding the new
* entry cannot fail because that can only happen if UVM_FLAG_NOWAIT
* is set, and we do not support nowait and unmap together.
*/
if (flags & UVM_FLAG_UNMAP) {
KASSERT(flags & UVM_FLAG_FIXED);
uvm_unmap_remove(map, start, start + size, &dead_entries, 0);
#ifdef DEBUG
struct vm_map_entry *tmp_entry __diagused;
bool rv __diagused;
rv = uvm_map_lookup_entry(map, start, &tmp_entry);
KASSERT(!rv);
KASSERTMSG(prev_entry == tmp_entry,
"args %p prev_entry %p tmp_entry %p",
args, prev_entry, tmp_entry);
#endif
SAVE_HINT(map, map->hint, prev_entry);
}
/*
* try and insert in map by extending previous entry, if possible.
* XXX: we don't try and pull back the next entry. might be useful
* for a stack, but we are currently allocating our stack in advance.
*/
if (flags & UVM_FLAG_NOMERGE)
goto nomerge;
if (prev_entry->end == start &&
prev_entry != &map->header &&
UVM_ET_ISCOMPATIBLE(prev_entry, newetype, uobj, 0,
prot, maxprot, inherit, advice, 0)) {
if (uobj && prev_entry->offset +
(prev_entry->end - prev_entry->start) != uoffset)
goto forwardmerge;
/*
* can't extend a shared amap. note: no need to lock amap to
* look at refs since we don't care about its exact value.
* if it is one (i.e. we have the only reference) it will stay there
*/
if (prev_entry->aref.ar_amap &&
amap_refs(prev_entry->aref.ar_amap) != 1) {
goto forwardmerge;
}
if (prev_entry->aref.ar_amap) {
error = amap_extend(prev_entry, size,
amapwaitflag | AMAP_EXTEND_FORWARDS);
if (error)
goto nomerge;
}
if (kmap) {
UVMMAP_EVCNT_INCR(kbackmerge);
} else {
UVMMAP_EVCNT_INCR(ubackmerge);
}
UVMHIST_LOG(maphist," starting back merge", 0, 0, 0, 0);
/*
* drop our reference to uobj since we are extending a reference
* that we already have (the ref count can not drop to zero).
*/
if (uobj && uobj->pgops->pgo_detach)
uobj->pgops->pgo_detach(uobj);
/*
* Now that we've merged the entries, note that we've grown
* and our gap has shrunk. Then fix the tree.
*/
prev_entry->end += size;
prev_entry->gap -= size;
uvm_rb_fixup(map, prev_entry);
uvm_map_check(map, "map backmerged");
UVMHIST_LOG(maphist,"<- done (via backmerge)!", 0, 0, 0, 0);
merged++;
}
forwardmerge:
if (prev_entry->next->start == (start + size) &&
prev_entry->next != &map->header &&
UVM_ET_ISCOMPATIBLE(prev_entry->next, newetype, uobj, 0,
prot, maxprot, inherit, advice, 0)) {
if (uobj && prev_entry->next->offset != uoffset + size)
goto nomerge;
/*
* can't extend a shared amap. note: no need to lock amap to
* look at refs since we don't care about its exact value.
* if it is one (i.e. we have the only reference) it will stay there.
*
* note that we also can't merge two amaps, so if we
* merged with the previous entry which has an amap,
* and the next entry also has an amap, we give up.
*
* Interesting cases:
* amap, new, amap -> give up second merge (single fwd extend)
* amap, new, none -> double forward extend (extend again here)
* none, new, amap -> double backward extend (done here)
* uobj, new, amap -> single backward extend (done here)
*
* XXX should we attempt to deal with someone refilling
* the deallocated region between two entries that are
* backed by the same amap (ie, arefs is 2, "prev" and
* "next" refer to it, and adding this allocation will
* close the hole, thus restoring arefs to 1 and
* deallocating the "next" vm_map_entry)? -- @@@
*/
if (prev_entry->next->aref.ar_amap &&
(amap_refs(prev_entry->next->aref.ar_amap) != 1 ||
(merged && prev_entry->aref.ar_amap))) {
goto nomerge;
}
if (merged) {
/*
* Try to extend the amap of the previous entry to
* cover the next entry as well. If it doesn't work
* just skip on, don't actually give up, since we've
* already completed the back merge.
*/
if (prev_entry->aref.ar_amap) {
if (amap_extend(prev_entry,
prev_entry->next->end -
prev_entry->next->start,
amapwaitflag | AMAP_EXTEND_FORWARDS))
goto nomerge;
}
/*
* Try to extend the amap of the *next* entry
* back to cover the new allocation *and* the
* previous entry as well (the previous merge
* didn't have an amap already otherwise we
* wouldn't be checking here for an amap). If
* it doesn't work just skip on, again, don't
* actually give up, since we've already
* completed the back merge.
*/
else if (prev_entry->next->aref.ar_amap) {
if (amap_extend(prev_entry->next,
prev_entry->end -
prev_entry->start,
amapwaitflag | AMAP_EXTEND_BACKWARDS))
goto nomerge;
}
} else {
/*
* Pull the next entry's amap backwards to cover this
* new allocation.
*/
if (prev_entry->next->aref.ar_amap) {
error = amap_extend(prev_entry->next, size,
amapwaitflag | AMAP_EXTEND_BACKWARDS);
if (error)
goto nomerge;
}
}
if (merged) {
if (kmap) {
UVMMAP_EVCNT_DECR(kbackmerge);
UVMMAP_EVCNT_INCR(kbimerge);
} else {
UVMMAP_EVCNT_DECR(ubackmerge);
UVMMAP_EVCNT_INCR(ubimerge);
}
} else {
if (kmap) {
UVMMAP_EVCNT_INCR(kforwmerge);
} else {
UVMMAP_EVCNT_INCR(uforwmerge);
}
}
UVMHIST_LOG(maphist," starting forward merge", 0, 0, 0, 0);
/*
* drop our reference to uobj since we are extending a reference
* that we already have (the ref count can not drop to zero).
*/
if (uobj && uobj->pgops->pgo_detach)
uobj->pgops->pgo_detach(uobj);
if (merged) {
dead = prev_entry->next;
prev_entry->end = dead->end;
uvm_map_entry_unlink(map, dead);
if (dead->aref.ar_amap != NULL) {
prev_entry->aref = dead->aref;
dead->aref.ar_amap = NULL;
}
} else {
prev_entry->next->start -= size;
if (prev_entry != &map->header) {
prev_entry->gap -= size;
KASSERT(prev_entry->gap == uvm_rb_gap(prev_entry));
uvm_rb_fixup(map, prev_entry);
}
if (uobj)
prev_entry->next->offset = uoffset;
}
uvm_map_check(map, "map forwardmerged");
UVMHIST_LOG(maphist,"<- done forwardmerge", 0, 0, 0, 0);
merged++;
}
nomerge:
if (!merged) {
UVMHIST_LOG(maphist," allocating new map entry", 0, 0, 0, 0);
if (kmap) {
UVMMAP_EVCNT_INCR(knomerge);
} else {
UVMMAP_EVCNT_INCR(unomerge);
}
/*
* allocate new entry and link it in.
*/
if (new_entry == NULL) {
new_entry = uvm_mapent_alloc(map,
(flags & UVM_FLAG_NOWAIT));
if (__predict_false(new_entry == NULL)) {
error = ENOMEM;
goto done;
}
}
new_entry->start = start;
new_entry->end = new_entry->start + size;
new_entry->object.uvm_obj = uobj;
new_entry->offset = uoffset;
new_entry->etype = newetype;
if (flags & UVM_FLAG_NOMERGE) {
new_entry->flags |= UVM_MAP_NOMERGE;
}
new_entry->protection = prot;
new_entry->max_protection = maxprot;
new_entry->inheritance = inherit;
new_entry->wired_count = 0;
new_entry->advice = advice;
if (flags & UVM_FLAG_OVERLAY) {
/*
* to_add: for BSS we overallocate a little since we
* are likely to extend
*/
vaddr_t to_add = (flags & UVM_FLAG_AMAPPAD) ?
UVM_AMAP_CHUNK << PAGE_SHIFT : 0;
struct vm_amap *amap = amap_alloc(size, to_add,
(flags & UVM_FLAG_NOWAIT));
if (__predict_false(amap == NULL)) {
error = ENOMEM;
goto done;
}
new_entry->aref.ar_pageoff = 0;
new_entry->aref.ar_amap = amap;
} else {
new_entry->aref.ar_pageoff = 0;
new_entry->aref.ar_amap = NULL;
}
uvm_map_entry_link(map, prev_entry, new_entry);
/*
* Update the free space hint
*/
if ((map->first_free == prev_entry) &&
(prev_entry->end >= new_entry->start))
map->first_free = new_entry;
new_entry = NULL;
}
map->size += size;
UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
error = 0;
done:
vm_map_unlock(map);
if (new_entry) {
uvm_mapent_free(new_entry);
}
if (dead) {
KDASSERT(merged);
uvm_mapent_free(dead);
}
if (dead_entries)
uvm_unmap_detach(dead_entries, 0);
return error;
}
/*
* uvm_map_lookup_entry_bytree: lookup an entry in tree
*/
static inline bool
uvm_map_lookup_entry_bytree(struct vm_map *map, vaddr_t address,
struct vm_map_entry **entry /* OUT */)
{
struct vm_map_entry *prev = &map->header;
struct vm_map_entry *cur = ROOT_ENTRY(map);
while (cur) {
UVMMAP_EVCNT_INCR(mlk_treeloop);
if (address >= cur->start) {
if (address < cur->end) {
*entry = cur;
return true;
}
prev = cur;
cur = RIGHT_ENTRY(cur);
} else
cur = LEFT_ENTRY(cur);
}
*entry = prev;
return false;
}
/*
* uvm_map_lookup_entry: find map entry at or before an address
*
* => map must at least be read-locked by caller
* => entry is returned in "entry"
* => return value is true if address is in the returned entry
*/
bool
uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
struct vm_map_entry **entry /* OUT */)
{
struct vm_map_entry *cur;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,addr=%#jx,ent=%#jx)",
(uintptr_t)map, address, (uintptr_t)entry, 0);
/*
* make a quick check to see if we are already looking at
* the entry we want (which is usually the case). note also
* that we don't need to save the hint here... it is the
* same hint (unless we are at the header, in which case the
* hint didn't buy us anything anyway).
*/
cur = map->hint;
UVMMAP_EVCNT_INCR(mlk_call);
if (cur != &map->header && address >= cur->start && cur->end > address) {
UVMMAP_EVCNT_INCR(mlk_hint);
*entry = cur;
UVMHIST_LOG(maphist,"<- got it via hint (%#jx)",
(uintptr_t)cur, 0, 0, 0);
uvm_mapent_check(*entry);
return (true);
}
uvm_map_check(map, __func__);
/*
* lookup in the tree.
*/
UVMMAP_EVCNT_INCR(mlk_tree);
if (__predict_true(uvm_map_lookup_entry_bytree(map, address, entry))) {
SAVE_HINT(map, map->hint, *entry);
UVMHIST_LOG(maphist,"<- search got it (%#jx)",
(uintptr_t)cur, 0, 0, 0);
KDASSERT((*entry)->start <= address);
KDASSERT(address < (*entry)->end);
uvm_mapent_check(*entry);
return (true);
}
SAVE_HINT(map, map->hint, *entry);
UVMHIST_LOG(maphist,"<- failed!",0,0,0,0);
KDASSERT((*entry) == &map->header || (*entry)->end <= address);
KDASSERT((*entry)->next == &map->header ||
address < (*entry)->next->start);
return (false);
}
/*
* See if the range between start and start + length fits in the gap
* between entry->end and entry->next->start. Returns 1 if it fits,
* 0 if it doesn't fit, and -1 if the address wraps around.
*/
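/*
* Example of the UVM_FLAG_COLORMATCH adjustment (hypothetical values):
* with 4 page colors (colormask 3) and a requested color of 2, a hint
* on page 13 (color 1) is moved to page 14 bottom-up, or back to page
* 10 topdown, before the fit check below.
*/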
static int
uvm_map_space_avail(vaddr_t *start, vsize_t length, voff_t uoffset,
vsize_t align, int flags, int topdown, struct vm_map_entry *entry)
{
vaddr_t end;
#ifdef PMAP_PREFER
/*
* push start address forward as needed to avoid VAC alias problems.
* we only do this if a valid offset is specified.
*/
if (uoffset != UVM_UNKNOWN_OFFSET)
PMAP_PREFER(uoffset, start, length, topdown);
#endif
if ((flags & UVM_FLAG_COLORMATCH) != 0) {
KASSERT(align < uvmexp.ncolors);
if (uvmexp.ncolors > 1) {
const u_int colormask = uvmexp.colormask;
const u_int colorsize = colormask + 1;
vaddr_t hint = atop(*start);
const u_int color = hint & colormask;
if (color != align) {
hint -= color; /* adjust to color boundary */
KASSERT((hint & colormask) == 0);
if (topdown) {
if (align > color)
hint -= colorsize;
} else {
if (align < color)
hint += colorsize;
}
*start = ptoa(hint + align); /* adjust to color */
}
}
} else {
KASSERT(powerof2(align));
uvm_map_align_va(start, align, topdown);
/*
* XXX Should we PMAP_PREFER() here again?
* eh...i think we're okay
*/
}
/*
* Find the end of the proposed new region. Be sure we didn't
* wrap around the address; if so, we lose. Otherwise, if the
* proposed new region fits before the next entry, we win.
*/
end = *start + length;
if (end < *start)
return (-1);
if (entry->next->start >= end && *start >= entry->end)
return (1);
return (0);
}
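/*
* uvm_findspace_invariants: assert that uvm_map_findspace()'s search is
* monotonic, i.e. the candidate hint only moves down for topdown maps
* and only moves up otherwise.
*/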
static void
uvm_findspace_invariants(struct vm_map *map, vaddr_t orig_hint, vaddr_t length,
struct uvm_object *uobj, voff_t uoffset, vsize_t align, int flags,
vaddr_t hint, struct vm_map_entry *entry, int line)
{
const int topdown = map->flags & VM_MAP_TOPDOWN;
KASSERTMSG( topdown || hint >= orig_hint,
"map=%p hint=%#"PRIxVADDR" orig_hint=%#"PRIxVADDR
" length=%#"PRIxVSIZE" uobj=%p uoffset=%#llx align=%"PRIxVSIZE
" flags=%#x entry=%p (uvm_map_findspace line %d)",
map, hint, orig_hint,
length, uobj, (unsigned long long)uoffset, align,
flags, entry, line);
#ifndef __sh3__ /* XXXRO: kern/51254 */
KASSERTMSG(!topdown || hint <= orig_hint,
#else
if (__predict_false(!(!topdown || hint <= orig_hint)))
printf(
#endif
"map=%p hint=%#"PRIxVADDR" orig_hint=%#"PRIxVADDR
" length=%#"PRIxVSIZE" uobj=%p uoffset=%#llx align=%"PRIxVSIZE
" flags=%#x entry=%p (uvm_map_findspace line %d)",
map, hint, orig_hint,
length, uobj, (unsigned long long)uoffset, align,
flags, entry, line);
}
/*
* uvm_map_findspace: find "length" sized space in "map".
*
* => "hint" is a hint about where we want it, unless UVM_FLAG_FIXED is
* set in "flags" (in which case we insist on using "hint").
* => "result" is VA returned
* => uobj/uoffset are to be used to handle VAC alignment, if required
* => if "align" is non-zero, we attempt to align to that value.
* => caller must at least have read-locked map
* => returns NULL on failure, or pointer to prev. map entry if success
* => note this is a cross between the old vm_map_findspace and vm_map_find
*/
struct vm_map_entry *
uvm_map_findspace(struct vm_map *map, vaddr_t hint, vsize_t length,
vaddr_t *result /* OUT */, struct uvm_object *uobj, voff_t uoffset,
vsize_t align, int flags)
{
#define INVARIANTS() \
uvm_findspace_invariants(map, orig_hint, length, uobj, uoffset, align,\
flags, hint, entry, __LINE__)
struct vm_map_entry *entry = NULL;
struct vm_map_entry *child, *prev, *tmp;
vaddr_t orig_hint __diagused;
const int topdown = map->flags & VM_MAP_TOPDOWN;
int avail;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, hint=%#jx, len=%ju, flags=%#jx...",
(uintptr_t)map, hint, length, flags);
UVMHIST_LOG(maphist, " uobj=%#jx, uoffset=%#jx, align=%#jx)",
(uintptr_t)uobj, uoffset, align, 0);
KASSERT((flags & UVM_FLAG_COLORMATCH) != 0 || powerof2(align));
KASSERT((flags & UVM_FLAG_COLORMATCH) == 0 || align < uvmexp.ncolors);
KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);
uvm_map_check(map, "map_findspace entry");
/*
* Clamp the hint to the VM map's min/max address, and remember the
* clamped original hint. If we are aligning, then we
* may have to try again with no alignment constraint if we
* fail the first time.
*
* We use the original hint to verify later that the search has
* been monotonic -- that is, nonincreasing or nondecreasing,
* according to topdown or !topdown respectively. But the
* clamping is not monotonic.
*/
if (hint < vm_map_min(map)) { /* check ranges ... */
if (flags & UVM_FLAG_FIXED) {
UVMHIST_LOG(maphist,"<- VA below map range",0,0,0,0);
return (NULL);
}
hint = vm_map_min(map);
}
if (hint > vm_map_max(map)) {
UVMHIST_LOG(maphist,"<- VA %#jx > range [%#jx->%#jx]",
hint, vm_map_min(map), vm_map_max(map), 0);
return (NULL);
}
orig_hint = hint;
INVARIANTS();
UVMHIST_LOG(maphist,"<- VA %#jx vs range [%#jx->%#jx]",
hint, vm_map_min(map), vm_map_max(map), 0);
/*
* hint may not be aligned properly; we need to round it up or down
* before proceeding further.
*/
if ((flags & UVM_FLAG_COLORMATCH) == 0) {
uvm_map_align_va(&hint, align, topdown);
INVARIANTS();
}
UVMHIST_LOG(maphist,"<- VA %#jx vs range [%#jx->%#jx]",
hint, vm_map_min(map), vm_map_max(map), 0);
/*
* Look for the first possible address; if there's already
* something at this address, we have to start after it.
*/
/*
* @@@: there are four, no, eight cases to consider.
*
* 0: found, fixed, bottom up -> fail
* 1: found, fixed, top down -> fail
* 2: found, not fixed, bottom up -> start after entry->end,
* loop up
* 3: found, not fixed, top down -> start before entry->start,
* loop down
* 4: not found, fixed, bottom up -> check entry->next->start, fail
* 5: not found, fixed, top down -> check entry->next->start, fail
* 6: not found, not fixed, bottom up -> check entry->next->start,
* loop up
* 7: not found, not fixed, top down -> check entry->next->start,
* loop down
*
* as you can see, it reduces to roughly five cases, and that
* adding top down mapping only adds one unique case (without
* it, there would be four cases).
*/
if ((flags & UVM_FLAG_FIXED) == 0 &&
hint == (topdown ? vm_map_max(map) : vm_map_min(map))) {
/*
* The uvm_map_findspace algorithm is monotonic -- for
* topdown VM it starts with a high hint and returns a
* lower free address; for !topdown VM it starts with a
* low hint and returns a higher free address. As an
* optimization, start with the first (highest for
* topdown, lowest for !topdown) free address.
*
* XXX This `optimization' probably doesn't actually do
* much in practice unless userland explicitly passes
* the VM map's minimum or maximum address, which
* varies from machine to machine (VM_MAX/MIN_ADDRESS,
* e.g. 0x7fbfdfeff000 on amd64 but 0xfffffffff000 on
* aarch64) and may vary according to other factors
* like sysctl vm.user_va0_disable. In particular, if
* the user specifies 0 as a hint to mmap, then mmap
* will choose a default address which is usually _not_
* VM_MAX/MIN_ADDRESS but something else instead like
* VM_MAX_ADDRESS - stack size - guard page overhead,
* in which case this branch is never hit.
*
* In fact, this branch appears to have been broken for
* two decades between when topdown was introduced in
* ~2003 and when it was adapted to handle the topdown
* case without violating the monotonicity assertion in
* 2022. Maybe Someone^TM should either ditch the
* optimization or find a better way to do it.
*/
entry = map->first_free;
} else {
if (uvm_map_lookup_entry(map, hint, &entry)) {
/* "hint" address already in use ... */
if (flags & UVM_FLAG_FIXED) {
UVMHIST_LOG(maphist, "<- fixed & VA in use",
0, 0, 0, 0);
return (NULL);
}
if (topdown)
/* Start from lower gap. */
entry = entry->prev;
} else if (flags & UVM_FLAG_FIXED) {
if (entry->next->start >= hint + length &&
hint + length > hint)
goto found;
/* "hint" address is a gap, but it is too small */
UVMHIST_LOG(maphist, "<- fixed mapping failed",
0, 0, 0, 0);
return (NULL); /* only one shot at it ... */
} else {
/*
* See if given hint fits in this gap.
*/
avail = uvm_map_space_avail(&hint, length,
uoffset, align, flags, topdown, entry);
INVARIANTS();
switch (avail) {
case 1:
goto found;
case -1:
goto wraparound;
}
if (topdown) {
/*
* Still there is a chance to fit
* if hint > entry->end.
*/
} else {
/* Start from higher gap. */
entry = entry->next;
if (entry == &map->header)
goto notfound;
goto nextgap;
}
}
}
/*
* Note that the UVM_FLAG_FIXED case is already handled.
*/
KDASSERT((flags & UVM_FLAG_FIXED) == 0);
/* Try to find the space in the red-black tree */
/* Check slot before any entry */
if (topdown) {
KASSERTMSG(entry->next->start >= vm_map_min(map),
"map=%p entry=%p entry->next=%p"
" entry->next->start=0x%"PRIxVADDR" min=0x%"PRIxVADDR,
map, entry, entry->next,
entry->next->start, vm_map_min(map));
if (length > entry->next->start - vm_map_min(map))
hint = vm_map_min(map); /* XXX goto wraparound? */
else
hint = entry->next->start - length;
KASSERT(hint >= vm_map_min(map));
} else {
hint = entry->end;
}
INVARIANTS();
avail = uvm_map_space_avail(&hint, length, uoffset, align, flags,
topdown, entry);
INVARIANTS();
switch (avail) {
case 1:
goto found;
case -1:
goto wraparound;
}
nextgap:
KDASSERT((flags & UVM_FLAG_FIXED) == 0);
/* If there is not enough space in the whole tree, we fail */
tmp = ROOT_ENTRY(map);
if (tmp == NULL || tmp->maxgap < length)
goto notfound;
prev = NULL; /* previous candidate */
/* Find an entry close to hint that has enough space */
for (; tmp;) {
KASSERT(tmp->next->start == tmp->end + tmp->gap);
if (topdown) {
if (tmp->next->start < hint + length &&
(prev == NULL || tmp->end > prev->end)) {
if (tmp->gap >= length)
prev = tmp;
else if ((child = LEFT_ENTRY(tmp)) != NULL
&& child->maxgap >= length)
prev = tmp;
}
} else {
if (tmp->end >= hint &&
(prev == NULL || tmp->end < prev->end)) {
if (tmp->gap >= length)
prev = tmp;
else if ((child = RIGHT_ENTRY(tmp)) != NULL
&& child->maxgap >= length)
prev = tmp;
}
}
if (tmp->next->start < hint + length)
child = RIGHT_ENTRY(tmp);
else if (tmp->end > hint)
child = LEFT_ENTRY(tmp);
else {
if (tmp->gap >= length)
break;
if (topdown)
child = LEFT_ENTRY(tmp);
else
child = RIGHT_ENTRY(tmp);
}
if (child == NULL || child->maxgap < length)
break;
tmp = child;
}
if (tmp != NULL && tmp->start < hint && hint < tmp->next->start) {
/*
* Check if the entry that we found satisfies the
* space requirement
*/
if (topdown) {
if (hint > tmp->next->start - length)
hint = tmp->next->start - length;
} else {
if (hint < tmp->end)
hint = tmp->end;
}
INVARIANTS();
avail = uvm_map_space_avail(&hint, length, uoffset, align,
flags, topdown, tmp);
INVARIANTS();
switch (avail) {
case 1:
entry = tmp;
goto found;
case -1:
goto wraparound;
}
if (tmp->gap >= length)
goto listsearch;
}
if (prev == NULL)
goto notfound;
if (topdown) {
KASSERT(orig_hint >= prev->next->start - length ||
prev->next->start - length > prev->next->start);
hint = prev->next->start - length;
} else {
KASSERT(orig_hint <= prev->end);
hint = prev->end;
}
INVARIANTS();
avail = uvm_map_space_avail(&hint, length, uoffset, align,
flags, topdown, prev);
INVARIANTS();
switch (avail) {
case 1:
entry = prev;
goto found;
case -1:
goto wraparound;
}
if (prev->gap >= length)
goto listsearch;
if (topdown)
tmp = LEFT_ENTRY(prev);
else
tmp = RIGHT_ENTRY(prev);
for (;;) {
KASSERT(tmp);
KASSERTMSG(tmp->maxgap >= length,
"tmp->maxgap=0x%"PRIxVSIZE" length=0x%"PRIxVSIZE,
tmp->maxgap, length);
if (topdown)
child = RIGHT_ENTRY(tmp);
else
child = LEFT_ENTRY(tmp);
if (child && child->maxgap >= length) {
tmp = child;
continue;
}
if (tmp->gap >= length)
break;
if (topdown)
tmp = LEFT_ENTRY(tmp);
else
tmp = RIGHT_ENTRY(tmp);
}
if (topdown) {
KASSERT(orig_hint >= tmp->next->start - length ||
tmp->next->start - length > tmp->next->start);
hint = tmp->next->start - length;
} else {
KASSERT(orig_hint <= tmp->end);
hint = tmp->end;
}
INVARIANTS();
avail = uvm_map_space_avail(&hint, length, uoffset, align,
flags, topdown, tmp);
INVARIANTS();
switch (avail) {
case 1:
entry = tmp;
goto found;
case -1:
goto wraparound;
}
/*
* The tree fails to find an entry because of offset or alignment
* restrictions. Search the list instead.
*/
listsearch:
/*
* Look through the rest of the map, trying to fit a new region in
* the gap between existing regions, or after the very last region.
* note: entry->end = base VA of current gap,
* entry->next->start = VA of end of current gap
*/
INVARIANTS();
for (;;) {
/* Update hint for current gap. */
hint = topdown ? entry->next->start - length : entry->end;
INVARIANTS();
/* See if it fits. */
avail = uvm_map_space_avail(&hint, length, uoffset, align,
flags, topdown, entry);
INVARIANTS();
switch (avail) {
case 1:
goto found;
case -1:
goto wraparound;
}
/* Advance to next/previous gap */
if (topdown) {
if (entry == &map->header) {
UVMHIST_LOG(maphist, "<- failed (off start)",
0,0,0,0);
goto notfound;
}
entry = entry->prev;
} else {
entry = entry->next;
if (entry == &map->header) {
UVMHIST_LOG(maphist, "<- failed (off end)",
0,0,0,0);
goto notfound;
}
}
}
found:
SAVE_HINT(map, map->hint, entry);
*result = hint;
UVMHIST_LOG(maphist,"<- got it! (result=%#jx)", hint, 0,0,0);
INVARIANTS();
KASSERT(entry->end <= hint);
KASSERT(hint + length <= entry->next->start);
return (entry);
wraparound:
UVMHIST_LOG(maphist, "<- failed (wrap around)", 0,0,0,0);
return (NULL);
notfound:
UVMHIST_LOG(maphist, "<- failed (notfound)", 0,0,0,0);
return (NULL);
#undef INVARIANTS
}
/*
* U N M A P - m a i n h e l p e r f u n c t i o n s
*/
/*
* uvm_unmap_remove: remove mappings from a vm_map (from "start" up to "stop")
*
* => caller must check alignment and size
* => map must be locked by caller
* => we return a list of map entries that we've removed from the map
* in "entry_list"
*/
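/*
* Typical caller sketch (roughly what uvm_unmap() does): remove the
* entries with the map locked, then drop the references with the map
* unlocked:
*
*	vm_map_lock(map);
*	uvm_unmap_remove(map, start, end, &dead_entries, 0);
*	vm_map_unlock(map);
*	if (dead_entries != NULL)
*		uvm_unmap_detach(dead_entries, 0);
*/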
void
uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
struct vm_map_entry **entry_list /* OUT */, int flags)
{
struct vm_map_entry *entry, *first_entry, *next;
vaddr_t len;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx, start=%#jx, end=%#jx)",
(uintptr_t)map, start, end, 0);
VM_MAP_RANGE_CHECK(map, start, end);
uvm_map_check(map, "unmap_remove entry");
/*
* find first entry
*/
if (uvm_map_lookup_entry(map, start, &first_entry) == true) {
/* clip and go... */
entry = first_entry;
UVM_MAP_CLIP_START(map, entry, start);
/* critical! prevents stale hint */
SAVE_HINT(map, entry, entry->prev);
} else {
entry = first_entry->next;
}
/*
* save the free space hint
*/
if (map->first_free != &map->header &&
map->first_free->start >= start)
map->first_free = entry->prev;
/*
* note: we now re-use first_entry for a different task. we remove
* a number of map entries from the map and save them in a linked
* list headed by "first_entry". once we remove them from the map
* the caller should unlock the map and drop the references to the
* backing objects [c.f. uvm_unmap_detach]. the object is to
* separate unmapping from reference dropping. why?
* [1] the map has to be locked for unmapping
* [2] the map need not be locked for reference dropping
* [3] dropping references may trigger pager I/O, and if we hit
* a pager that does synchronous I/O we may have to wait for it.
* [4] we would like all waiting for I/O to occur with maps unlocked
* so that we don't block other threads.
*/
first_entry = NULL;
*entry_list = NULL;
/*
* break up the area into map entry sized regions and unmap. note
* that all mappings have to be removed before we can even consider
* dropping references to amaps or VM objects (otherwise we could end
* up with a mapping to a page on the free list which would be very bad)
*/
while ((entry != &map->header) && (entry->start < end)) {
KASSERT((entry->flags & UVM_MAP_STATIC) == 0);
UVM_MAP_CLIP_END(map, entry, end);
next = entry->next;
len = entry->end - entry->start;
/*
* unwire before removing addresses from the pmap; otherwise
* unwiring will put the entries back into the pmap (XXX).
*/
if (VM_MAPENT_ISWIRED(entry)) {
uvm_map_entry_unwire(map, entry);
}
if (flags & UVM_FLAG_VAONLY) {
/* nothing */
} else if ((map->flags & VM_MAP_PAGEABLE) == 0) {
/*
* if the map is non-pageable, any pages mapped there
* must be wired and entered with pmap_kenter_pa(),
* and we should free any such pages immediately.
* this is mostly used for kmem_map.
*/
KASSERT(vm_map_pmap(map) == pmap_kernel());
uvm_km_pgremove_intrsafe(map, entry->start, entry->end);
} else if (UVM_ET_ISOBJ(entry) &&
UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
panic("%s: kernel object %p %p\n",
__func__, map, entry);
} else if (UVM_ET_ISOBJ(entry) || entry->aref.ar_amap) {
/*
* remove mappings the standard way. lock object
* and/or amap to ensure vm_page state does not
* change while in pmap_remove().
*/
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
uvm_map_lock_entry(entry, RW_WRITER);
#else
uvm_map_lock_entry(entry, RW_READER);
#endif
pmap_remove(map->pmap, entry->start, entry->end);
/*
* note: if map is dying, leave pmap_update() for
* later. if the map is to be reused (exec) then
* pmap_update() will be called. if the map is
* being disposed of (exit) then pmap_destroy()
* will be called.
*/
if ((map->flags & VM_MAP_DYING) == 0) {
pmap_update(vm_map_pmap(map));
} else {
KASSERT(vm_map_pmap(map) != pmap_kernel());
}
uvm_map_unlock_entry(entry);
}
#if defined(UVMDEBUG)
/*
* check if there are any remaining mappings,
* which would be a bug in the caller.
*/
vaddr_t va;
for (va = entry->start; va < entry->end;
va += PAGE_SIZE) {
if (pmap_extract(vm_map_pmap(map), va, NULL)) {
panic("%s: %#"PRIxVADDR" has mapping",
__func__, va);
}
}
if (VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) {
uvm_km_check_empty(map, entry->start, entry->end);
}
#endif /* defined(UVMDEBUG) */
/*
* remove entry from map and put it on our list of entries
* that we've nuked. then go to next entry.
*/
UVMHIST_LOG(maphist, " removed map entry %#jx",
(uintptr_t)entry, 0, 0, 0);
/* critical! prevents stale hint */
SAVE_HINT(map, entry, entry->prev);
uvm_map_entry_unlink(map, entry);
KASSERT(map->size >= len);
map->size -= len;
entry->prev = NULL;
entry->next = first_entry;
first_entry = entry;
entry = next;
}
uvm_map_check(map, "unmap_remove leave");
/*
* now we've cleaned up the map and are ready for the caller to drop
* references to the mapped objects.
*/
*entry_list = first_entry;
UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
if (map->flags & VM_MAP_WANTVA) {
mutex_enter(&map->misc_lock);
map->flags &= ~VM_MAP_WANTVA;
cv_broadcast(&map->cv);
mutex_exit(&map->misc_lock);
}
}
/*
* uvm_unmap_detach: drop references in a chain of map entries
*
* => we will free the map entries as we traverse the list.
*/
void
uvm_unmap_detach(struct vm_map_entry *first_entry, int flags)
{
struct vm_map_entry *next_entry;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
while (first_entry) {
KASSERT(!VM_MAPENT_ISWIRED(first_entry));
UVMHIST_LOG(maphist,
" detach %#jx: amap=%#jx, obj=%#jx, submap?=%jd",
(uintptr_t)first_entry,
(uintptr_t)first_entry->aref.ar_amap,
(uintptr_t)first_entry->object.uvm_obj,
UVM_ET_ISSUBMAP(first_entry));
/*
* drop reference to amap, if we've got one
*/
if (first_entry->aref.ar_amap)
uvm_map_unreference_amap(first_entry, flags);
/*
* drop reference to our backing object, if we've got one
*/
KASSERT(!UVM_ET_ISSUBMAP(first_entry));
if (UVM_ET_ISOBJ(first_entry) &&
first_entry->object.uvm_obj->pgops->pgo_detach) {
(*first_entry->object.uvm_obj->pgops->pgo_detach)
(first_entry->object.uvm_obj);
}
next_entry = first_entry->next;
uvm_mapent_free(first_entry);
first_entry = next_entry;
}
UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
}
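/*
 * Example (illustrative sketch; example_unmap_range() is hypothetical and
 * nothing calls it): the two-phase protocol described above -- remove the
 * entries with the map locked, then drop the object/amap references with
 * the map unlocked.  uvm_unmap1() later in this file wraps the same steps.
 */
#if 0
static void
example_unmap_range(struct vm_map *map, vaddr_t start, vaddr_t end)
{
	struct vm_map_entry *dead_entries;

	vm_map_lock(map);		/* [1] unmapping needs the lock */
	uvm_unmap_remove(map, start, end, &dead_entries, 0);
	vm_map_unlock(map);		/* [2]-[4] drop references unlocked */
	if (dead_entries != NULL)
		uvm_unmap_detach(dead_entries, 0);
}
#endif /* illustrative sketch */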
/*
* E X T R A C T I O N F U N C T I O N S
*/
/*
* uvm_map_reserve: reserve space in a vm_map for future use.
*
* => we reserve space in a map by putting a dummy map entry in the
* map (dummy means obj=NULL, amap=NULL, prot=VM_PROT_NONE)
* => map should be unlocked (we will write lock it)
* => we return true if we were able to reserve space
* => XXXCDC: should be inline?
*/
int
uvm_map_reserve(struct vm_map *map, vsize_t size,
vaddr_t offset /* hint for pmap_prefer */,
vsize_t align /* alignment */,
vaddr_t *raddr /* IN:hint, OUT: reserved VA */,
uvm_flag_t flags /* UVM_FLAG_FIXED or UVM_FLAG_COLORMATCH or 0 */)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, size=%#jx, offset=%#jx, addr=%#jx)",
(uintptr_t)map, size, offset, (uintptr_t)raddr);
size = round_page(size);
/*
* reserve some virtual space.
*/
if (uvm_map(map, raddr, size, NULL, offset, align,
UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE,
UVM_ADV_RANDOM, UVM_FLAG_NOMERGE|flags)) != 0) {
UVMHIST_LOG(maphist, "<- done (no VM)", 0,0,0,0);
return (false);
}
UVMHIST_LOG(maphist, "<- done (*raddr=%#jx)", *raddr,0,0,0);
return (true);
}
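/*
 * Example (illustrative sketch; example_reserve_va() is hypothetical):
 * reserving virtual space for later use.  The PROT_NONE placeholder entry
 * installed here is what uvm_map_replace() below exchanges for real
 * mappings, e.g. on behalf of uvm_map_extract().
 */
#if 0
static int
example_reserve_va(struct vm_map *map, vsize_t len, vaddr_t *vap)
{
	vaddr_t va = vm_map_min(map);	/* starting hint only */

	if (!uvm_map_reserve(map, len, 0, 0, &va, 0))
		return ENOMEM;
	*vap = va;
	return 0;
}
#endif /* illustrative sketch */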
/*
* uvm_map_replace: replace a reserved (blank) area of memory with
* real mappings.
*
* => caller must WRITE-LOCK the map
* => we return true if replacement was a success
* => we expect the newents chain to have nnewents entries on it and
* we expect newents->prev to point to the last entry on the list
* => note newents is allowed to be NULL
*/
static int
uvm_map_replace(struct vm_map *map, vaddr_t start, vaddr_t end,
struct vm_map_entry *newents, int nnewents, vsize_t nsize,
struct vm_map_entry **oldentryp)
{
struct vm_map_entry *oldent, *last;
uvm_map_check(map, "map_replace entry");
/*
* first find the blank map entry at the specified address
*/
if (!uvm_map_lookup_entry(map, start, &oldent)) {
return (false);
}
/*
* check to make sure we have a proper blank entry
*/
if (end < oldent->end) {
UVM_MAP_CLIP_END(map, oldent, end);
}
if (oldent->start != start || oldent->end != end ||
oldent->object.uvm_obj != NULL || oldent->aref.ar_amap != NULL) {
return (false);
}
#ifdef DIAGNOSTIC
/*
* sanity check the newents chain
*/
{
struct vm_map_entry *tmpent = newents;
int nent = 0;
vsize_t sz = 0;
vaddr_t cur = start;
while (tmpent) {
nent++;
sz += tmpent->end - tmpent->start;
if (tmpent->start < cur)
panic("uvm_map_replace1");
if (tmpent->start >= tmpent->end || tmpent->end > end) {
panic("uvm_map_replace2: "
"tmpent->start=%#"PRIxVADDR
", tmpent->end=%#"PRIxVADDR
", end=%#"PRIxVADDR,
tmpent->start, tmpent->end, end);
}
cur = tmpent->end;
if (tmpent->next) {
if (tmpent->next->prev != tmpent)
panic("uvm_map_replace3");
} else {
if (newents->prev != tmpent)
panic("uvm_map_replace4");
}
tmpent = tmpent->next;
}
if (nent != nnewents)
panic("uvm_map_replace5");
if (sz != nsize)
panic("uvm_map_replace6");
}
#endif
/*
* map entry is a valid blank! replace it. (this does all the
* work of map entry link/unlink...).
*/
if (newents) {
last = newents->prev;
/* critical: flush stale hints out of map */
SAVE_HINT(map, map->hint, newents);
if (map->first_free == oldent)
map->first_free = last;
last->next = oldent->next;
last->next->prev = last;
/* Fix RB tree */
uvm_rb_remove(map, oldent);
newents->prev = oldent->prev;
newents->prev->next = newents;
map->nentries = map->nentries + (nnewents - 1);
/* Fixup the RB tree */
{
int i;
struct vm_map_entry *tmp;
tmp = newents;
for (i = 0; i < nnewents && tmp; i++) {
uvm_rb_insert(map, tmp);
tmp = tmp->next;
}
}
} else {
/* NULL list of new entries: just remove the old one */
clear_hints(map, oldent);
uvm_map_entry_unlink(map, oldent);
}
map->size -= end - start - nsize;
uvm_map_check(map, "map_replace leave");
/*
* now we can free the old blank entry and return.
*/
*oldentryp = oldent;
return (true);
}
/*
* uvm_map_extract: extract a mapping from a map and put it somewhere
* (maybe removing the old mapping)
*
* => maps should be unlocked (we will write lock them)
* => returns 0 on success, error code otherwise
* => start must be page aligned
* => len must be page sized
* => flags:
* UVM_EXTRACT_REMOVE: remove mappings from srcmap
* UVM_EXTRACT_CONTIG: abort if unmapped area (advisory only)
* UVM_EXTRACT_QREF: for a temporary extraction do quick obj refs
* UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
* UVM_EXTRACT_PROT_ALL: set prot to UVM_PROT_ALL as we go
* >>>NOTE: if you set REMOVE, you are not allowed to use CONTIG or QREF!<<<
* >>>NOTE: QREF's must be unmapped via the QREF path, thus should only
* be used from within the kernel in a kernel level map <<<
*/
int
uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
struct vm_map *dstmap, vaddr_t *dstaddrp, int flags)
{
vaddr_t dstaddr, end, newend, oldoffset, fudge, orig_fudge;
struct vm_map_entry *chain, *endchain, *entry, *orig_entry, *newentry,
*deadentry, *oldentry;
struct vm_map_entry *resentry = NULL; /* a dummy reservation entry */
vsize_t elen __unused;
int nchain, error, copy_ok;
vsize_t nsize;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(srcmap=%#jx,start=%#jx, len=%#jx",
(uintptr_t)srcmap, start, len, 0);
UVMHIST_LOG(maphist," ...,dstmap=%#jx, flags=%#jx)",
(uintptr_t)dstmap, flags, 0, 0);
/*
* step 0: sanity check: start must be on a page boundary, length
* must be page sized. can't ask for CONTIG/QREF if you asked for
* REMOVE.
*/
KASSERTMSG((start & PAGE_MASK) == 0, "start=0x%"PRIxVADDR, start);
KASSERTMSG((len & PAGE_MASK) == 0, "len=0x%"PRIxVADDR, len);
KASSERT((flags & UVM_EXTRACT_REMOVE) == 0 ||
(flags & (UVM_EXTRACT_CONTIG|UVM_EXTRACT_QREF)) == 0);
/*
* step 1: reserve space in the target map for the extracted area
*/
if ((flags & UVM_EXTRACT_RESERVED) == 0) {
dstaddr = vm_map_min(dstmap);
if (!uvm_map_reserve(dstmap, len, start,
atop(start) & uvmexp.colormask, &dstaddr,
UVM_FLAG_COLORMATCH))
return (ENOMEM);
KASSERT((atop(start ^ dstaddr) & uvmexp.colormask) == 0);
*dstaddrp = dstaddr; /* pass address back to caller */
UVMHIST_LOG(maphist, " dstaddr=%#jx", dstaddr,0,0,0);
} else {
dstaddr = *dstaddrp;
}
/*
* step 2: setup for the extraction process loop by init'ing the
* map entry chain, locking src map, and looking up the first useful
* entry in the map.
*/
end = start + len;
newend = dstaddr + len;
chain = endchain = NULL;
nchain = 0;
nsize = 0;
vm_map_lock(srcmap);
if (uvm_map_lookup_entry(srcmap, start, &entry)) {
/* "start" is within an entry */
if (flags & UVM_EXTRACT_QREF) {
/*
* for quick references we don't clip the entry, so
* the entry may map space "before" the starting
* virtual address... this is the "fudge" factor
* (which can be non-zero only the first time
* through the "while" loop in step 3).
*/
fudge = start - entry->start;
} else {
/*
* normal reference: we clip the map to fit (thus
* fudge is zero)
*/
UVM_MAP_CLIP_START(srcmap, entry, start);
SAVE_HINT(srcmap, srcmap->hint, entry->prev);
fudge = 0;
}
} else {
/* "start" is not within an entry ... skip to next entry */
if (flags & UVM_EXTRACT_CONTIG) {
error = EINVAL;
goto bad; /* definite hole here ... */
}
entry = entry->next;
fudge = 0;
}
/* save values from srcmap for step 6 */
orig_entry = entry;
orig_fudge = fudge;
/*
* step 3: now start looping through the map entries, extracting
* as we go.
*/
while (entry->start < end && entry != &srcmap->header) {
/* if we are not doing a quick reference, clip it */
if ((flags & UVM_EXTRACT_QREF) == 0)
UVM_MAP_CLIP_END(srcmap, entry, end);
/* clear needs_copy (allow chunking) */
if (UVM_ET_ISNEEDSCOPY(entry)) {
amap_copy(srcmap, entry,
AMAP_COPY_NOWAIT|AMAP_COPY_NOMERGE, start, end);
if (UVM_ET_ISNEEDSCOPY(entry)) { /* failed? */
error = ENOMEM;
goto bad;
}
/* amap_copy could clip (during chunk)! update fudge */
if (fudge) {
fudge = start - entry->start;
orig_fudge = fudge;
}
}
/* calculate the offset of this from "start" */
oldoffset = (entry->start + fudge) - start;
/* allocate a new map entry */
newentry = uvm_mapent_alloc(dstmap, 0);
if (newentry == NULL) {
error = ENOMEM;
goto bad;
}
/* set up new map entry */
newentry->next = NULL;
newentry->prev = endchain;
newentry->start = dstaddr + oldoffset;
newentry->end =
newentry->start + (entry->end - (entry->start + fudge));
if (newentry->end > newend || newentry->end < newentry->start)
newentry->end = newend;
newentry->object.uvm_obj = entry->object.uvm_obj;
if (newentry->object.uvm_obj) {
if (newentry->object.uvm_obj->pgops->pgo_reference)
newentry->object.uvm_obj->pgops->
pgo_reference(newentry->object.uvm_obj);
newentry->offset = entry->offset + fudge;
} else {
newentry->offset = 0;
}
newentry->etype = entry->etype;
if (flags & UVM_EXTRACT_PROT_ALL) {
newentry->protection = newentry->max_protection =
UVM_PROT_ALL;
} else {
newentry->protection = (flags & UVM_EXTRACT_FIXPROT) ?
entry->max_protection : entry->protection;
newentry->max_protection = entry->max_protection;
}
newentry->inheritance = entry->inheritance;
newentry->wired_count = 0;
newentry->aref.ar_amap = entry->aref.ar_amap;
if (newentry->aref.ar_amap) {
newentry->aref.ar_pageoff =
entry->aref.ar_pageoff + (fudge >> PAGE_SHIFT);
uvm_map_reference_amap(newentry, AMAP_SHARED |
((flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0));
} else {
newentry->aref.ar_pageoff = 0;
}
newentry->advice = entry->advice;
if ((flags & UVM_EXTRACT_QREF) != 0) {
newentry->flags |= UVM_MAP_NOMERGE;
}
/* now link it on the chain */
nchain++;
nsize += newentry->end - newentry->start;
if (endchain == NULL) {
chain = endchain = newentry;
} else {
endchain->next = newentry;
endchain = newentry;
}
/* end of 'while' loop! */
if ((flags & UVM_EXTRACT_CONTIG) && entry->end < end &&
(entry->next == &srcmap->header ||
entry->next->start != entry->end)) {
error = EINVAL;
goto bad;
}
entry = entry->next;
fudge = 0;
}
/*
* step 4: close off chain (in format expected by uvm_map_replace)
*/
if (chain)
chain->prev = endchain;
/*
* step 5: attempt to lock the dest map so we can pmap_copy.
* note usage of copy_ok:
* 1 => dstmap locked, pmap_copy ok, and we "replace" here (step 5)
* 0 => dstmap unlocked, NO pmap_copy, and we will "replace" in step 7
*/
if (srcmap == dstmap || vm_map_lock_try(dstmap) == true) {
copy_ok = 1;
if (!uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
nchain, nsize, &resentry)) {
if (srcmap != dstmap)
vm_map_unlock(dstmap);
error = EIO;
goto bad;
}
} else {
copy_ok = 0;
/* replace deferred until step 7 */
}
/*
* step 6: traverse the srcmap a second time to do the following:
* - if we got a lock on the dstmap do pmap_copy
* - if UVM_EXTRACT_REMOVE remove the entries
* we make use of orig_entry and orig_fudge (saved in step 2)
*/
if (copy_ok || (flags & UVM_EXTRACT_REMOVE)) {
/* purge possible stale hints from srcmap */
if (flags & UVM_EXTRACT_REMOVE) {
SAVE_HINT(srcmap, srcmap->hint, orig_entry->prev);
if (srcmap->first_free != &srcmap->header &&
srcmap->first_free->start >= start)
srcmap->first_free = orig_entry->prev;
}
entry = orig_entry;
fudge = orig_fudge;
deadentry = NULL; /* for UVM_EXTRACT_REMOVE */
while (entry->start < end && entry != &srcmap->header) {
if (copy_ok) {
oldoffset = (entry->start + fudge) - start;
elen = MIN(end, entry->end) -
(entry->start + fudge);
pmap_copy(dstmap->pmap, srcmap->pmap,
dstaddr + oldoffset, elen,
entry->start + fudge);
}
/* we advance "entry" in the following if statement */
if (flags & UVM_EXTRACT_REMOVE) {
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
uvm_map_lock_entry(entry, RW_WRITER);
#else
uvm_map_lock_entry(entry, RW_READER);
#endif
pmap_remove(srcmap->pmap, entry->start,
entry->end);
uvm_map_unlock_entry(entry);
oldentry = entry; /* save entry */
entry = entry->next; /* advance */
uvm_map_entry_unlink(srcmap, oldentry);
/* add to dead list */
oldentry->next = deadentry;
deadentry = oldentry;
} else {
entry = entry->next; /* advance */
}
/* end of 'while' loop */
fudge = 0;
}
pmap_update(srcmap->pmap);
/*
* unlock dstmap. we will dispose of deadentry in
* step 7 if needed
*/
if (copy_ok && srcmap != dstmap)
vm_map_unlock(dstmap);
} else {
deadentry = NULL;
}
/*
* step 7: we are done with the source map, unlock. if copy_ok
* is 0 then we have not replaced the dummy mapping in dstmap yet
* and we need to do so now.
*/
vm_map_unlock(srcmap);
if ((flags & UVM_EXTRACT_REMOVE) && deadentry)
uvm_unmap_detach(deadentry, 0); /* dispose of old entries */
/* now do the replacement if we didn't do it in step 5 */
if (copy_ok == 0) {
vm_map_lock(dstmap);
error = uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
nchain, nsize, &resentry);
vm_map_unlock(dstmap);
if (error == false) {
error = EIO;
goto bad2;
}
}
if (resentry != NULL)
uvm_mapent_free(resentry);
return (0);
/*
* bad: failure recovery
*/
bad:
vm_map_unlock(srcmap);
bad2: /* src already unlocked */
if (chain)
uvm_unmap_detach(chain,
(flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0);
if (resentry != NULL)
uvm_mapent_free(resentry);
if ((flags & UVM_EXTRACT_RESERVED) == 0) {
uvm_unmap(dstmap, dstaddr, dstaddr+len); /* ??? */
}
return (error);
}
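/*
 * Example (illustrative sketch; example_borrow_user_range() is hypothetical):
 * a temporary, kernel-side extraction using quick references, as the QREF
 * notes above require.  The source range must be page aligned, and the
 * borrowed range must later be torn down via the QREF path (unmap with the
 * kernel map locked and detach the dead entries with AMAP_REFALL).
 */
#if 0
static int
example_borrow_user_range(struct vm_map *umap, vaddr_t uva, vsize_t len,
vaddr_t *kvap)
{
	KASSERT((uva & PAGE_MASK) == 0);
	KASSERT((len & PAGE_MASK) == 0);
	return uvm_map_extract(umap, uva, len, kernel_map, kvap,
	    UVM_EXTRACT_QREF | UVM_EXTRACT_CONTIG | UVM_EXTRACT_FIXPROT);
}
#endif /* illustrative sketch */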
/* end of extraction functions */
/*
* uvm_map_submap: punch down part of a map into a submap
*
* => only the kernel_map is allowed to be submapped
* => the purpose of submapping is to break up the locking granularity
* of a larger map
* => the range specified must have been mapped previously with a uvm_map()
* call [with uobj==NULL] to create a blank map entry in the main map.
* [And it had better still be blank!]
* => maps which contain submaps should never be copied or forked.
* => to remove a submap, use uvm_unmap() on the main map
* and then uvm_map_deallocate() the submap.
* => main map must be unlocked.
* => submap must have been init'd and have a zero reference count.
* [need not be locked as we don't actually reference it]
*/
int
uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
struct vm_map *submap)
{
struct vm_map_entry *entry;
int error;
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (uvm_map_lookup_entry(map, start, &entry)) {
UVM_MAP_CLIP_START(map, entry, start);
UVM_MAP_CLIP_END(map, entry, end); /* to be safe */
} else {
entry = NULL;
}
if (entry != NULL &&
entry->start == start && entry->end == end &&
entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
!UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
entry->etype |= UVM_ET_SUBMAP;
entry->object.sub_map = submap;
entry->offset = 0;
uvm_map_reference(submap);
error = 0;
} else {
error = EINVAL;
}
vm_map_unlock(map);
return error;
}
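/*
 * Example (illustrative sketch; example_make_submap() is hypothetical and
 * the pmap-sharing step is an assumption of this sketch): the order of
 * operations the comments above call for -- create a blank entry in the
 * parent with uvm_map() and uobj == NULL, initialize the submap with
 * uvm_map_setup(), then plug it in with uvm_map_submap().  In practice the
 * kernel normally reaches this code through uvm_km_suballoc().
 */
#if 0
static int
example_make_submap(struct vm_map *parent, vsize_t size, int mapflags,
struct vm_map *submap)
{
	vaddr_t base = vm_map_min(parent);
	vsize_t len = round_page(size);

	/* step 1: blank PROT_NONE entry in the parent map */
	if (uvm_map(parent, &base, len, NULL, 0, 0,
	    UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE,
	    UVM_ADV_RANDOM, UVM_FLAG_NOMERGE)) != 0)
		return ENOMEM;

	/* step 2: init the submap; it shares the parent's pmap (assumption) */
	uvm_map_setup(submap, base, base + len, mapflags);
	pmap_reference(vm_map_pmap(parent));
	submap->pmap = vm_map_pmap(parent);

	/* step 3: punch the blank entry down into the submap */
	return uvm_map_submap(parent, base, base + len, submap);
}
#endif /* illustrative sketch */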
/*
* uvm_map_protect_user: change map protection on behalf of the user.
* Enforces PAX settings as necessary.
*/
int
uvm_map_protect_user(struct lwp *l, vaddr_t start, vaddr_t end,
vm_prot_t new_prot)
{
int error;
if ((error = PAX_MPROTECT_VALIDATE(l, new_prot)))
return error;
return uvm_map_protect(&l->l_proc->p_vmspace->vm_map, start, end,
new_prot, false);
}
/*
* uvm_map_protect: change map protection
*
* => set_max means set max_protection.
* => map must be unlocked.
*/
#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
~VM_PROT_WRITE : VM_PROT_ALL)
int
uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_prot_t new_prot, bool set_max)
{
struct vm_map_entry *current, *entry;
int error = 0;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_prot=%#jx)",
(uintptr_t)map, start, end, new_prot);
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (uvm_map_lookup_entry(map, start, &entry)) {
UVM_MAP_CLIP_START(map, entry, start);
} else {
entry = entry->next;
}
/*
* make a first pass to check for protection violations.
*/
current = entry;
while ((current != &map->header) && (current->start < end)) {
if (UVM_ET_ISSUBMAP(current)) {
error = EINVAL;
goto out;
}
if ((new_prot & current->max_protection) != new_prot) {
error = EACCES;
goto out;
}
/*
* Don't allow VM_PROT_EXECUTE to be set on entries that
* point to vnodes that are associated with a NOEXEC file
* system.
*/
if (UVM_ET_ISOBJ(current) &&
UVM_OBJ_IS_VNODE(current->object.uvm_obj)) {
struct vnode *vp =
(struct vnode *) current->object.uvm_obj;
if ((new_prot & VM_PROT_EXECUTE) != 0 &&
(vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
error = EACCES;
goto out;
}
}
current = current->next;
}
/* go back and fix up protections (no need to clip this time). */
current = entry;
while ((current != &map->header) && (current->start < end)) {
vm_prot_t old_prot;
UVM_MAP_CLIP_END(map, current, end);
old_prot = current->protection;
if (set_max)
current->protection =
(current->max_protection = new_prot) & old_prot;
else
current->protection = new_prot;
/*
* update physical map if necessary. worry about copy-on-write
* here -- CHECK THIS XXX
*/
if (current->protection != old_prot) {
/* update pmap! */
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
uvm_map_lock_entry(current, RW_WRITER);
#else
uvm_map_lock_entry(current, RW_READER);
#endif
pmap_protect(map->pmap, current->start, current->end,
current->protection & MASK(current));
uvm_map_unlock_entry(current);
/*
* If this entry points at a vnode, and the
* protection includes VM_PROT_EXECUTE, mark
* the vnode as VEXECMAP.
*/
if (UVM_ET_ISOBJ(current)) {
struct uvm_object *uobj =
current->object.uvm_obj;
if (UVM_OBJ_IS_VNODE(uobj) &&
(current->protection & VM_PROT_EXECUTE)) {
vn_markexec((struct vnode *) uobj);
}
}
}
/*
* If the map is configured to lock any future mappings,
* wire this entry now if the old protection was VM_PROT_NONE
* and the new protection is not VM_PROT_NONE.
*/
if ((map->flags & VM_MAP_WIREFUTURE) != 0 &&
VM_MAPENT_ISWIRED(current) == 0 &&
old_prot == VM_PROT_NONE &&
new_prot != VM_PROT_NONE) {
/*
* We must call pmap_update() here because the
* pmap_protect() call above might have removed some
* pmap entries and uvm_map_pageable() might create
* some new pmap entries that rely on the prior
* removals being completely finished.
*/
pmap_update(map->pmap);
if (uvm_map_pageable(map, current->start,
current->end, false,
UVM_LK_ENTER|UVM_LK_EXIT) != 0) {
/*
* If locking the entry fails, remember the
* error if it's the first one. Note we
* still continue setting the protection in
* the map, but will return the error
* condition regardless.
*
* XXX Ignore what the actual error is,
* XXX just call it a resource shortage
* XXX so that it doesn't get confused
* XXX what uvm_map_protect() itself would
* XXX normally return.
*/
error = ENOMEM;
}
}
current = current->next;
}
pmap_update(map->pmap);
out:
vm_map_unlock(map);
UVMHIST_LOG(maphist, "<- done, error=%jd",error,0,0,0);
return error;
}
#undef MASK
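/*
 * Example (illustrative sketch; example_make_readonly() is hypothetical):
 * a caller acting for a user process goes through uvm_map_protect_user()
 * so the PAX mprotect checks apply; a kernel-internal caller may use
 * uvm_map_protect() directly on an unlocked map.
 */
#if 0
static int
example_make_readonly(struct vm_map *map, vaddr_t start, vaddr_t end)
{
	/* change the current protection only; max_protection is untouched */
	return uvm_map_protect(map, start, end, VM_PROT_READ, false);
}
#endif /* illustrative sketch */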
/*
* uvm_map_inherit: set inheritance code for range of addrs in map.
*
* => map must be unlocked
* => note that the inherit code is used during a "fork". see fork
* code for details.
*/
int
uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_inherit_t new_inheritance)
{
struct vm_map_entry *entry, *temp_entry;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_inh=%#jx)",
(uintptr_t)map, start, end, new_inheritance);
switch (new_inheritance) {
case MAP_INHERIT_NONE:
case MAP_INHERIT_COPY:
case MAP_INHERIT_SHARE:
case MAP_INHERIT_ZERO:
break;
default:
UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
return EINVAL;
}
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (uvm_map_lookup_entry(map, start, &temp_entry)) {
entry = temp_entry;
UVM_MAP_CLIP_START(map, entry, start);
} else {
entry = temp_entry->next;
}
while ((entry != &map->header) && (entry->start < end)) {
UVM_MAP_CLIP_END(map, entry, end);
entry->inheritance = new_inheritance;
entry = entry->next;
}
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
return 0;
}
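/*
 * Example (illustrative sketch; example_fork_zero() is hypothetical):
 * minherit(2)-style use -- after this call a future fork gives the child
 * a zero-filled mapping for the range instead of a shared or copied one.
 */
#if 0
static int
example_fork_zero(struct proc *p, vaddr_t start, vaddr_t end)
{
	return uvm_map_inherit(&p->p_vmspace->vm_map, start, end,
	    MAP_INHERIT_ZERO);
}
#endif /* illustrative sketch */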
/*
* uvm_map_advice: set advice code for range of addrs in map.
*
* => map must be unlocked
*/
int
uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
{
struct vm_map_entry *entry, *temp_entry;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_adv=%#jx)",
(uintptr_t)map, start, end, new_advice);
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (uvm_map_lookup_entry(map, start, &temp_entry)) {
entry = temp_entry;
UVM_MAP_CLIP_START(map, entry, start);
} else {
entry = temp_entry->next;
}
/*
* XXXJRT: disallow holes?
*/
while ((entry != &map->header) && (entry->start < end)) {
UVM_MAP_CLIP_END(map, entry, end);
switch (new_advice) {
case MADV_NORMAL:
case MADV_RANDOM:
case MADV_SEQUENTIAL:
/* nothing special here */
break;
default:
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
return EINVAL;
}
entry->advice = new_advice;
entry = entry->next;
}
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
return 0;
}
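/*
 * Example (illustrative sketch; example_advise_random() is hypothetical):
 * madvise(2)-style use, telling UVM that the range will be accessed in a
 * random pattern so sequential read-ahead is not worthwhile.
 */
#if 0
static int
example_advise_random(struct proc *p, vaddr_t start, vaddr_t end)
{
	return uvm_map_advice(&p->p_vmspace->vm_map, start, end, MADV_RANDOM);
}
#endif /* illustrative sketch */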
/*
* uvm_map_willneed: apply MADV_WILLNEED
*/
int
uvm_map_willneed(struct vm_map *map, vaddr_t start, vaddr_t end)
{
struct vm_map_entry *entry;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx)",
(uintptr_t)map, start, end, 0);
vm_map_lock_read(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (!uvm_map_lookup_entry(map, start, &entry)) {
entry = entry->next;
}
while (entry->start < end) {
struct vm_amap * const amap = entry->aref.ar_amap;
struct uvm_object * const uobj = entry->object.uvm_obj;
KASSERT(entry != &map->header);
KASSERT(start < entry->end);
/*
* For now, we handle only the easy but commonly-requested case.
* ie. start prefetching of backing uobj pages.
*
* XXX It might be useful to pmap_enter() the already-in-core
* pages by inventing a "weak" mode for uvm_fault() which would
* only do the PGO_LOCKED pgo_get().
*/
if (UVM_ET_ISOBJ(entry) && amap == NULL && uobj != NULL) {
off_t offset;
off_t size;
offset = entry->offset;
if (start < entry->start) {
offset += entry->start - start;
}
size = entry->offset + (entry->end - entry->start);
if (entry->end < end) {
size -= end - entry->end;
}
uvm_readahead(uobj, offset, size);
}
entry = entry->next;
}
vm_map_unlock_read(map);
UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
return 0;
}
/*
* uvm_map_pageable: sets the pageability of a range in a map.
*
* => wires map entries. should not be used for transient page locking.
* for that, use uvm_fault_wire()/uvm_fault_unwire() (see uvm_vslock()).
* => regions specified as not pageable require lock-down (wired) memory
* and page tables.
* => map must never be read-locked
* => if islocked is true, map is already write-locked
* => we always unlock the map, since we must downgrade to a read-lock
* to call uvm_fault_wire()
* => XXXCDC: check this and try and clean it up.
*/
int
uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
bool new_pageable, int lockflags)
{
struct vm_map_entry *entry, *start_entry, *failed_entry;
int rv;
#ifdef DIAGNOSTIC
u_int timestamp_save;
#endif
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_pageable=%ju)",
(uintptr_t)map, start, end, new_pageable);
KASSERT(map->flags & VM_MAP_PAGEABLE);
if ((lockflags & UVM_LK_ENTER) == 0)
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
/*
* only one pageability change may take place at one time, since
* uvm_fault_wire assumes it will be called only once for each
* wiring/unwiring. therefore, we have to make sure we're actually
* changing the pageability for the entire region. we do so before
* making any changes.
*/
if (uvm_map_lookup_entry(map, start, &start_entry) == false) {
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (fault)",0,0,0,0);
return EFAULT;
}
entry = start_entry;
if (start == end) { /* nothing required */
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (nothing)",0,0,0,0);
return 0;
}
/*
* handle wiring and unwiring separately.
*/
if (new_pageable) { /* unwire */
UVM_MAP_CLIP_START(map, entry, start);
/*
* unwiring. first ensure that the range to be unwired is
* really wired down and that there are no holes.
*/
while ((entry != &map->header) && (entry->start < end)) {
if (entry->wired_count == 0 ||
(entry->end < end &&
(entry->next == &map->header ||
entry->next->start > entry->end))) {
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
UVMHIST_LOG(maphist, "<- done (INVAL)",0,0,0,0);
return EINVAL;
}
entry = entry->next;
}
/*
* POSIX 1003.1b - a single munlock call unlocks a region,
* regardless of the number of mlock calls made on that
* region.
*/
entry = start_entry;
while ((entry != &map->header) && (entry->start < end)) {
UVM_MAP_CLIP_END(map, entry, end);
if (VM_MAPENT_ISWIRED(entry))
uvm_map_entry_unwire(map, entry);
entry = entry->next;
}
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
return 0;
}
/*
* wire case: in two passes [XXXCDC: ugly block of code here]
*
* 1: holding the write lock, we create any anonymous maps that need
* to be created. then we clip each map entry to the region to
* be wired and increment its wiring count.
*
* 2: we downgrade to a read lock, and call uvm_fault_wire to fault
* in the pages for any newly wired area (wired_count == 1).
*
* downgrading to a read lock for uvm_fault_wire avoids a possible
* deadlock with another thread that may have faulted on one of
* the pages to be wired (it would mark the page busy, blocking
* us, then in turn block on the map lock that we hold). because
* of problems in the recursive lock package, we cannot upgrade
* to a write lock in vm_map_lookup. thus, any actions that
* require the write lock must be done beforehand. because we
* keep the read lock on the map, the copy-on-write status of the
* entries we modify here cannot change.
*/
while ((entry != &map->header) && (entry->start < end)) {
if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
/*
* perform actions of vm_map_lookup that need the
* write lock on the map: create an anonymous map
* for a copy-on-write region, or an anonymous map
* for a zero-fill region. (XXXCDC: submap case
* ok?)
*/
if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */
if (UVM_ET_ISNEEDSCOPY(entry) &&
((entry->max_protection & VM_PROT_WRITE) ||
(entry->object.uvm_obj == NULL))) {
amap_copy(map, entry, 0, start, end);
/* XXXCDC: wait OK? */
}
}
}
UVM_MAP_CLIP_START(map, entry, start);
UVM_MAP_CLIP_END(map, entry, end);
entry->wired_count++;
/*
* Check for holes
*/
if (entry->protection == VM_PROT_NONE ||
(entry->end < end &&
(entry->next == &map->header ||
entry->next->start > entry->end))) {
/*
* found one. amap creation actions do not need to
* be undone, but the wired counts need to be restored.
*/
while (entry != &map->header && entry->end > start) {
entry->wired_count--;
entry = entry->prev;
}
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (INVALID WIRE)",0,0,0,0);
return EINVAL;
}
entry = entry->next;
}
/*
* Pass 2.
*/
#ifdef DIAGNOSTIC
timestamp_save = map->timestamp;
#endif
vm_map_busy(map);
vm_map_unlock(map);
rv = 0;
entry = start_entry;
while (entry != &map->header && entry->start < end) {
if (entry->wired_count == 1) {
rv = uvm_fault_wire(map, entry->start, entry->end,
entry->max_protection, 1);
if (rv) {
/*
* wiring failed. break out of the loop.
* we'll clean up the map below, once we
* have a write lock again.
*/
break;
}
}
entry = entry->next;
}
if (rv) { /* failed? */
/*
* Get back to an exclusive (write) lock.
*/
vm_map_lock(map);
vm_map_unbusy(map);
#ifdef DIAGNOSTIC
if (timestamp_save + 1 != map->timestamp)
panic("uvm_map_pageable: stale map");
#endif
/*
* first drop the wiring count on all the entries
* which haven't actually been wired yet.
*/
failed_entry = entry;
while (entry != &map->header && entry->start < end) {
entry->wired_count--;
entry = entry->next;
}
/*
* now, unwire all the entries that were successfully
* wired above.
*/
entry = start_entry;
while (entry != failed_entry) {
entry->wired_count--;
if (VM_MAPENT_ISWIRED(entry) == 0)
uvm_map_entry_unwire(map, entry);
entry = entry->next;
}
if ((lockflags & UVM_LK_EXIT) == 0)
vm_map_unlock(map);
UVMHIST_LOG(maphist, "<- done (RV=%jd)", rv,0,0,0);
return (rv);
}
if ((lockflags & UVM_LK_EXIT) == 0) {
vm_map_unbusy(map);
} else {
/*
* Get back to an exclusive (write) lock.
*/
vm_map_lock(map);
vm_map_unbusy(map);
}
UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
return 0;
}
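/*
 * Example (illustrative sketch; example_wire_range() is hypothetical):
 * mlock(2)-style wiring of a page-aligned range in a pageable map.
 * Passing new_pageable == true instead would unwire the same range.
 */
#if 0
static int
example_wire_range(struct proc *p, vaddr_t start, vaddr_t end)
{
	return uvm_map_pageable(&p->p_vmspace->vm_map, start, end,
	    false, 0);
}
#endif /* illustrative sketch */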
/*
* uvm_map_pageable_all: special case of uvm_map_pageable - affects
* all mapped regions.
*
* => map must not be locked.
* => if no flags are specified, all regions are unwired.
* => XXXJRT: has some of the same problems as uvm_map_pageable() above.
*/
int
uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
{
struct vm_map_entry *entry, *failed_entry;
vsize_t size;
int rv;
#ifdef DIAGNOSTIC
u_int timestamp_save;
#endif
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,flags=%#jx)", (uintptr_t)map, flags,
0, 0);
KASSERT(map->flags & VM_MAP_PAGEABLE);
vm_map_lock(map);
/*
* handle wiring and unwiring separately.
*/
if (flags == 0) { /* unwire */
/*
* POSIX 1003.1b -- munlockall unlocks all regions,
* regardless of how many times mlockall has been called.
*/
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (VM_MAPENT_ISWIRED(entry))
uvm_map_entry_unwire(map, entry);
}
map->flags &= ~VM_MAP_WIREFUTURE;
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
return 0;
}
if (flags & MCL_FUTURE) {
/*
* must wire all future mappings; remember this.
*/
map->flags |= VM_MAP_WIREFUTURE;
}
if ((flags & MCL_CURRENT) == 0) {
/*
* no more work to do!
*/
UVMHIST_LOG(maphist,"<- done (OK no wire)",0,0,0,0);
vm_map_unlock(map);
return 0;
}
/*
* wire case: in three passes [XXXCDC: ugly block of code here]
*
* 1: holding the write lock, count all pages mapped by non-wired
* entries. if this would cause us to go over our limit, we fail.
*
* 2: still holding the write lock, we create any anonymous maps that
* need to be created. then we increment each entry's wiring count.
*
* 3: we downgrade to a read lock, and call uvm_fault_wire to fault
* in the pages for any newly wired area (wired_count == 1).
*
* downgrading to a read lock for uvm_fault_wire avoids a possible
* deadlock with another thread that may have faulted on one of
* the pages to be wired (it would mark the page busy, blocking
* us, then in turn block on the map lock that we hold). because
* of problems in the recursive lock package, we cannot upgrade
* to a write lock in vm_map_lookup. thus, any actions that
* require the write lock must be done beforehand. because we
* keep the read lock on the map, the copy-on-write status of the
* entries we modify here cannot change.
*/
for (size = 0, entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->protection != VM_PROT_NONE &&
VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
size += entry->end - entry->start;
}
}
if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
vm_map_unlock(map);
return ENOMEM;
}
if (limit != 0 &&
(size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit)) {
vm_map_unlock(map);
return ENOMEM;
}
/*
* Pass 2.
*/
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->protection == VM_PROT_NONE)
continue;
if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
/*
* perform actions of vm_map_lookup that need the
* write lock on the map: create an anonymous map
* for a copy-on-write region, or an anonymous map
* for a zero-fill region. (XXXCDC: submap case
* ok?)
*/
if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */
if (UVM_ET_ISNEEDSCOPY(entry) &&
((entry->max_protection & VM_PROT_WRITE) ||
(entry->object.uvm_obj == NULL))) {
amap_copy(map, entry, 0, entry->start,
entry->end);
/* XXXCDC: wait OK? */
}
}
}
entry->wired_count++;
}
/*
* Pass 3.
*/
#ifdef DIAGNOSTIC
timestamp_save = map->timestamp;
#endif
vm_map_busy(map);
vm_map_unlock(map);
rv = 0;
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->wired_count == 1) {
rv = uvm_fault_wire(map, entry->start, entry->end,
entry->max_protection, 1);
if (rv) {
/*
* wiring failed. break out of the loop.
* we'll clean up the map below, once we
* have a write lock again.
*/
break;
}
}
}
if (rv) {
/*
* Get back an exclusive (write) lock.
*/
vm_map_lock(map);
vm_map_unbusy(map);
#ifdef DIAGNOSTIC
if (timestamp_save + 1 != map->timestamp)
panic("uvm_map_pageable_all: stale map");
#endif
/*
* first drop the wiring count on all the entries
* which haven't actually been wired yet.
*
* Skip VM_PROT_NONE entries like we did above.
*/
failed_entry = entry;
for (/* nothing */; entry != &map->header;
entry = entry->next) {
if (entry->protection == VM_PROT_NONE)
continue;
entry->wired_count--;
}
/*
* now, unwire all the entries that were successfully
* wired above.
*
* Skip VM_PROT_NONE entries like we did above.
*/
for (entry = map->header.next; entry != failed_entry;
entry = entry->next) {
if (entry->protection == VM_PROT_NONE)
continue;
entry->wired_count--;
if (VM_MAPENT_ISWIRED(entry))
uvm_map_entry_unwire(map, entry);
}
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (RV=%jd)", rv,0,0,0);
return (rv);
}
vm_map_unbusy(map);
UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
return 0;
}
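/*
 * Example (illustrative sketch; example_wire_all() is hypothetical):
 * mlockall(2)-style use -- wire everything currently mapped and request
 * that future mappings be wired too, subject to a caller-supplied limit.
 */
#if 0
static int
example_wire_all(struct proc *p, vsize_t limit)
{
	return uvm_map_pageable_all(&p->p_vmspace->vm_map,
	    MCL_CURRENT | MCL_FUTURE, limit);
}
#endif /* illustrative sketch */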
/*
* uvm_map_clean: clean out a map range
*
* => valid flags:
* if (flags & PGO_CLEANIT): dirty pages are cleaned first
* if (flags & PGO_SYNCIO): dirty pages are written synchronously
* if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
* if (flags & PGO_FREE): any cached pages are freed after clean
* => returns an error if any part of the specified range isn't mapped
* => never a need to flush amap layer since the anonymous memory has
* no permanent home, but may deactivate pages there
* => called from sys_msync() and sys_madvise()
* => caller must not have map locked
*/
int
uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
{
struct vm_map_entry *current, *entry;
struct uvm_object *uobj;
struct vm_amap *amap;
struct vm_anon *anon;
struct vm_page *pg;
vaddr_t offset;
vsize_t size;
voff_t uoff;
int error, refs;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,flags=%#jx)",
(uintptr_t)map, start, end, flags);
KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
(PGO_FREE|PGO_DEACTIVATE));
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (!uvm_map_lookup_entry(map, start, &entry)) {
vm_map_unlock(map);
return EFAULT;
}
/*
* Make a first pass to check for holes and wiring problems.
*/
for (current = entry; current->start < end; current = current->next) {
if (UVM_ET_ISSUBMAP(current)) {
vm_map_unlock(map);
return EINVAL;
}
if ((flags & PGO_FREE) != 0 && VM_MAPENT_ISWIRED(entry)) {
vm_map_unlock(map);
return EBUSY;
}
if (end <= current->end) {
break;
}
if (current->end != current->next->start) {
vm_map_unlock(map);
return EFAULT;
}
}
vm_map_busy(map);
vm_map_unlock(map);
error = 0;
for (current = entry; start < end; current = current->next) {
amap = current->aref.ar_amap; /* upper layer */
uobj = current->object.uvm_obj; /* lower layer */
KASSERT(start >= current->start);
/*
* No amap cleaning necessary if:
*
* (1) There's no amap.
*
* (2) We're not deactivating or freeing pages.
*/
if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
goto flush_object;
offset = start - current->start;
size = MIN(end, current->end) - start;
amap_lock(amap, RW_WRITER);
for ( ; size != 0; size -= PAGE_SIZE, offset += PAGE_SIZE) {
anon = amap_lookup(&current->aref, offset);
if (anon == NULL)
continue;
KASSERT(anon->an_lock == amap->am_lock);
pg = anon->an_page;
if (pg == NULL) {
continue;
}
if (pg->flags & PG_BUSY) {
continue;
}
switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
/*
* In these first 3 cases, we just deactivate the page.
*/
case PGO_CLEANIT|PGO_FREE:
case PGO_CLEANIT|PGO_DEACTIVATE:
case PGO_DEACTIVATE:
deactivate_it:
/*
* skip the page if it's loaned or wired,
* since it shouldn't be on a paging queue
* at all in these cases.
*/
if (pg->loan_count != 0 ||
pg->wire_count != 0) {
continue;
}
KASSERT(pg->uanon == anon);
uvm_pagelock(pg);
uvm_pagedeactivate(pg);
uvm_pageunlock(pg);
continue;
case PGO_FREE:
/*
* If there are multiple references to
* the amap, just deactivate the page.
*/
if (amap_refs(amap) > 1)
goto deactivate_it;
/* skip the page if it's wired */
if (pg->wire_count != 0) {
continue;
}
amap_unadd(&current->aref, offset);
refs = --anon->an_ref;
if (refs == 0) {
uvm_anfree(anon);
}
continue;
}
}
amap_unlock(amap);
flush_object:
/*
* flush pages if we've got a valid backing object.
* note that we must always clean object pages before
* freeing them since otherwise we could reveal stale
* data from files.
*/
uoff = current->offset + (start - current->start);
size = MIN(end, current->end) - start;
if (uobj != NULL) {
rw_enter(uobj->vmobjlock, RW_WRITER);
if (uobj->pgops->pgo_put != NULL)
error = (uobj->pgops->pgo_put)(uobj, uoff,
uoff + size, flags | PGO_CLEANIT);
else
error = 0;
}
start += size;
}
vm_map_unbusy(map);
return error;
}
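/*
 * Example (illustrative sketch; example_sync_range() is hypothetical):
 * msync(2)-style use -- synchronously write back any dirty object pages
 * in the range without freeing or deactivating them.
 */
#if 0
static int
example_sync_range(struct proc *p, vaddr_t start, vaddr_t end)
{
	return uvm_map_clean(&p->p_vmspace->vm_map, start, end,
	    PGO_CLEANIT | PGO_SYNCIO);
}
#endif /* illustrative sketch */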
/*
* uvm_map_checkprot: check protection in map
*
* => must allow specified protection in a fully allocated region.
* => map must be read or write locked by caller.
*/
bool
uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_prot_t protection)
{
struct vm_map_entry *entry;
struct vm_map_entry *tmp_entry;
if (!uvm_map_lookup_entry(map, start, &tmp_entry)) {
return (false);
}
entry = tmp_entry;
while (start < end) {
if (entry == &map->header) {
return (false);
}
/*
* no holes allowed
*/
if (start < entry->start) {
return (false);
}
/*
* check protection associated with entry
*/
if ((entry->protection & protection) != protection) {
return (false);
}
start = entry->end;
entry = entry->next;
}
return (true);
}
/*
* uvmspace_alloc: allocate a vmspace structure.
*
* - structure includes vm_map and pmap
* - XXX: no locking on this structure
* - refcnt set to 1, rest must be init'd by caller
*/
struct vmspace *
uvmspace_alloc(vaddr_t vmin, vaddr_t vmax, bool topdown)
{
struct vmspace *vm;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
vm = kmem_alloc(sizeof(*vm), KM_SLEEP);
uvmspace_init(vm, NULL, vmin, vmax, topdown);
UVMHIST_LOG(maphist,"<- done (vm=%#jx)", (uintptr_t)vm, 0, 0, 0);
return (vm);
}
/*
* uvmspace_init: initialize a vmspace structure.
*
* - XXX: no locking on this structure
* - refcnt set to 1, rest must be init'd by caller
*/
void
uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin,
vaddr_t vmax, bool topdown)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(vm=%#jx, pmap=%#jx, vmin=%#jx, vmax=%#jx",
(uintptr_t)vm, (uintptr_t)pmap, vmin, vmax);
UVMHIST_LOG(maphist, " topdown=%ju)", topdown, 0, 0, 0);
memset(vm, 0, sizeof(*vm));
uvm_map_setup(&vm->vm_map, vmin, vmax, VM_MAP_PAGEABLE
| (topdown ? VM_MAP_TOPDOWN : 0)
);
if (pmap)
pmap_reference(pmap);
else
pmap = pmap_create();
vm->vm_map.pmap = pmap;
vm->vm_refcnt = 1;
UVMHIST_LOG(maphist,"<- done",0,0,0,0);
}
/*
* uvmspace_share: share a vmspace between two processes
*
* - used for vfork, threads(?)
*/
void
uvmspace_share(struct proc *p1, struct proc *p2)
{
uvmspace_addref(p1->p_vmspace);
p2->p_vmspace = p1->p_vmspace;
}
#if 0
/*
* uvmspace_unshare: ensure that process "p" has its own, unshared, vmspace
*
* - XXX: no locking on vmspace
*/
void
uvmspace_unshare(struct lwp *l)
{
struct proc *p = l->l_proc;
struct vmspace *nvm, *ovm = p->p_vmspace;
if (ovm->vm_refcnt == 1)
/* nothing to do: vmspace isn't shared in the first place */
return;
/* make a new vmspace, still holding old one */
nvm = uvmspace_fork(ovm);
kpreempt_disable();
pmap_deactivate(l); /* unbind old vmspace */
p->p_vmspace = nvm;
pmap_activate(l); /* switch to new vmspace */
kpreempt_enable();
uvmspace_free(ovm); /* drop reference to old vmspace */
}
#endif
/*
* uvmspace_spawn: a new process has been spawned and needs a vmspace
*/
void
uvmspace_spawn(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown)
{
struct proc *p = l->l_proc;
struct vmspace *nvm;
#ifdef __HAVE_CPU_VMSPACE_EXEC
cpu_vmspace_exec(l, start, end);
#endif
nvm = uvmspace_alloc(start, end, topdown);
kpreempt_disable();
p->p_vmspace = nvm;
pmap_activate(l);
kpreempt_enable();
}
/*
* uvmspace_exec: the process wants to exec a new program
*/
void
uvmspace_exec(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown)
{
struct proc *p = l->l_proc;
struct vmspace *nvm, *ovm = p->p_vmspace;
struct vm_map *map;
int flags;
KASSERT(ovm != NULL);
#ifdef __HAVE_CPU_VMSPACE_EXEC
cpu_vmspace_exec(l, start, end);
#endif
map = &ovm->vm_map;
/*
* see if more than one process is using this vmspace...
*/
if (ovm->vm_refcnt == 1
&& topdown == ((ovm->vm_map.flags & VM_MAP_TOPDOWN) != 0)) {
/*
* if p is the only process using its vmspace then we can safely
* recycle that vmspace for the program that is being exec'd.
* But only if TOPDOWN matches the requested value for the new
* vm space!
*/
/*
* SYSV SHM semantics require us to kill all segments on an exec
*/
if (uvm_shmexit && ovm->vm_shm)
(*uvm_shmexit)(ovm);
/*
* POSIX 1003.1b -- "lock future mappings" is revoked
* when a process execs another program image.
*/
map->flags &= ~VM_MAP_WIREFUTURE;
/*
* now unmap the old program.
*
* XXX set VM_MAP_DYING for the duration, so pmap_update()
* is not called until the pmap has been totally cleared out
* after pmap_remove_all(), or it can confuse some pmap
* implementations. it would be nice to handle this by
* deferring the pmap_update() while it is known the address
* space is not visible to any user LWP other than curlwp,
* but there isn't an elegant way of inferring that right
* now.
*/
flags = pmap_remove_all(map->pmap) ? UVM_FLAG_VAONLY : 0;
map->flags |= VM_MAP_DYING;
uvm_unmap1(map, vm_map_min(map), vm_map_max(map), flags);
map->flags &= ~VM_MAP_DYING;
pmap_update(map->pmap);
KASSERT(map->header.prev == &map->header);
KASSERT(map->nentries == 0);
/*
* resize the map
*/
vm_map_setmin(map, start);
vm_map_setmax(map, end);
} else {
/*
* p's vmspace is being shared, so we can't reuse it for p since
* it is still being used for others. allocate a new vmspace
* for p
*/
nvm = uvmspace_alloc(start, end, topdown);
/*
* install new vmspace and drop our ref to the old one.
*/
kpreempt_disable();
pmap_deactivate(l);
p->p_vmspace = nvm;
pmap_activate(l);
kpreempt_enable();
uvmspace_free(ovm);
}
}
/*
* uvmspace_addref: add a reference to a vmspace.
*/
void
uvmspace_addref(struct vmspace *vm)
{
KASSERT((vm->vm_map.flags & VM_MAP_DYING) == 0);
KASSERT(vm->vm_refcnt > 0);
atomic_inc_uint(&vm->vm_refcnt);
}
/*
* uvmspace_free: free a vmspace data structure
*/
void
uvmspace_free(struct vmspace *vm)
{
struct vm_map_entry *dead_entries;
struct vm_map *map = &vm->vm_map;
int flags;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(vm=%#jx) ref=%jd", (uintptr_t)vm,
vm->vm_refcnt, 0, 0);
membar_release();
if (atomic_dec_uint_nv(&vm->vm_refcnt) > 0)
return;
membar_acquire();
/*
* at this point, there should be no other references to the map.
* delete all of the mappings, then destroy the pmap.
*/
map->flags |= VM_MAP_DYING;
flags = pmap_remove_all(map->pmap) ? UVM_FLAG_VAONLY : 0;
/* Get rid of any SYSV shared memory segments. */
if (uvm_shmexit && vm->vm_shm != NULL)
(*uvm_shmexit)(vm);
if (map->nentries) {
uvm_unmap_remove(map, vm_map_min(map), vm_map_max(map),
&dead_entries, flags);
if (dead_entries != NULL)
uvm_unmap_detach(dead_entries, 0);
}
KASSERT(map->nentries == 0);
KASSERT(map->size == 0);
mutex_destroy(&map->misc_lock);
rw_destroy(&map->lock);
cv_destroy(&map->cv);
pmap_destroy(map->pmap);
kmem_free(vm, sizeof(*vm));
}
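/*
 * Example (illustrative sketch; example_vmspace_lifecycle() is hypothetical):
 * a vmspace starts with a reference count of one; sharers take extra
 * references and the map, pmap and structure are destroyed only when the
 * last reference is dropped.
 */
#if 0
static void
example_vmspace_lifecycle(vaddr_t vmin, vaddr_t vmax)
{
	struct vmspace *vm = uvmspace_alloc(vmin, vmax, true);

	uvmspace_addref(vm);	/* a second user of the address space */
	uvmspace_free(vm);	/* refcnt 2 -> 1, vmspace survives */
	uvmspace_free(vm);	/* last reference: mappings, pmap, vm freed */
}
#endif /* illustrative sketch */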
static struct vm_map_entry *
uvm_mapent_clone(struct vm_map *new_map, struct vm_map_entry *old_entry,
int flags)
{
struct vm_map_entry *new_entry;
new_entry = uvm_mapent_alloc(new_map, 0);
/* old_entry -> new_entry */
uvm_mapent_copy(old_entry, new_entry);
/* new pmap has nothing wired in it */
new_entry->wired_count = 0;
/*
* gain reference to object backing the map (can't
* be a submap, already checked this case).
*/
if (new_entry->aref.ar_amap)
uvm_map_reference_amap(new_entry, flags);
if (new_entry->object.uvm_obj &&
new_entry->object.uvm_obj->pgops->pgo_reference)
new_entry->object.uvm_obj->pgops->pgo_reference(
new_entry->object.uvm_obj);
/* insert entry at end of new_map's entry list */
uvm_map_entry_link(new_map, new_map->header.prev,
new_entry);
return new_entry;
}
/*
* share the mapping: this means we want the old and
* new entries to share amaps and backing objects.
*/
static void
uvm_mapent_forkshared(struct vm_map *new_map, struct vm_map *old_map,
struct vm_map_entry *old_entry)
{
/*
* if the old_entry needs a new amap (due to prev fork)
* then we need to allocate it now so that we have
* something we own to share with the new_entry. [in
* other words, we need to clear needs_copy]
*/
if (UVM_ET_ISNEEDSCOPY(old_entry)) {
/* get our own amap, clears needs_copy */
amap_copy(old_map, old_entry, AMAP_COPY_NOCHUNK,
0, 0);
/* XXXCDC: WAITOK??? */
}
uvm_mapent_clone(new_map, old_entry, AMAP_SHARED);
}
static void
uvm_mapent_forkcopy(struct vm_map *new_map, struct vm_map *old_map,
struct vm_map_entry *old_entry)
{
struct vm_map_entry *new_entry;
/*
* copy-on-write the mapping (using mmap's
* MAP_PRIVATE semantics)
*
* allocate new_entry, adjust reference counts.
* (note that new references are read-only).
*/
new_entry = uvm_mapent_clone(new_map, old_entry, 0);
new_entry->etype |=
(UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
/*
* the new entry will need an amap. it will either
* need to be copied from the old entry or created
* from scratch (if the old entry does not have an
* amap). can we defer this process until later
* (by setting "needs_copy") or do we need to copy
* the amap now?
*
* we must copy the amap now if any of the following
* conditions hold:
* 1. the old entry has an amap and that amap is
* being shared. this means that the old (parent)
* process is sharing the amap with another
* process. if we do not clear needs_copy here
* we will end up in a situation where both the
* parent and child process are referring to the
* same amap with "needs_copy" set. if the
* parent write-faults, the fault routine will
* clear "needs_copy" in the parent by allocating
* a new amap. this is wrong because the
* parent is supposed to be sharing the old amap
* and the new amap will break that.
*
* 2. if the old entry has an amap and a non-zero
* wire count then we are going to have to call
* amap_cow_now to avoid page faults in the
* parent process. since amap_cow_now requires
* "needs_copy" to be clear we might as well
* clear it here as well.
*
*/
if (old_entry->aref.ar_amap != NULL) {
if ((amap_flags(old_entry->aref.ar_amap) & AMAP_SHARED) != 0 ||
VM_MAPENT_ISWIRED(old_entry)) {
amap_copy(new_map, new_entry,
AMAP_COPY_NOCHUNK, 0, 0);
/* XXXCDC: M_WAITOK ... ok? */
}
}
/*
* if the parent's entry is wired down, then the
* parent process does not want page faults on
* access to that memory. this means that we
* cannot do copy-on-write because we can't write
* protect the old entry. in this case we
* resolve all copy-on-write faults now, using
* amap_cow_now. note that we have already
* allocated any needed amap (above).
*/
if (VM_MAPENT_ISWIRED(old_entry)) {
/*
* resolve all copy-on-write faults now
* (note that there is nothing to do if
* the old mapping does not have an amap).
*/
if (old_entry->aref.ar_amap)
amap_cow_now(new_map, new_entry);
} else {
/*
* setup mappings to trigger copy-on-write faults
* we must write-protect the parent if it has
* an amap and it is not already "needs_copy"...
* if it is already "needs_copy" then the parent
* has already been write-protected by a previous
* fork operation.
*/
if (old_entry->aref.ar_amap &&
!UVM_ET_ISNEEDSCOPY(old_entry)) {
if (old_entry->max_protection & VM_PROT_WRITE) {
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
uvm_map_lock_entry(old_entry, RW_WRITER);
#else
uvm_map_lock_entry(old_entry, RW_READER);
#endif
pmap_protect(old_map->pmap,
old_entry->start, old_entry->end,
old_entry->protection & ~VM_PROT_WRITE);
uvm_map_unlock_entry(old_entry);
}
old_entry->etype |= UVM_ET_NEEDSCOPY;
}
}
}
/*
* zero the mapping: the new entry will be zero initialized
*/
static void
uvm_mapent_forkzero(struct vm_map *new_map, struct vm_map *old_map,
struct vm_map_entry *old_entry)
{
struct vm_map_entry *new_entry;
new_entry = uvm_mapent_clone(new_map, old_entry, 0);
new_entry->etype |=
(UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
if (new_entry->aref.ar_amap) {
uvm_map_unreference_amap(new_entry, 0);
new_entry->aref.ar_pageoff = 0;
new_entry->aref.ar_amap = NULL;
}
if (UVM_ET_ISOBJ(new_entry)) {
if (new_entry->object.uvm_obj->pgops->pgo_detach)
new_entry->object.uvm_obj->pgops->pgo_detach(
new_entry->object.uvm_obj);
new_entry->object.uvm_obj = NULL;
new_entry->offset = 0;
new_entry->etype &= ~UVM_ET_OBJ;
}
}
/*
* F O R K - m a i n e n t r y p o i n t
*/
/*
* uvmspace_fork: fork a process' main map
*
* => create a new vmspace for child process from parent.
* => parent's map must not be locked.
*/
struct vmspace *
uvmspace_fork(struct vmspace *vm1)
{
struct vmspace *vm2;
struct vm_map *old_map = &vm1->vm_map;
struct vm_map *new_map;
struct vm_map_entry *old_entry;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
vm_map_lock(old_map);
vm2 = uvmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
vm1->vm_map.flags & VM_MAP_TOPDOWN);
memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
(char *) (vm1 + 1) - (char *) &vm1->vm_startcopy);
new_map = &vm2->vm_map; /* XXX */
old_entry = old_map->header.next;
new_map->size = old_map->size;
/*
* go entry-by-entry
*/
while (old_entry != &old_map->header) {
/*
* first, some sanity checks on the old entry
*/
KASSERT(!UVM_ET_ISSUBMAP(old_entry));
KASSERT(UVM_ET_ISCOPYONWRITE(old_entry) ||
!UVM_ET_ISNEEDSCOPY(old_entry));
switch (old_entry->inheritance) {
case MAP_INHERIT_NONE:
/*
* drop the mapping, modify size
*/
new_map->size -= old_entry->end - old_entry->start;
break;
case MAP_INHERIT_SHARE:
uvm_mapent_forkshared(new_map, old_map, old_entry);
break;
case MAP_INHERIT_COPY:
uvm_mapent_forkcopy(new_map, old_map, old_entry);
break;
case MAP_INHERIT_ZERO:
uvm_mapent_forkzero(new_map, old_map, old_entry);
break;
default:
KASSERT(0);
break;
}
old_entry = old_entry->next;
}
pmap_update(old_map->pmap);
vm_map_unlock(old_map);
if (uvm_shmfork && vm1->vm_shm)
(*uvm_shmfork)(vm1, vm2);
#ifdef PMAP_FORK
pmap_fork(vm1->vm_map.pmap, vm2->vm_map.pmap);
#endif
UVMHIST_LOG(maphist,"<- done",0,0,0,0);
return (vm2);
}
/*
* uvm_mapent_trymerge: try to merge an entry with its neighbors.
*
* => called with map locked.
* => return non zero if successfully merged.
*/
int
uvm_mapent_trymerge(struct vm_map *map, struct vm_map_entry *entry, int flags)
{
struct uvm_object *uobj;
struct vm_map_entry *next;
struct vm_map_entry *prev;
vsize_t size;
int merged = 0;
bool copying;
int newetype;
if (entry->aref.ar_amap != NULL) {
return 0;
}
if ((entry->flags & UVM_MAP_NOMERGE) != 0) {
return 0;
}
uobj = entry->object.uvm_obj;
size = entry->end - entry->start;
copying = (flags & UVM_MERGE_COPYING) != 0;
newetype = copying ? (entry->etype & ~UVM_ET_NEEDSCOPY) : entry->etype;
next = entry->next;
if (next != &map->header &&
next->start == entry->end &&
((copying && next->aref.ar_amap != NULL &&
amap_refs(next->aref.ar_amap) == 1) ||
(!copying && next->aref.ar_amap == NULL)) &&
UVM_ET_ISCOMPATIBLE(next, newetype,
uobj, entry->flags, entry->protection,
entry->max_protection, entry->inheritance, entry->advice,
entry->wired_count) &&
(uobj == NULL || entry->offset + size == next->offset)) {
int error;
if (copying) {
error = amap_extend(next, size,
AMAP_EXTEND_NOWAIT|AMAP_EXTEND_BACKWARDS);
} else {
error = 0;
}
if (error == 0) {
if (uobj) {
if (uobj->pgops->pgo_detach) {
uobj->pgops->pgo_detach(uobj);
}
}
entry->end = next->end;
clear_hints(map, next);
uvm_map_entry_unlink(map, next);
if (copying) {
entry->aref = next->aref;
entry->etype &= ~UVM_ET_NEEDSCOPY;
}
uvm_map_check(map, "trymerge forwardmerge");
uvm_mapent_free(next);
merged++;
}
}
prev = entry->prev;
if (prev != &map->header &&
prev->end == entry->start &&
((copying && !merged && prev->aref.ar_amap != NULL &&
amap_refs(prev->aref.ar_amap) == 1) ||
(!copying && prev->aref.ar_amap == NULL)) &&
UVM_ET_ISCOMPATIBLE(prev, newetype,
uobj, entry->flags, entry->protection,
entry->max_protection, entry->inheritance, entry->advice,
entry->wired_count) &&
(uobj == NULL ||
prev->offset + prev->end - prev->start == entry->offset)) {
int error;
if (copying) {
error = amap_extend(prev, size,
AMAP_EXTEND_NOWAIT|AMAP_EXTEND_FORWARDS);
} else {
error = 0;
}
if (error == 0) {
if (uobj) {
if (uobj->pgops->pgo_detach) {
uobj->pgops->pgo_detach(uobj);
}
entry->offset = prev->offset;
}
entry->start = prev->start;
clear_hints(map, prev);
uvm_map_entry_unlink(map, prev);
if (copying) {
entry->aref = prev->aref;
entry->etype &= ~UVM_ET_NEEDSCOPY;
}
uvm_map_check(map, "trymerge backmerge");
uvm_mapent_free(prev);
merged++;
}
}
return merged;
}
/*
* uvm_map_setup: init map
*
* => map must not be in service yet.
*/
void
uvm_map_setup(struct vm_map *map, vaddr_t vmin, vaddr_t vmax, int flags)
{
rb_tree_init(&map->rb_tree, &uvm_map_tree_ops);
map->header.next = map->header.prev = &map->header;
map->nentries = 0;
map->size = 0;
map->ref_count = 1;
vm_map_setmin(map, vmin);
vm_map_setmax(map, vmax);
map->flags = flags;
map->first_free = &map->header;
map->hint = &map->header;
map->timestamp = 0;
map->busy = NULL;
rw_init(&map->lock);
cv_init(&map->cv, "vm_map");
mutex_init(&map->misc_lock, MUTEX_DRIVER, IPL_NONE);
}
/*
* U N M A P - m a i n e n t r y p o i n t
*/
/*
* uvm_unmap1: remove mappings from a vm_map (from "start" up to "stop")
*
* => caller must check alignment and size
* => map must be unlocked (we will lock it)
* => flags is UVM_FLAG_QUANTUM or 0.
*/
void
uvm_unmap1(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
{
struct vm_map_entry *dead_entries;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, " (map=%#jx, start=%#jx, end=%#jx)",
(uintptr_t)map, start, end, 0);
KASSERTMSG(start < end,
"%s: map %p: start %#jx < end %#jx", __func__, map,
(uintmax_t)start, (uintmax_t)end);
if (map == kernel_map) {
LOCKDEBUG_MEM_CHECK((void *)start, end - start);
}
/*
* work now done by helper functions. wipe the pmap's and then
* detach from the dead entries...
*/
vm_map_lock(map);
uvm_unmap_remove(map, start, end, &dead_entries, flags);
vm_map_unlock(map);
if (dead_entries != NULL)
uvm_unmap_detach(dead_entries, 0);
UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
}
/*
* uvm_map_reference: add reference to a map
*
* => map need not be locked
*/
void
uvm_map_reference(struct vm_map *map)
{
atomic_inc_uint(&map->ref_count);
}
void
uvm_map_lock_entry(struct vm_map_entry *entry, krw_t op)
{
if (entry->aref.ar_amap != NULL) {
amap_lock(entry->aref.ar_amap, op);
}
if (UVM_ET_ISOBJ(entry)) {
rw_enter(entry->object.uvm_obj->vmobjlock, op);
}
}
void
uvm_map_unlock_entry(struct vm_map_entry *entry)
{
if (UVM_ET_ISOBJ(entry)) {
rw_exit(entry->object.uvm_obj->vmobjlock);
}
if (entry->aref.ar_amap != NULL) {
amap_unlock(entry->aref.ar_amap);
}
}
#define UVM_VOADDR_TYPE_MASK 0x3UL
#define UVM_VOADDR_TYPE_UOBJ 0x1UL
#define UVM_VOADDR_TYPE_ANON 0x2UL
#define UVM_VOADDR_OBJECT_MASK ~UVM_VOADDR_TYPE_MASK
#define UVM_VOADDR_GET_TYPE(voa) \
((voa)->object & UVM_VOADDR_TYPE_MASK)
#define UVM_VOADDR_GET_OBJECT(voa) \
((voa)->object & UVM_VOADDR_OBJECT_MASK)
#define UVM_VOADDR_SET_OBJECT(voa, obj, type) \
do { \
KASSERT(((uintptr_t)(obj) & UVM_VOADDR_TYPE_MASK) == 0); \
(voa)->object = ((uintptr_t)(obj)) | (type); \
} while (/*CONSTCOND*/0)
#define UVM_VOADDR_GET_UOBJ(voa) \
((struct uvm_object *)UVM_VOADDR_GET_OBJECT(voa))
#define UVM_VOADDR_SET_UOBJ(voa, uobj) \
UVM_VOADDR_SET_OBJECT(voa, uobj, UVM_VOADDR_TYPE_UOBJ)
#define UVM_VOADDR_GET_ANON(voa) \
((struct vm_anon *)UVM_VOADDR_GET_OBJECT(voa))
#define UVM_VOADDR_SET_ANON(voa, anon) \
UVM_VOADDR_SET_OBJECT(voa, anon, UVM_VOADDR_TYPE_ANON)
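/*
 * Illustrative note (not from the original source): the macros above
 * steal the two low bits of the stored pointer to tag its type, relying
 * on uvm_object/vm_anon pointers being at least 4-byte aligned.  A
 * caller decodes a uvm_voaddr roughly as follows:
 *
 *	switch (UVM_VOADDR_GET_TYPE(voaddr)) {
 *	case UVM_VOADDR_TYPE_UOBJ:
 *		uobj = UVM_VOADDR_GET_UOBJ(voaddr);
 *		break;
 *	case UVM_VOADDR_TYPE_ANON:
 *		anon = UVM_VOADDR_GET_ANON(voaddr);
 *		break;
 *	}
 *
 * which is the same shape as the switch in uvm_voaddr_release() below.
 */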
/*
* uvm_voaddr_acquire: returns the virtual object address corresponding
* to the specified virtual address.
*
* => resolves COW so the true page identity is tracked.
*
* => acquires a reference on the page's owner (uvm_object or vm_anon)
*/
bool
uvm_voaddr_acquire(struct vm_map * const map, vaddr_t const va,
struct uvm_voaddr * const voaddr)
{
struct vm_map_entry *entry;
struct vm_anon *anon = NULL;
bool result = false;
bool exclusive = false;
void (*unlock_fn)(struct vm_map *);
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist,"(map=%#jx,va=%#jx)", (uintptr_t)map, va, 0, 0);
const vaddr_t start = trunc_page(va);
const vaddr_t end = round_page(va+1);
lookup_again:
if (__predict_false(exclusive)) {
vm_map_lock(map);
unlock_fn = vm_map_unlock;
} else {
vm_map_lock_read(map);
unlock_fn = vm_map_unlock_read;
}
if (__predict_false(!uvm_map_lookup_entry(map, start, &entry))) {
unlock_fn(map);
UVMHIST_LOG(maphist,"<- done (no entry)",0,0,0,0);
return false;
}
if (__predict_false(entry->protection == VM_PROT_NONE)) {
unlock_fn(map);
UVMHIST_LOG(maphist,"<- done (PROT_NONE)",0,0,0,0);
return false;
}
/*
* We have a fast path for the common case of "no COW resolution
* needed" whereby we have taken a read lock on the map and if
* we don't encounter any need to create a vm_anon then great!
* But if we do, we loop around again, instead taking an exclusive
* lock so that we can perform the fault.
*
* In the event that we have to resolve the fault, we do nearly the
* same work as uvm_map_pageable() does:
*
* 1: holding the write lock, we create any anonymous maps that need
* to be created. however, we do NOT need to clip the map entries
* in this case.
*
* 2: we downgrade to a read lock, and call uvm_fault_wire to fault
* in the page (assuming the entry is not already wired). this
* is done because we need the vm_anon to be present.
*/
if (__predict_true(!VM_MAPENT_ISWIRED(entry))) {
bool need_fault = false;
/*
 * perform the actions of vm_map_lookup that need the
* write lock on the map: create an anonymous map for
* a copy-on-write region, or an anonymous map for
* a zero-fill region.
*/
if (__predict_false(UVM_ET_ISSUBMAP(entry))) {
unlock_fn(map);
UVMHIST_LOG(maphist,"<- done (submap)",0,0,0,0);
return false;
}
if (__predict_false(UVM_ET_ISNEEDSCOPY(entry) &&
((entry->max_protection & VM_PROT_WRITE) ||
(entry->object.uvm_obj == NULL)))) {
if (!exclusive) {
/* need to take the slow path */
KASSERT(unlock_fn == vm_map_unlock_read);
vm_map_unlock_read(map);
exclusive = true;
goto lookup_again;
}
need_fault = true;
amap_copy(map, entry, 0, start, end);
/* XXXCDC: wait OK? */
}
/*
* do a quick check to see if the fault has already
* been resolved to the upper layer.
*/
if (__predict_true(entry->aref.ar_amap != NULL &&
need_fault == false)) {
amap_lock(entry->aref.ar_amap, RW_WRITER);
anon = amap_lookup(&entry->aref, start - entry->start);
if (__predict_true(anon != NULL)) {
/* amap unlocked below */
goto found_anon;
}
amap_unlock(entry->aref.ar_amap);
need_fault = true;
}
/*
* we predict this test as false because if we reach
* this point, then we are likely dealing with a
* shared memory region backed by a uvm_object, in
* which case a fault to create the vm_anon is not
* necessary.
*/
if (__predict_false(need_fault)) {
if (exclusive) {
vm_map_busy(map);
vm_map_unlock(map);
unlock_fn = vm_map_unbusy;
}
if (uvm_fault_wire(map, start, end,
entry->max_protection, 1)) {
/* wiring failed */
unlock_fn(map);
UVMHIST_LOG(maphist,"<- done (wire failed)",
0,0,0,0);
return false;
}
/*
* now that we have resolved the fault, we can unwire
* the page.
*/
if (exclusive) {
vm_map_lock(map);
vm_map_unbusy(map);
unlock_fn = vm_map_unlock;
}
uvm_fault_unwire_locked(map, start, end);
}
}
/* check the upper layer */
if (entry->aref.ar_amap) {
amap_lock(entry->aref.ar_amap, RW_WRITER);
anon = amap_lookup(&entry->aref, start - entry->start);
if (anon) {
found_anon: KASSERT(anon->an_lock == entry->aref.ar_amap->am_lock);
anon->an_ref++;
rw_obj_hold(anon->an_lock);
KASSERT(anon->an_ref != 0);
UVM_VOADDR_SET_ANON(voaddr, anon);
voaddr->offset = va & PAGE_MASK;
result = true;
}
amap_unlock(entry->aref.ar_amap);
}
/* check the lower layer */
if (!result && UVM_ET_ISOBJ(entry)) {
struct uvm_object *uobj = entry->object.uvm_obj;
KASSERT(uobj != NULL);
(*uobj->pgops->pgo_reference)(uobj);
UVM_VOADDR_SET_UOBJ(voaddr, uobj);
voaddr->offset = entry->offset + (va - entry->start);
result = true;
}
unlock_fn(map);
if (result) {
UVMHIST_LOG(maphist,
"<- done OK (type=%jd,owner=%#jx,offset=%#jx)",
UVM_VOADDR_GET_TYPE(voaddr),
UVM_VOADDR_GET_OBJECT(voaddr),
voaddr->offset, 0);
} else {
UVMHIST_LOG(maphist,"<- done (failed)",0,0,0,0);
}
return result;
}
/*
* uvm_voaddr_release: release the references held by the
 * virtual object address.
*/
void
uvm_voaddr_release(struct uvm_voaddr * const voaddr)
{
switch (UVM_VOADDR_GET_TYPE(voaddr)) {
case UVM_VOADDR_TYPE_UOBJ: {
struct uvm_object * const uobj = UVM_VOADDR_GET_UOBJ(voaddr);
KASSERT(uobj != NULL);
KASSERT(uobj->pgops->pgo_detach != NULL);
(*uobj->pgops->pgo_detach)(uobj);
break;
}
case UVM_VOADDR_TYPE_ANON: {
struct vm_anon * const anon = UVM_VOADDR_GET_ANON(voaddr);
krwlock_t *lock;
KASSERT(anon != NULL);
rw_enter((lock = anon->an_lock), RW_WRITER);
KASSERT(anon->an_ref > 0);
if (--anon->an_ref == 0) {
uvm_anfree(anon);
}
rw_exit(lock);
rw_obj_free(lock);
break;
}
default:
panic("uvm_voaddr_release: bad type");
}
memset(voaddr, 0, sizeof(*voaddr));
}
/*
* uvm_voaddr_compare: compare two uvm_voaddr objects.
*
* => memcmp() semantics
*/
int
uvm_voaddr_compare(const struct uvm_voaddr * const voaddr1,
const struct uvm_voaddr * const voaddr2)
{
const uintptr_t type1 = UVM_VOADDR_GET_TYPE(voaddr1);
const uintptr_t type2 = UVM_VOADDR_GET_TYPE(voaddr2);
KASSERT(type1 == UVM_VOADDR_TYPE_UOBJ ||
type1 == UVM_VOADDR_TYPE_ANON);
KASSERT(type2 == UVM_VOADDR_TYPE_UOBJ ||
type2 == UVM_VOADDR_TYPE_ANON);
if (type1 < type2)
return -1;
if (type1 > type2)
return 1;
const uintptr_t addr1 = UVM_VOADDR_GET_OBJECT(voaddr1);
const uintptr_t addr2 = UVM_VOADDR_GET_OBJECT(voaddr2);
if (addr1 < addr2)
return -1;
if (addr1 > addr2)
return 1;
if (voaddr1->offset < voaddr2->offset)
return -1;
if (voaddr1->offset > voaddr2->offset)
return 1;
return 0;
}
#if defined(DDB) || defined(DEBUGPRINT)
/*
* uvm_map_printit: actually prints the map
*/
void
uvm_map_printit(struct vm_map *map, bool full,
void (*pr)(const char *, ...))
{
struct vm_map_entry *entry;
(*pr)("MAP %p: [%#lx->%#lx]\n", map, vm_map_min(map),
vm_map_max(map));
(*pr)("\t#ent=%d, sz=%d, ref=%d, version=%d, flags=%#x\n",
map->nentries, map->size, map->ref_count, map->timestamp,
map->flags);
(*pr)("\tpmap=%p(resident=%ld, wired=%ld)\n", map->pmap,
pmap_resident_count(map->pmap), pmap_wired_count(map->pmap));
if (!full)
return;
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
(*pr)(" - %p: %#lx->%#lx: obj=%p/%#llx, amap=%p/%d\n",
entry, entry->start, entry->end, entry->object.uvm_obj,
(long long)entry->offset, entry->aref.ar_amap,
entry->aref.ar_pageoff);
(*pr)(
"\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, "
"wc=%d, adv=%d%s\n",
(entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
(entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
(entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
entry->protection, entry->max_protection,
entry->inheritance, entry->wired_count, entry->advice,
entry == map->first_free ? " (first_free)" : "");
}
}
void
uvm_whatis(uintptr_t addr, void (*pr)(const char *, ...))
{
struct vm_map *map;
for (map = kernel_map;;) {
struct vm_map_entry *entry;
if (!uvm_map_lookup_entry_bytree(map, (vaddr_t)addr, &entry)) {
break;
}
(*pr)("%p is %p+%zu from VMMAP %p\n",
(void *)addr, (void *)entry->start,
(size_t)(addr - (uintptr_t)entry->start), map);
if (!UVM_ET_ISSUBMAP(entry)) {
break;
}
map = entry->object.sub_map;
}
}
#endif /* DDB || DEBUGPRINT */
#ifndef __USER_VA0_IS_SAFE
static int
sysctl_user_va0_disable(SYSCTLFN_ARGS)
{
struct sysctlnode node;
int t, error;
node = *rnode;
node.sysctl_data = &t;
t = user_va0_disable;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return (error);
if (!t && user_va0_disable &&
kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MAP_VA_ZERO, 0,
NULL, NULL, NULL))
return EPERM;
user_va0_disable = !!t;
return 0;
}
#endif
static int
fill_vmentry(struct lwp *l, struct proc *p, struct kinfo_vmentry *kve,
struct vm_map *m, struct vm_map_entry *e)
{
#ifndef _RUMPKERNEL
int error;
memset(kve, 0, sizeof(*kve));
KASSERT(e != NULL);
if (UVM_ET_ISOBJ(e)) {
struct uvm_object *uobj = e->object.uvm_obj;
KASSERT(uobj != NULL);
kve->kve_ref_count = uobj->uo_refs;
kve->kve_count = uobj->uo_npages;
if (UVM_OBJ_IS_VNODE(uobj)) {
struct vattr va;
struct vnode *vp = (struct vnode *)uobj;
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &va, l->l_cred);
VOP_UNLOCK(vp);
kve->kve_type = KVME_TYPE_VNODE;
if (error == 0) {
kve->kve_vn_size = vp->v_size;
kve->kve_vn_type = (int)vp->v_type;
kve->kve_vn_mode = va.va_mode;
kve->kve_vn_rdev = va.va_rdev;
kve->kve_vn_fileid = va.va_fileid;
kve->kve_vn_fsid = va.va_fsid;
error = vnode_to_path(kve->kve_path,
sizeof(kve->kve_path) / 2, vp, l, p);
}
} else if (UVM_OBJ_IS_KERN_OBJECT(uobj)) {
kve->kve_type = KVME_TYPE_KERN;
} else if (UVM_OBJ_IS_DEVICE(uobj)) {
kve->kve_type = KVME_TYPE_DEVICE;
} else if (UVM_OBJ_IS_AOBJ(uobj)) {
kve->kve_type = KVME_TYPE_ANON;
} else {
kve->kve_type = KVME_TYPE_OBJECT;
}
} else if (UVM_ET_ISSUBMAP(e)) {
struct vm_map *map = e->object.sub_map;
KASSERT(map != NULL);
kve->kve_ref_count = map->ref_count;
kve->kve_count = map->nentries;
kve->kve_type = KVME_TYPE_SUBMAP;
} else
kve->kve_type = KVME_TYPE_UNKNOWN;
kve->kve_start = e->start;
kve->kve_end = e->end;
kve->kve_offset = e->offset;
kve->kve_wired_count = e->wired_count;
kve->kve_inheritance = e->inheritance;
kve->kve_attributes = 0; /* unused */
kve->kve_advice = e->advice;
#define PROT(p) (((p) & VM_PROT_READ) ? KVME_PROT_READ : 0) | \
(((p) & VM_PROT_WRITE) ? KVME_PROT_WRITE : 0) | \
(((p) & VM_PROT_EXECUTE) ? KVME_PROT_EXEC : 0)
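	/*
	 * Illustrative example (not from the original source): PROT()
	 * only translates VM_PROT_* bits into the exported KVME_PROT_*
	 * bits, e.g.
	 *
	 *	PROT(VM_PROT_READ|VM_PROT_WRITE)
	 *	    == (KVME_PROT_READ|KVME_PROT_WRITE)
	 */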
kve->kve_protection = PROT(e->protection);
kve->kve_max_protection = PROT(e->max_protection);
kve->kve_flags |= (e->etype & UVM_ET_COPYONWRITE)
? KVME_FLAG_COW : 0;
kve->kve_flags |= (e->etype & UVM_ET_NEEDSCOPY)
? KVME_FLAG_NEEDS_COPY : 0;
kve->kve_flags |= (m->flags & VM_MAP_TOPDOWN)
? KVME_FLAG_GROWS_DOWN : KVME_FLAG_GROWS_UP;
kve->kve_flags |= (m->flags & VM_MAP_PAGEABLE)
? KVME_FLAG_PAGEABLE : 0;
#endif
return 0;
}
static int
fill_vmentries(struct lwp *l, pid_t pid, u_int elem_size, void *oldp,
size_t *oldlenp)
{
int error;
struct proc *p;
struct kinfo_vmentry *vme;
struct vmspace *vm;
struct vm_map *map;
struct vm_map_entry *entry;
char *dp;
size_t count, vmesize;
if (elem_size == 0 || elem_size > 2 * sizeof(*vme))
return EINVAL;
if (oldp) {
if (*oldlenp > 10UL * 1024UL * 1024UL)
return E2BIG;
count = *oldlenp / elem_size;
if (count == 0)
return ENOMEM;
vmesize = count * sizeof(*vme);
} else
vmesize = 0;
if ((error = proc_find_locked(l, &p, pid)) != 0)
return error;
vme = NULL;
count = 0;
if ((error = proc_vmspace_getref(p, &vm)) != 0)
goto out;
map = &vm->vm_map;
vm_map_lock_read(map);
dp = oldp;
if (oldp)
vme = kmem_alloc(vmesize, KM_SLEEP);
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (oldp && (dp - (char *)oldp) < vmesize) {
error = fill_vmentry(l, p, &vme[count], map, entry);
if (error)
goto out;
dp += elem_size;
}
count++;
}
vm_map_unlock_read(map);
uvmspace_free(vm);
out:
if (pid != -1)
mutex_exit(p->p_lock);
if (error == 0) {
const u_int esize = uimin(sizeof(*vme), elem_size);
dp = oldp;
for (size_t i = 0; i < count; i++) {
if (oldp && (dp - (char *)oldp) < vmesize) {
error = sysctl_copyout(l, &vme[i], dp, esize);
if (error)
break;
dp += elem_size;
} else
break;
}
count *= elem_size;
if (oldp != NULL && *oldlenp < count)
error = ENOSPC;
*oldlenp = count;
}
if (vme)
kmem_free(vme, vmesize);
return error;
}
static int
sysctl_vmproc(SYSCTLFN_ARGS)
{
int error;
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
if (namelen == 0)
return EINVAL;
switch (name[0]) {
case VM_PROC_MAP:
if (namelen != 3)
return EINVAL;
sysctl_unlock();
error = fill_vmentries(l, name[1], name[2], oldp, oldlenp);
sysctl_relock();
return error;
default:
return EINVAL;
}
}
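/*
 * Illustrative sketch (an assumption, not part of the original source):
 * from userland the VM_PROC_MAP node above is reached with a 5-element
 * MIB whose last two components carry the pid and the caller's element
 * size, roughly:
 *
 *	struct kinfo_vmentry kve[NENT];
 *	size_t len = sizeof(kve);
 *	int mib[5] = { CTL_VM, VM_PROC, VM_PROC_MAP, pid,
 *	    sizeof(struct kinfo_vmentry) };
 *
 *	if (sysctl(mib, 5, kve, &len, NULL, 0) == 0)
 *		nfound = len / sizeof(struct kinfo_vmentry);
 *
 * Passing the element size lets fill_vmentries() copy out no more than
 * the structure size the caller was compiled against.
 */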
SYSCTL_SETUP(sysctl_uvmmap_setup, "sysctl uvmmap setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "proc",
SYSCTL_DESCR("Process vm information"),
sysctl_vmproc, 0, NULL, 0,
CTL_VM, VM_PROC, CTL_EOL);
#ifndef __USER_VA0_IS_SAFE
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "user_va0_disable",
SYSCTL_DESCR("Disable VA 0"),
sysctl_user_va0_disable, 0, &user_va0_disable, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
#endif
}
/* $NetBSD: kern_syscall.c,v 1.21 2020/08/31 19:51:30 christos Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_syscall.c,v 1.21 2020/08/31 19:51:30 christos Exp $");
#ifdef _KERNEL_OPT
#include "opt_modular.h"
#include "opt_syscall_debug.h"
#include "opt_ktrace.h"
#include "opt_ptrace.h"
#include "opt_dtrace.h"
#endif
/* XXX To get syscall prototypes. */
#define SYSVSHM
#define SYSVSEM
#define SYSVMSG
#include <sys/param.h>
#include <sys/module.h>
#include <sys/sched.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>
#include <sys/systm.h>
#include <sys/xcall.h>
#include <sys/ktrace.h>
#include <sys/ptrace.h>
int
sys_nomodule(struct lwp *l, const void *v, register_t *retval)
{
#ifdef MODULAR
const struct sysent *sy;
const struct emul *em;
const struct sc_autoload *auto_list;
u_int code;
/*
* Restart the syscall if we interrupted a module unload that
* failed. Acquiring kernconfig_lock delays us until any unload
* has been completed or rolled back.
*/
kernconfig_lock();
sy = l->l_sysent;
if (sy->sy_call != sys_nomodule) {
kernconfig_unlock();
return ERESTART;
}
/*
* Try to autoload a module to satisfy the request. If it
* works, retry the request.
*/
em = l->l_proc->p_emul;
code = sy - em->e_sysent;
if ((auto_list = em->e_sc_autoload) != NULL)
for (; auto_list->al_code > 0; auto_list++) {
if (auto_list->al_code != code) {
continue;
}
if (module_autoload(auto_list->al_module,
MODULE_CLASS_ANY) != 0 ||
sy->sy_call == sys_nomodule) {
break;
}
kernconfig_unlock();
return ERESTART;
}
kernconfig_unlock();
#endif /* MODULAR */
return sys_nosys(l, v, retval);
}
int
syscall_establish(const struct emul *em, const struct syscall_package *sp)
{
struct sysent *sy;
int i;
KASSERT(kernconfig_is_held());
if (em == NULL) {
em = &emul_netbsd;
}
sy = em->e_sysent;
/*
* Ensure that all preconditions are valid, since this is
* an all or nothing deal. Once a system call is entered,
* it can become busy and we could be unable to remove it
* on error.
*/
for (i = 0; sp[i].sp_call != NULL; i++) {
if (sp[i].sp_code >= SYS_NSYSENT)
return EINVAL;
if (sy[sp[i].sp_code].sy_call != sys_nomodule &&
sy[sp[i].sp_code].sy_call != sys_nosys) {
#ifdef DIAGNOSTIC
printf("syscall %d is busy\n", sp[i].sp_code);
#endif
return EBUSY;
}
}
/* Everything looks good, patch them in. */
for (i = 0; sp[i].sp_call != NULL; i++) {
sy[sp[i].sp_code].sy_call = sp[i].sp_call;
}
return 0;
}
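/*
 * Illustrative usage sketch (an assumption, not part of the original
 * source): a module typically passes a NULL-terminated package array,
 * roughly
 *
 *	static const struct syscall_package mymod_syscalls[] = {
 *		{ SYS_mycall, 0, (sy_call_t *)sys_mycall },
 *		{ 0, 0, NULL },
 *	};
 *	error = syscall_establish(NULL, mymod_syscalls);
 *
 * where mymod_syscalls, SYS_mycall and sys_mycall are hypothetical
 * names; passing NULL for the emul selects emul_netbsd, as above.
 */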
int
syscall_disestablish(const struct emul *em, const struct syscall_package *sp)
{
struct sysent *sy;
const uint32_t *sb;
lwp_t *l;
int i;
KASSERT(kernconfig_is_held());
if (em == NULL) {
em = &emul_netbsd;
}
sy = em->e_sysent;
sb = em->e_nomodbits;
/*
* First, patch the system calls to sys_nomodule or sys_nosys
* to gate further activity.
*/
for (i = 0; sp[i].sp_call != NULL; i++) {
KASSERT(sy[sp[i].sp_code].sy_call == sp[i].sp_call);
sy[sp[i].sp_code].sy_call =
sb[sp[i].sp_code / 32] & (1 << (sp[i].sp_code % 32)) ?
sys_nomodule : sys_nosys;
}
/*
* Run a cross call to cycle through all CPUs. This does two
* things: lock activity provides a barrier and makes our update
* of sy_call visible to all CPUs, and upon return we can be sure
* that we see pertinent values of l_sysent posted by remote CPUs.
*/
xc_barrier(0);
/*
* Now it's safe to check l_sysent. Run through all LWPs and see
* if anyone is still using the system call.
*/
for (i = 0; sp[i].sp_call != NULL; i++) {
mutex_enter(&proc_lock);
LIST_FOREACH(l, &alllwp, l_list) {
if (l->l_sysent == &sy[sp[i].sp_code]) {
break;
}
}
mutex_exit(&proc_lock);
if (l == NULL) {
continue;
}
/*
* We lose: one or more calls are still in use. Put back
* the old entrypoints and act like nothing happened.
* When we drop kernconfig_lock, any system calls held in
* sys_nomodule() will be restarted.
*/
for (i = 0; sp[i].sp_call != NULL; i++) {
sy[sp[i].sp_code].sy_call = sp[i].sp_call;
}
return EBUSY;
}
return 0;
}
/*
* Return true if system call tracing is enabled for the specified process.
*/
bool
trace_is_enabled(struct proc *p)
{
#ifdef SYSCALL_DEBUG
return (true);
#endif
#ifdef KTRACE
if (ISSET(p->p_traceflag, (KTRFAC_SYSCALL | KTRFAC_SYSRET)))
return (true);
#endif
#ifdef PTRACE
if (ISSET(p->p_slflag, PSL_SYSCALL))
return (true);
#endif
return (false);
}
/*
* Start trace of particular system call. If process is being traced,
* this routine is called by MD syscall dispatch code just before
* a system call is actually executed.
*/
int
trace_enter(register_t code, const struct sysent *sy, const void *args)
{
int error = 0;
#if defined(PTRACE) || defined(KDTRACE_HOOKS)
struct proc *p = curlwp->l_proc;
#endif
#ifdef KDTRACE_HOOKS
if (sy->sy_entry) {
struct emul *e = p->p_emul;
		if (e->e_dtrace_syscall)
			(*e->e_dtrace_syscall)(sy->sy_entry, code, sy, args,
NULL, 0);
}
#endif
#ifdef SYSCALL_DEBUG
scdebug_call(code, args);
#endif /* SYSCALL_DEBUG */
ktrsyscall(code, args, sy->sy_narg);
#ifdef PTRACE
if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED)) ==
(PSL_SYSCALL|PSL_TRACED)) {
proc_stoptrace(TRAP_SCE, code, args, NULL, 0);
if (curlwp->l_proc->p_slflag & PSL_SYSCALLEMU) {
/* tracer will emulate syscall for us */
error = EJUSTRETURN;
}
}
#endif
return error;
}
/*
* End trace of particular system call. If process is being traced,
* this routine is called by MD syscall dispatch code just after
* a system call finishes.
* MD caller guarantees the passed 'code' is within the supported
 * system call number range for the emulation the process runs under.
*/
void
trace_exit(register_t code, const struct sysent *sy, const void *args,
register_t rval[], int error)
{
#if defined(PTRACE) || defined(KDTRACE_HOOKS)
struct proc *p = curlwp->l_proc;
#endif
#ifdef KDTRACE_HOOKS
if (sy->sy_return) {
struct emul *e = p->p_emul;
		if (e->e_dtrace_syscall)
			(*p->p_emul->e_dtrace_syscall)(sy->sy_return, code, sy,
args, rval, error);
}
#endif
#ifdef SYSCALL_DEBUG
scdebug_ret(code, error, rval);
#endif /* SYSCALL_DEBUG */
ktrsysret(code, error, rval);
#ifdef PTRACE
if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED|PSL_SYSCALLEMU)) ==
(PSL_SYSCALL|PSL_TRACED)) {
proc_stoptrace(TRAP_SCX, code, args, rval, error);
}
CLR(p->p_slflag, PSL_SYSCALLEMU);
#endif
}
/* $NetBSD: ufs_readwrite.c,v 1.128 2022/02/21 17:07:45 hannken Exp $ */
/*-
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.128 2022/02/21 17:07:45 hannken Exp $");
#define FS struct fs
#define I_FS i_fs
#define READ ffs_read
#define READ_S "ffs_read"
#define WRITE ffs_write
#define WRITE_S "ffs_write"
#define BUFRD ffs_bufrd
#define BUFWR ffs_bufwr
#define ufs_blkoff ffs_blkoff
#define ufs_blksize ffs_blksize
#define ufs_lblkno ffs_lblkno
#define ufs_lblktosize ffs_lblktosize
#define ufs_blkroundup ffs_blkroundup
static int ufs_post_read_update(struct vnode *, int, int);
static int ufs_post_write_update(struct vnode *, struct uio *, int,
kauth_cred_t, off_t, int, int);
/*
* Vnode op for reading.
*/
/* ARGSUSED */
int
READ(void *v)
{
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp;
struct inode *ip;
struct uio *uio;
struct ufsmount *ump;
vsize_t bytelen;
int error, ioflag, advice;
vp = ap->a_vp;
ip = VTOI(vp);
ump = ip->i_ump;
uio = ap->a_uio;
ioflag = ap->a_ioflag;
error = 0;
	KASSERT(uio->uio_rw == UIO_READ);
	KASSERT(vp->v_type == VREG || vp->v_type == VDIR);
/* XXX Eliminate me by refusing directory reads from userland. */
if (vp->v_type == VDIR)
		return BUFRD(vp, uio, ioflag, ap->a_cred);
	if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize)
return (EFBIG);
if (uio->uio_resid == 0)
return (0);
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT)
		return ffs_snapshot_read(vp, uio, ioflag);

	if (uio->uio_offset >= ip->i_size)
goto out;
KASSERT(vp->v_type == VREG);
advice = IO_ADV_DECODE(ap->a_ioflag);
	while (uio->uio_resid > 0) {
		if (ioflag & IO_DIRECT) {
			genfs_directio(vp, uio, ioflag);
}
bytelen = MIN(ip->i_size - uio->uio_offset, uio->uio_resid);
if (bytelen == 0)
break;
error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
UBC_READ | UBC_PARTIALOK | UBC_VNODE_FLAGS(vp));
if (error)
break;
}
out:
error = ufs_post_read_update(vp, ap->a_ioflag, error);
return (error);
}
/*
* UFS op for reading via the buffer cache
*/
int
BUFRD(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred)
{
struct inode *ip;
struct ufsmount *ump;
FS *fs;
struct buf *bp;
daddr_t lbn, nextlbn;
off_t bytesinfile;
long size, xfersize, blkoffset;
int error;
	KASSERT(VOP_ISLOCKED(vp));
	KASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
	KASSERT(uio->uio_rw == UIO_READ);
ip = VTOI(vp);
ump = ip->i_ump;
fs = ip->I_FS;
error = 0;
	KASSERT(vp->v_type != VLNK || ip->i_size >= ump->um_maxsymlinklen);
	KASSERT(vp->v_type != VLNK || ump->um_maxsymlinklen != 0 ||
DIP(ip, blocks) != 0);
if (uio->uio_offset > ump->um_maxfilesize)
return EFBIG;
if (uio->uio_resid == 0)
return 0;
	KASSERT(!ISSET(ip->i_flags, (SF_SNAPSHOT | SF_SNAPINVAL)));

	if (uio->uio_offset >= ip->i_size)
goto out;
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
bytesinfile = ip->i_size - uio->uio_offset;
if (bytesinfile <= 0)
break;
lbn = ufs_lblkno(fs, uio->uio_offset);
nextlbn = lbn + 1;
size = ufs_blksize(fs, ip, lbn);
blkoffset = ufs_blkoff(fs, uio->uio_offset);
xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
bytesinfile);
if (ufs_lblktosize(fs, nextlbn) >= ip->i_size)
error = bread(vp, lbn, size, 0, &bp);
else {
int nextsize = ufs_blksize(fs, ip, nextlbn);
error = breadn(vp, lbn,
size, &nextlbn, &nextsize, 1, 0, &bp);
}
if (error)
break;
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
* then we want to ensure that we do not uiomove bad
* or uninitialized data.
*/
size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
break;
xfersize = size;
}
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
if (error)
break;
brelse(bp, 0);
}
	if (bp != NULL)
		brelse(bp, 0);
out:
error = ufs_post_read_update(vp, ioflag, error);
return (error);
}
static int
ufs_post_read_update(struct vnode *vp, int ioflag, int oerror)
{
struct inode *ip = VTOI(vp);
int error = oerror;
if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
ip->i_flag |= IN_ACCESS;
		if ((ioflag & IO_SYNC) == IO_SYNC) {
			error = UFS_WAPBL_BEGIN(vp->v_mount);
if (error)
goto out;
			error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
			UFS_WAPBL_END(vp->v_mount);
}
}
out:
/* Read error overrides any inode update error. */
if (oerror)
error = oerror;
return error;
}
/*
* Vnode op for writing.
*/
int
WRITE(void *v)
{
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp;
struct uio *uio;
struct inode *ip;
FS *fs;
kauth_cred_t cred;
off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
int blkoffset, error, flags, ioflag, resid;
int aflag;
vsize_t bytelen;
bool async;
struct ufsmount *ump;
cred = ap->a_cred;
ioflag = ap->a_ioflag;
uio = ap->a_uio;
vp = ap->a_vp;
ip = VTOI(vp);
ump = ip->i_ump;
	KASSERT(vp->v_size == ip->i_size);
	KASSERT(uio->uio_rw == UIO_WRITE);
	KASSERT(vp->v_type == VREG);
	KASSERT(!ISSET(ioflag, IO_JOURNALLOCKED));
	UFS_WAPBL_JUNLOCK_ASSERT(vp->v_mount);

	if (ioflag & IO_APPEND)
		uio->uio_offset = ip->i_size;
	if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
return (EPERM);
fs = ip->I_FS;
if (uio->uio_offset < 0 ||
(u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize)
return (EFBIG);
if (uio->uio_resid == 0)
return (0);
flags = ioflag & IO_SYNC ? B_SYNC : 0;
async = vp->v_mount->mnt_flag & MNT_ASYNC;
origoff = uio->uio_offset;
resid = uio->uio_resid;
osize = ip->i_size;
error = 0;
KASSERT(vp->v_type == VREG);
/*
* XXX The entire write operation must occur in a single WAPBL
* transaction because it may allocate disk blocks, if
* appending or filling holes, which is allowed to happen only
* if the write fully succeeds.
*
* If ubc_uiomove fails in the middle with EFAULT, we can clean
* up at the end with UFS_TRUNCATE. But if the power fails in
* the middle, there would be nobody to deallocate the blocks,
* without an fsck to globally analyze the file system.
*
* If the increasingly inaccurately named WAPBL were augmented
* with rollback records for block allocations, then we could
* split this into multiple transactions and commit the
* allocations in the last one.
*
* But WAPBL doesn't have that notion now, so we'll have to
* live with gigantic transactions and WAPBL tentacles in
* genfs_getpages/putpages to cope with the possibility that
* the transaction may or may not be locked on entry to the
* page cache.
*
* And even if we added that notion to WAPBL, it wouldn't help
* us get rid of the tentacles in genfs_getpages/putpages
* because we'd have to interoperate with old implementations
* that assume they can replay the log without fsck.
*/
error = UFS_WAPBL_BEGIN(vp->v_mount);
if (error) {
return error;
}
preallocoff = round_page(ufs_blkroundup(fs, MAX(osize, uio->uio_offset)));
aflag = ioflag & IO_SYNC ? B_SYNC : 0;
nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
endallocoff = nsize - ufs_blkoff(fs, nsize);
/*
* if we're increasing the file size, deal with expanding
* the fragment if there is one.
*/
if (nsize > osize && ufs_lblkno(fs, osize) < UFS_NDADDR &&
ufs_lblkno(fs, osize) != ufs_lblkno(fs, nsize) &&
ufs_blkroundup(fs, osize) != osize) {
off_t eob;
eob = ufs_blkroundup(fs, osize);
uvm_vnp_setwritesize(vp, eob);
error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag);
if (error)
goto out;
if (flags & B_SYNC) {
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask),
round_page(eob),
PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
}
}
while (uio->uio_resid > 0) {
int ubc_flags = UBC_WRITE;
		bool overwrite; /* true if we're overwriting a whole block */
off_t newoff;
		if (ioflag & IO_DIRECT) {
			genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED);
}
oldoff = uio->uio_offset;
blkoffset = ufs_blkoff(fs, uio->uio_offset);
bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
if (bytelen == 0) {
break;
}
/*
* if we're filling in a hole, allocate the blocks now and
* initialize the pages first. if we're extending the file,
* we can safely allocate blocks without initializing pages
* since the new blocks will be inaccessible until the write
* is complete.
*/
overwrite = uio->uio_offset >= preallocoff &&
uio->uio_offset < endallocoff;
		if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 &&
		    ufs_blkoff(fs, uio->uio_offset) == 0 &&
(uio->uio_offset & PAGE_MASK) == 0) {
vsize_t len;
len = trunc_page(bytelen);
len -= ufs_blkoff(fs, len);
if (len > 0) {
overwrite = true;
bytelen = len;
}
}
newoff = oldoff + bytelen;
		if (vp->v_size < newoff) {
			uvm_vnp_setwritesize(vp, newoff);
}
if (!overwrite) {
error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
cred, aflag);
if (error)
break;
} else {
genfs_node_wrlock(vp);
error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
aflag, cred);
genfs_node_unlock(vp);
if (error)
break;
ubc_flags |= UBC_FAULTBUSY;
}
/*
* copy the data.
*/
error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
IO_ADV_DECODE(ioflag), ubc_flags | UBC_VNODE_FLAGS(vp));
/*
* update UVM's notion of the size now that we've
* copied the data into the vnode's pages.
*
* we should update the size even when uiomove failed.
*/
		if (vp->v_size < newoff) {
			uvm_vnp_setsize(vp, newoff);
}
if (error)
break;
/*
* flush what we just wrote if necessary.
* XXXUBC simplistic async flushing.
*/
if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
(uio->uio_offset >> 16) << 16,
PGO_CLEANIT | PGO_JOURNALLOCKED | PGO_LAZY);
if (error)
break;
}
}
	if (error == 0 && ioflag & IO_SYNC) {
		rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask),
round_page(ufs_blkroundup(fs, uio->uio_offset)),
PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
}
out:
error = ufs_post_write_update(vp, uio, ioflag, cred, osize, resid,
error);
UFS_WAPBL_END(vp->v_mount);
return (error);
}
/*
* UFS op for writing via the buffer cache
*/
int
BUFWR(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred)
{
struct inode *ip;
struct ufsmount *ump;
FS *fs;
int flags;
struct buf *bp;
off_t osize;
int resid, xfersize, size, blkoffset;
daddr_t lbn;
int error;
	KASSERT(ISSET(ioflag, IO_NODELOCKED));
	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	KASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
	KASSERT(vp->v_type != VDIR || ISSET(ioflag, IO_SYNC));
	KASSERT(uio->uio_rw == UIO_WRITE);
	KASSERT(ISSET(ioflag, IO_JOURNALLOCKED));
	UFS_WAPBL_JLOCK_ASSERT(vp->v_mount);
ip = VTOI(vp);
ump = ip->i_ump;
fs = ip->I_FS;
	KASSERT(vp->v_size == ip->i_size);
	if (uio->uio_offset < 0 || uio->uio_resid > ump->um_maxfilesize ||
uio->uio_offset > (ump->um_maxfilesize - uio->uio_resid))
return EFBIG;
if (uio->uio_resid == 0)
return 0;
flags = ioflag & IO_SYNC ? B_SYNC : 0;
resid = uio->uio_resid;
osize = ip->i_size;
error = 0;
KASSERT(vp->v_type != VREG);
/* XXX Should never have pages cached here. */
	KASSERT(vp->v_uobj.uo_npages == 0);

	while (uio->uio_resid > 0) {
lbn = ufs_lblkno(fs, uio->uio_offset);
blkoffset = ufs_blkoff(fs, uio->uio_offset);
xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
if (fs->fs_bsize > xfersize)
flags |= B_CLRBUF;
else
flags &= ~B_CLRBUF;
error = UFS_BALLOC(vp, uio->uio_offset, xfersize, cred, flags,
&bp);
if (error)
break;
if (uio->uio_offset + xfersize > ip->i_size) {
ip->i_size = uio->uio_offset + xfersize;
DIP_ASSIGN(ip, size, ip->i_size);
uvm_vnp_setsize(vp, ip->i_size);
}
size = ufs_blksize(fs, ip, lbn) - bp->b_resid;
if (xfersize > size)
xfersize = size;
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
/*
* if we didn't clear the block and the uiomove failed,
* the buf will now contain part of some other file,
* so we need to invalidate it.
*/
if (error && (flags & B_CLRBUF) == 0) {
brelse(bp, BC_INVAL);
break;
}
if (ioflag & IO_SYNC)
(void)bwrite(bp);
else if (xfersize + blkoffset == fs->fs_bsize)
bawrite(bp);
else
			bdwrite(bp);
		if (error || xfersize == 0)
break;
}
error = ufs_post_write_update(vp, uio, ioflag, cred, osize, resid,
error);
return (error);
}
static int
ufs_post_write_update(struct vnode *vp, struct uio *uio, int ioflag,
kauth_cred_t cred, off_t osize, int resid, int oerror)
{
struct inode *ip = VTOI(vp);
int error = oerror;
/* Trigger ctime and mtime updates, and atime if MNT_RELATIME. */
ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if (vp->v_mount->mnt_flag & MNT_RELATIME)
		ip->i_flag |= IN_ACCESS;
/*
* If we successfully wrote any data and we are not the superuser,
* we clear the setuid and setgid bits as a precaution against
* tampering.
*/
	if (resid > uio->uio_resid && cred) {
		if (ip->i_mode & ISUID) {
			if (kauth_authorize_vnode(cred,
KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0) {
ip->i_mode &= ~ISUID;
DIP_ASSIGN(ip, mode, ip->i_mode);
}
}
		if (ip->i_mode & ISGID) {
			if (kauth_authorize_vnode(cred,
KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0) {
ip->i_mode &= ~ISGID;
DIP_ASSIGN(ip, mode, ip->i_mode);
}
}
}
/*
* Update the size on disk: truncate back to original size on
* error, or reflect the new size on success.
*/
if (error) {
(void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, cred);
uio->uio_offset -= resid - uio->uio_resid;
uio->uio_resid = resid;
} else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
else
UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
/* Make sure the vnode uvm size matches the inode file size. */
KASSERT(vp->v_size == ip->i_size);
/* Write error overrides any inode update error. */
if (oerror)
error = oerror;
return error;
}
/* $NetBSD: kern_timeout.c,v 1.79 2023/10/08 13:23:05 ad Exp $ */
/*-
* Copyright (c) 2003, 2006, 2007, 2008, 2009, 2019, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2001 Thomas Nordin <nordin@openbsd.org>
* Copyright (c) 2000-2001 Artur Grabowski <art@openbsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_timeout.c,v 1.79 2023/10/08 13:23:05 ad Exp $");
/*
* Timeouts are kept in a hierarchical timing wheel. The c_time is the
* value of c_cpu->cc_ticks when the timeout should be called. There are
* four levels with 256 buckets each. See 'Scheme 7' in "Hashed and
* Hierarchical Timing Wheels: Efficient Data Structures for Implementing
* a Timer Facility" by George Varghese and Tony Lauck.
*
* Some of the "math" in here is a bit tricky. We have to beware of
* wrapping ints.
*
* We use the fact that any element added to the queue must be added with
* a positive time. That means that any element `to' on the queue cannot
* be scheduled to timeout further in time than INT_MAX, but c->c_time can
* be positive or negative so comparing it with anything is dangerous.
* The only way we can use the c->c_time value in any predictable way is
* when we calculate how far in the future `to' will timeout - "c->c_time
* - c->c_cpu->cc_ticks". The result will always be positive for future
* timeouts and 0 or negative for due timeouts.
*/
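/*
 * Illustrative example (not from the original source): with
 * cc_ticks == INT_MAX - 5 and a timeout scheduled 10 ticks out,
 * c_time wraps to INT_MIN + 4.  Comparing c_time directly with
 * cc_ticks would call that timeout "due", but the subtraction used
 * below stays correct because it is performed in unsigned arithmetic:
 *
 *	int delta = (int)((unsigned)c->c_time - (unsigned)cc->cc_ticks);
 *
 * which yields 10 here, i.e. still in the future (see the same
 * expression in callout_softclock() below).
 */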
#define _CALLOUT_PRIVATE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/callout.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/sdt.h>
#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_interface.h>
#include <ddb/db_access.h>
#include <ddb/db_cpu.h>
#include <ddb/db_sym.h>
#include <ddb/db_output.h>
#endif
#define BUCKETS 1024
#define WHEELSIZE 256
#define WHEELMASK 255
#define WHEELBITS 8
#define MASKWHEEL(wheel, time) (((time) >> ((wheel)*WHEELBITS)) & WHEELMASK)
#define BUCKET(cc, rel, abs) \
(((rel) <= (1 << (2*WHEELBITS))) \
? ((rel) <= (1 << WHEELBITS)) \
? &(cc)->cc_wheel[MASKWHEEL(0, (abs))] \
: &(cc)->cc_wheel[MASKWHEEL(1, (abs)) + WHEELSIZE] \
: ((rel) <= (1 << (3*WHEELBITS))) \
? &(cc)->cc_wheel[MASKWHEEL(2, (abs)) + 2*WHEELSIZE] \
: &(cc)->cc_wheel[MASKWHEEL(3, (abs)) + 3*WHEELSIZE])
#define MOVEBUCKET(cc, wheel, time) \
CIRCQ_APPEND(&(cc)->cc_todo, \
&(cc)->cc_wheel[MASKWHEEL((wheel), (time)) + (wheel)*WHEELSIZE])
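/*
 * Illustrative note (not from the original source): BUCKET() picks a
 * wheel level from the relative expiry time and indexes it with the
 * matching byte of the absolute time.  For example, with rel == 300
 * (more than 2^8 but at most 2^16 ticks away) the callout lands in
 * the second wheel:
 *
 *	&(cc)->cc_wheel[MASKWHEEL(1, abs) + WHEELSIZE]
 *
 * MOVEBUCKET() later cascades such buckets onto cc_todo as the lower
 * wheels roll over (see callout_hardclock()).
 */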
/*
* Circular queue definitions.
*/
#define CIRCQ_INIT(list) \
do { \
(list)->cq_next_l = (list); \
(list)->cq_prev_l = (list); \
} while (/*CONSTCOND*/0)
#define CIRCQ_INSERT(elem, list) \
do { \
(elem)->cq_prev_e = (list)->cq_prev_e; \
(elem)->cq_next_l = (list); \
(list)->cq_prev_l->cq_next_l = (elem); \
(list)->cq_prev_l = (elem); \
} while (/*CONSTCOND*/0)
#define CIRCQ_APPEND(fst, snd) \
do { \
if (!CIRCQ_EMPTY(snd)) { \
(fst)->cq_prev_l->cq_next_l = (snd)->cq_next_l; \
(snd)->cq_next_l->cq_prev_l = (fst)->cq_prev_l; \
(snd)->cq_prev_l->cq_next_l = (fst); \
(fst)->cq_prev_l = (snd)->cq_prev_l; \
CIRCQ_INIT(snd); \
} \
} while (/*CONSTCOND*/0)
#define CIRCQ_REMOVE(elem) \
do { \
(elem)->cq_next_l->cq_prev_e = (elem)->cq_prev_e; \
(elem)->cq_prev_l->cq_next_e = (elem)->cq_next_e; \
} while (/*CONSTCOND*/0)
#define CIRCQ_FIRST(list) ((list)->cq_next_e)
#define CIRCQ_NEXT(elem) ((elem)->cq_next_e)
#define CIRCQ_LAST(elem,list) ((elem)->cq_next_l == (list))
#define CIRCQ_EMPTY(list) ((list)->cq_next_l == (list))
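/*
 * Illustrative sketch (not from the original source) of how the CIRCQ
 * macros are used below: a list head and a callout's c_list are the
 * same circular node type, so a bucket is drained by repeatedly taking
 * its first element, e.g.
 *
 *	while (!CIRCQ_EMPTY(&cc->cc_todo)) {
 *		c = CIRCQ_FIRST(&cc->cc_todo);
 *		CIRCQ_REMOVE(&c->c_list);
 *		...
 *	}
 *
 * which is exactly the loop shape of callout_softclock().
 */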
struct callout_cpu {
kmutex_t *cc_lock;
sleepq_t cc_sleepq;
u_int cc_nwait;
u_int cc_ticks;
lwp_t *cc_lwp;
callout_impl_t *cc_active;
struct evcnt cc_ev_late;
struct evcnt cc_ev_block;
struct callout_circq cc_todo; /* Worklist */
struct callout_circq cc_wheel[BUCKETS]; /* Queues of timeouts */
char cc_name1[12];
char cc_name2[12];
struct cpu_info *cc_cpu;
};
#ifdef DDB
static struct callout_cpu ccb;
#endif
#ifndef CRASH /* _KERNEL */
static void callout_softclock(void *);
static void callout_wait(callout_impl_t *, void *, kmutex_t *);
static struct callout_cpu callout_cpu0 __cacheline_aligned;
static void *callout_sih __read_mostly;
SDT_PROBE_DEFINE2(sdt, kernel, callout, init,
"struct callout *"/*ch*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE1(sdt, kernel, callout, destroy,
"struct callout *"/*ch*/);
SDT_PROBE_DEFINE4(sdt, kernel, callout, setfunc,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE5(sdt, kernel, callout, schedule,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/,
"int"/*ticks*/);
SDT_PROBE_DEFINE6(sdt, kernel, callout, migrate,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/,
"struct cpu_info *"/*ocpu*/,
"struct cpu_info *"/*ncpu*/);
SDT_PROBE_DEFINE4(sdt, kernel, callout, entry,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE4(sdt, kernel, callout, return,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE5(sdt, kernel, callout, stop,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/,
"bool"/*expired*/);
SDT_PROBE_DEFINE4(sdt, kernel, callout, halt,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE5(sdt, kernel, callout, halt__done,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/,
"bool"/*expired*/);
syncobj_t callout_syncobj = {
.sobj_name = "callout",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = sleepq_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
static inline kmutex_t *
callout_lock(callout_impl_t *c)
{
struct callout_cpu *cc;
kmutex_t *lock;
for (;;) {
cc = c->c_cpu;
lock = cc->cc_lock;
mutex_spin_enter(lock);
if (__predict_true(cc == c->c_cpu))
return lock;
mutex_spin_exit(lock);
}
}
/*
* Check if the callout is currently running on an LWP that isn't curlwp.
*/
static inline bool
callout_running_somewhere_else(callout_impl_t *c, struct callout_cpu *cc)
{
	KASSERT(c->c_cpu == cc);

	return cc->cc_active == c && cc->cc_lwp != curlwp;
}
/*
* callout_startup:
*
* Initialize the callout facility, called at system startup time.
* Do just enough to allow callouts to be safely registered.
*/
void
callout_startup(void)
{
struct callout_cpu *cc;
int b;
KASSERT(curcpu()->ci_data.cpu_callout == NULL);
cc = &callout_cpu0;
cc->cc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
CIRCQ_INIT(&cc->cc_todo);
for (b = 0; b < BUCKETS; b++)
CIRCQ_INIT(&cc->cc_wheel[b]);
curcpu()->ci_data.cpu_callout = cc;
}
/*
* callout_init_cpu:
*
* Per-CPU initialization.
*/
CTASSERT(sizeof(callout_impl_t) <= sizeof(callout_t));
void
callout_init_cpu(struct cpu_info *ci)
{
struct callout_cpu *cc;
int b;
if ((cc = ci->ci_data.cpu_callout) == NULL) {
cc = kmem_zalloc(sizeof(*cc), KM_SLEEP);
cc->cc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
CIRCQ_INIT(&cc->cc_todo);
for (b = 0; b < BUCKETS; b++)
CIRCQ_INIT(&cc->cc_wheel[b]);
} else {
/* Boot CPU, one time only. */
callout_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE,
callout_softclock, NULL);
if (callout_sih == NULL)
panic("callout_init_cpu (2)");
}
sleepq_init(&cc->cc_sleepq);
snprintf(cc->cc_name1, sizeof(cc->cc_name1), "late/%u",
cpu_index(ci));
evcnt_attach_dynamic(&cc->cc_ev_late, EVCNT_TYPE_MISC,
NULL, "callout", cc->cc_name1);
snprintf(cc->cc_name2, sizeof(cc->cc_name2), "wait/%u",
cpu_index(ci));
evcnt_attach_dynamic(&cc->cc_ev_block, EVCNT_TYPE_MISC,
NULL, "callout", cc->cc_name2);
cc->cc_cpu = ci;
ci->ci_data.cpu_callout = cc;
}
/*
* callout_init:
*
* Initialize a callout structure. This must be quick, so we fill
* only the minimum number of fields.
*/
void
callout_init(callout_t *cs, u_int flags)
{
callout_impl_t *c = (callout_impl_t *)cs;
struct callout_cpu *cc;
	KASSERT((flags & ~CALLOUT_FLAGMASK) == 0);

	SDT_PROBE2(sdt, kernel, callout, init, cs, flags);
cc = curcpu()->ci_data.cpu_callout;
c->c_func = NULL;
c->c_magic = CALLOUT_MAGIC;
if (__predict_true((flags & CALLOUT_MPSAFE) != 0 && cc != NULL)) {
c->c_flags = flags;
c->c_cpu = cc;
return;
}
c->c_flags = flags | CALLOUT_BOUND;
c->c_cpu = &callout_cpu0;
}
/*
* callout_destroy:
*
* Destroy a callout structure. The callout must be stopped.
*/
void
callout_destroy(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
	SDT_PROBE1(sdt, kernel, callout, destroy, cs);

	KASSERTMSG(c->c_magic == CALLOUT_MAGIC,
"callout %p: c_magic (%#x) != CALLOUT_MAGIC (%#x)",
c, c->c_magic, CALLOUT_MAGIC);
/*
* It's not necessary to lock in order to see the correct value
* of c->c_flags. If the callout could potentially have been
* running, the current thread should have stopped it.
*/
KASSERTMSG((c->c_flags & CALLOUT_PENDING) == 0,
"pending callout %p: c_func (%p) c_flags (%#x) destroyed from %p",
c, c->c_func, c->c_flags, __builtin_return_address(0));
KASSERTMSG(!callout_running_somewhere_else(c, c->c_cpu),
"running callout %p: c_func (%p) c_flags (%#x) destroyed from %p",
c, c->c_func, c->c_flags, __builtin_return_address(0));
c->c_magic = 0;
}
/*
* callout_schedule_locked:
*
* Schedule a callout to run. The function and argument must
* already be set in the callout structure. Must be called with
* callout_lock.
*/
static void
callout_schedule_locked(callout_impl_t *c, kmutex_t *lock, int to_ticks)
{
struct callout_cpu *cc, *occ;
int old_time;
SDT_PROBE5(sdt, kernel, callout, schedule,
c, c->c_func, c->c_arg, c->c_flags, to_ticks);
	KASSERT(to_ticks >= 0);
	KASSERT(c->c_func != NULL);
/* Initialize the time here, it won't change. */
occ = c->c_cpu;
c->c_flags &= ~(CALLOUT_FIRED | CALLOUT_INVOKING);
/*
* If this timeout is already scheduled and now is moved
* earlier, reschedule it now. Otherwise leave it in place
* and let it be rescheduled later.
*/
if ((c->c_flags & CALLOUT_PENDING) != 0) {
/* Leave on existing CPU. */
old_time = c->c_time;
c->c_time = to_ticks + occ->cc_ticks;
		if (c->c_time - old_time < 0) {
			CIRCQ_REMOVE(&c->c_list);
CIRCQ_INSERT(&c->c_list, &occ->cc_todo);
}
mutex_spin_exit(lock);
return;
}
cc = curcpu()->ci_data.cpu_callout;
if ((c->c_flags & CALLOUT_BOUND) != 0 || cc == occ ||
!mutex_tryenter(cc->cc_lock)) {
/* Leave on existing CPU. */
c->c_time = to_ticks + occ->cc_ticks;
c->c_flags |= CALLOUT_PENDING;
CIRCQ_INSERT(&c->c_list, &occ->cc_todo);
} else {
/* Move to this CPU. */
c->c_cpu = cc;
c->c_time = to_ticks + cc->cc_ticks;
c->c_flags |= CALLOUT_PENDING;
CIRCQ_INSERT(&c->c_list, &cc->cc_todo);
mutex_spin_exit(cc->cc_lock);
SDT_PROBE6(sdt, kernel, callout, migrate,
c, c->c_func, c->c_arg, c->c_flags,
occ->cc_cpu, cc->cc_cpu);
}
mutex_spin_exit(lock);
}
/*
* callout_reset:
*
* Reset a callout structure with a new function and argument, and
* schedule it to run.
*/
void
callout_reset(callout_t *cs, int to_ticks, void (*func)(void *), void *arg)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
	KASSERT(c->c_magic == CALLOUT_MAGIC);
	KASSERT(func != NULL);

	lock = callout_lock(c);
	SDT_PROBE4(sdt, kernel, callout, setfunc, cs, func, arg, c->c_flags);
c->c_func = func;
c->c_arg = arg;
callout_schedule_locked(c, lock, to_ticks);
}
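/*
 * Illustrative usage sketch (an assumption, not part of the original
 * source): a typical MP-safe consumer initializes the callout once and
 * then (re)arms it, e.g.
 *
 *	callout_init(&sc->sc_tick, CALLOUT_MPSAFE);
 *	callout_reset(&sc->sc_tick, hz, mydrv_tick, sc);
 *
 * where sc, sc_tick and mydrv_tick are hypothetical driver names.
 */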
/*
* callout_schedule:
*
* Schedule a callout to run. The function and argument must
* already be set in the callout structure.
*/
void
callout_schedule(callout_t *cs, int to_ticks)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);
callout_schedule_locked(c, lock, to_ticks);
}
/*
* callout_stop:
*
* Try to cancel a pending callout. It may be too late: the callout
* could be running on another CPU. If called from interrupt context,
* the callout could already be in progress at a lower priority.
*/
bool
callout_stop(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool expired;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);

	if ((c->c_flags & CALLOUT_PENDING) != 0)
		CIRCQ_REMOVE(&c->c_list);
expired = ((c->c_flags & CALLOUT_FIRED) != 0);
c->c_flags &= ~(CALLOUT_PENDING|CALLOUT_FIRED);
SDT_PROBE5(sdt, kernel, callout, stop,
c, c->c_func, c->c_arg, c->c_flags, expired);
mutex_spin_exit(lock);
return expired;
}
/*
* callout_halt:
*
* Cancel a pending callout. If in-flight, block until it completes.
* May not be called from a hard interrupt handler. If the callout
* can take locks, the caller of callout_halt() must not hold any of
* those locks, otherwise the two could deadlock. If 'interlock' is
* non-NULL and we must wait for the callout to complete, it will be
* released and re-acquired before returning.
*/
bool
callout_halt(callout_t *cs, void *interlock)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
	KASSERT(c->c_magic == CALLOUT_MAGIC);
	KASSERT(!cpu_intr_p());
	KASSERT(interlock == NULL || mutex_owned(interlock));
/* Fast path. */
	lock = callout_lock(c);
	SDT_PROBE4(sdt, kernel, callout, halt,
c, c->c_func, c->c_arg, c->c_flags);
	if ((c->c_flags & CALLOUT_PENDING) != 0)
		CIRCQ_REMOVE(&c->c_list);
c->c_flags &= ~(CALLOUT_PENDING|CALLOUT_FIRED);
	if (__predict_false(callout_running_somewhere_else(c, c->c_cpu))) {
		callout_wait(c, interlock, lock);
return true;
}
SDT_PROBE5(sdt, kernel, callout, halt__done,
c, c->c_func, c->c_arg, c->c_flags, /*expired*/false);
mutex_spin_exit(lock);
return false;
}
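/*
 * Illustrative usage sketch (an assumption, not part of the original
 * source): when the callout handler takes a driver lock, that lock is
 * passed as the interlock so it can be dropped while waiting, e.g.
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_dying = true;
 *	callout_halt(&sc->sc_tick, &sc->sc_lock);
 *	mutex_exit(&sc->sc_lock);
 *
 * sc, sc_lock, sc_dying and sc_tick are hypothetical driver names.
 */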
/*
* callout_wait:
*
* Slow path for callout_halt(). Deliberately marked __noinline to
* prevent unneeded overhead in the caller.
*/
static void __noinline
callout_wait(callout_impl_t *c, void *interlock, kmutex_t *lock)
{
struct callout_cpu *cc;
struct lwp *l;
kmutex_t *relock;
int nlocks;
l = curlwp;
relock = NULL;
for (;;) {
/*
* At this point we know the callout is not pending, but it
* could be running on a CPU somewhere. That can be curcpu
* in a few cases:
*
* - curlwp is a higher priority soft interrupt
* - the callout blocked on a lock and is currently asleep
* - the callout itself has called callout_halt() (nice!)
*/
cc = c->c_cpu;
if (__predict_true(!callout_running_somewhere_else(c, cc)))
break;
/* It's running - need to wait for it to complete. */
if (interlock != NULL) {
/*
* Avoid potential scheduler lock order problems by
* dropping the interlock without the callout lock
* held; then retry.
*/
mutex_spin_exit(lock);
mutex_exit(interlock);
relock = interlock;
interlock = NULL;
} else {
/* XXX Better to do priority inheritance. */
KASSERT(l->l_wchan == NULL);
cc->cc_nwait++;
cc->cc_ev_block.ev_count++;
nlocks = sleepq_enter(&cc->cc_sleepq, l, cc->cc_lock);
sleepq_enqueue(&cc->cc_sleepq, cc, "callout",
&callout_syncobj, false);
sleepq_block(0, false, &callout_syncobj, nlocks);
}
/*
* Re-lock the callout and check the state of play again.
* It's a common design pattern for callouts to re-schedule
* themselves so put a stop to it again if needed.
*/
lock = callout_lock(c);
if ((c->c_flags & CALLOUT_PENDING) != 0)
CIRCQ_REMOVE(&c->c_list);
c->c_flags &= ~(CALLOUT_PENDING|CALLOUT_FIRED);
}
SDT_PROBE5(sdt, kernel, callout, halt__done,
c, c->c_func, c->c_arg, c->c_flags, /*expired*/true);
mutex_spin_exit(lock);
if (__predict_false(relock != NULL))
mutex_enter(relock);
}
#ifdef notyet
/*
* callout_bind:
*
* Bind a callout so that it will only execute on one CPU.
* The callout must be stopped, and must be MPSAFE.
*
* XXX Disabled for now until it is decided how to handle
* offlined CPUs. We may want weak+strong binding.
*/
void
callout_bind(callout_t *cs, struct cpu_info *ci)
{
callout_impl_t *c = (callout_impl_t *)cs;
struct callout_cpu *cc;
kmutex_t *lock;
KASSERT((c->c_flags & CALLOUT_PENDING) == 0);
KASSERT(c->c_cpu->cc_active != c);
KASSERT(c->c_magic == CALLOUT_MAGIC);
KASSERT((c->c_flags & CALLOUT_MPSAFE) != 0);
lock = callout_lock(c);
cc = ci->ci_data.cpu_callout;
c->c_flags |= CALLOUT_BOUND;
if (c->c_cpu != cc) {
/*
* Assigning c_cpu effectively unlocks the callout
* structure, as we don't hold the new CPU's lock.
* Issue memory barrier to prevent accesses being
* reordered.
*/
membar_exit();
c->c_cpu = cc;
}
mutex_spin_exit(lock);
}
#endif
void
callout_setfunc(callout_t *cs, void (*func)(void *), void *arg)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
	KASSERT(c->c_magic == CALLOUT_MAGIC);
	KASSERT(func != NULL);

	lock = callout_lock(c);
	SDT_PROBE4(sdt, kernel, callout, setfunc, cs, func, arg, c->c_flags);
c->c_func = func;
c->c_arg = arg;
mutex_spin_exit(lock);
}
bool
callout_expired(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool rv;
KASSERT(c->c_magic == CALLOUT_MAGIC);
lock = callout_lock(c);
rv = ((c->c_flags & CALLOUT_FIRED) != 0);
mutex_spin_exit(lock);
return rv;
}
bool
callout_active(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool rv;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);
rv = ((c->c_flags & (CALLOUT_PENDING|CALLOUT_FIRED)) != 0);
mutex_spin_exit(lock);
return rv;
}
bool
callout_pending(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool rv;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);
rv = ((c->c_flags & CALLOUT_PENDING) != 0);
mutex_spin_exit(lock);
return rv;
}
bool
callout_invoking(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool rv;
KASSERT(c->c_magic == CALLOUT_MAGIC);
lock = callout_lock(c);
rv = ((c->c_flags & CALLOUT_INVOKING) != 0);
mutex_spin_exit(lock);
return rv;
}
void
callout_ack(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
KASSERT(c->c_magic == CALLOUT_MAGIC);
lock = callout_lock(c);
c->c_flags &= ~CALLOUT_INVOKING;
mutex_spin_exit(lock);
}
/*
* callout_hardclock:
*
* Called from hardclock() once every tick. We schedule a soft
* interrupt if there is work to be done.
*/
void
callout_hardclock(void)
{
struct callout_cpu *cc;
int needsoftclock, ticks;
cc = curcpu()->ci_data.cpu_callout;
mutex_spin_enter(cc->cc_lock);
ticks = ++cc->cc_ticks;
MOVEBUCKET(cc, 0, ticks);
if (MASKWHEEL(0, ticks) == 0) {
MOVEBUCKET(cc, 1, ticks);
if (MASKWHEEL(1, ticks) == 0) {
MOVEBUCKET(cc, 2, ticks);
if (MASKWHEEL(2, ticks) == 0)
MOVEBUCKET(cc, 3, ticks);
}
}
needsoftclock = !CIRCQ_EMPTY(&cc->cc_todo);
mutex_spin_exit(cc->cc_lock);
if (needsoftclock)
softint_schedule(callout_sih);
}
/*
* callout_softclock:
*
* Soft interrupt handler, scheduled above if there is work to
* be done. Callouts are made in soft interrupt context.
*/
static void
callout_softclock(void *v)
{
callout_impl_t *c;
struct callout_cpu *cc;
void (*func)(void *);
void *arg;
int mpsafe, count, ticks, delta;
u_int flags __unused;
lwp_t *l;
l = curlwp;
KASSERT(l->l_cpu == curcpu());
cc = l->l_cpu->ci_data.cpu_callout;
mutex_spin_enter(cc->cc_lock);
cc->cc_lwp = l;
while (!CIRCQ_EMPTY(&cc->cc_todo)) {
c = CIRCQ_FIRST(&cc->cc_todo);
KASSERT(c->c_magic == CALLOUT_MAGIC);
KASSERT(c->c_func != NULL);
KASSERT(c->c_cpu == cc);
KASSERT((c->c_flags & CALLOUT_PENDING) != 0);
KASSERT((c->c_flags & CALLOUT_FIRED) == 0);
CIRCQ_REMOVE(&c->c_list);
/* If due run it, otherwise insert it into the right bucket. */
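/*
* Editorial note: the unsigned subtraction below, cast back to int,
* yields a signed tick distance that remains correct even after the
* per-CPU tick counter wraps around.
*/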
ticks = cc->cc_ticks;
delta = (int)((unsigned)c->c_time - (unsigned)ticks);
if (delta > 0) {
CIRCQ_INSERT(&c->c_list, BUCKET(cc, delta, c->c_time));
continue;
}
if (delta < 0)
cc->cc_ev_late.ev_count++;
c->c_flags = (c->c_flags & ~CALLOUT_PENDING) |
(CALLOUT_FIRED | CALLOUT_INVOKING);
mpsafe = (c->c_flags & CALLOUT_MPSAFE);
func = c->c_func;
arg = c->c_arg;
cc->cc_active = c;
flags = c->c_flags;
mutex_spin_exit(cc->cc_lock);
KASSERT(func != NULL);
SDT_PROBE4(sdt, kernel, callout, entry, c, func, arg, flags);
if (__predict_false(!mpsafe)) {
KERNEL_LOCK(1, NULL);
(*func)(arg);
KERNEL_UNLOCK_ONE(NULL);
} else
(*func)(arg);
SDT_PROBE4(sdt, kernel, callout, return, c, func, arg, flags);
KASSERTMSG(l->l_blcnt == 0,
"callout %p func %p leaked %d biglocks",
c, func, l->l_blcnt);
mutex_spin_enter(cc->cc_lock);
/*
* We can't touch 'c' here because it might already
* have been freed. If any LWPs are waiting for the
* callout to complete, awaken them.
*/
cc->cc_active = NULL;
if ((count = cc->cc_nwait) != 0) {
cc->cc_nwait = 0;
/* sleepq_wake() drops the lock. */
sleepq_wake(&cc->cc_sleepq, cc, count, cc->cc_lock);
mutex_spin_enter(cc->cc_lock);
}
}
cc->cc_lwp = NULL;
mutex_spin_exit(cc->cc_lock);
}
#endif /* !CRASH */
#ifdef DDB
static void
db_show_callout_bucket(struct callout_cpu *cc, struct callout_circq *kbucket,
struct callout_circq *bucket)
{
callout_impl_t *c, ci;
db_expr_t offset;
const char *name;
static char question[] = "?";
int b;
if (CIRCQ_LAST(bucket, kbucket))
return;
for (c = CIRCQ_FIRST(bucket); /*nothing*/; c = CIRCQ_NEXT(&c->c_list)) {
db_read_bytes((db_addr_t)c, sizeof(ci), (char *)&ci);
c = &ci;
db_find_sym_and_offset((db_addr_t)(intptr_t)c->c_func, &name,
&offset);
name = name ? name : question;
b = (bucket - cc->cc_wheel);
if (b < 0)
b = -WHEELSIZE;
db_printf("%9d %2d/%-4d %16lx %s\n",
c->c_time - cc->cc_ticks, b / WHEELSIZE, b,
(u_long)c->c_arg, name);
if (CIRCQ_LAST(&c->c_list, kbucket))
break;
}
}
void
db_show_callout(db_expr_t addr, bool haddr, db_expr_t count, const char *modif)
{
struct callout_cpu *cc, ccb;
struct cpu_info *ci;
int b;
#ifndef CRASH
db_printf("hardclock_ticks now: %d\n", getticks());
#endif
db_printf(" ticks wheel arg func\n");
/*
* Don't lock the callwheel; all the other CPUs are paused
* anyhow, and we might be called in a circumstance where
* some other CPU was paused while holding the lock.
*/
for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) {
db_read_bytes((db_addr_t)ci +
offsetof(struct cpu_info, ci_data.cpu_callout),
sizeof(cc), (char *)&cc);
db_read_bytes((db_addr_t)cc, sizeof(ccb), (char *)&ccb);
db_show_callout_bucket(&ccb, &cc->cc_todo, &ccb.cc_todo);
}
for (b = 0; b < BUCKETS; b++) {
for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) {
db_read_bytes((db_addr_t)ci +
offsetof(struct cpu_info, ci_data.cpu_callout),
sizeof(cc), (char *)&cc);
db_read_bytes((db_addr_t)cc, sizeof(ccb), (char *)&ccb);
db_show_callout_bucket(&ccb, &cc->cc_wheel[b],
&ccb.cc_wheel[b]);
}
}
}
#endif /* DDB */
/* $NetBSD: cpu.c,v 1.209 2023/07/16 19:55:43 riastradh Exp $ */
/*
* Copyright (c) 2000-2020 NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Bill Sommerfeld of RedBack Networks Inc, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1999 Stefan Grefen
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR AND CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.209 2023/07/16 19:55:43 riastradh Exp $");
#include "opt_ddb.h"
#include "opt_mpbios.h" /* for MPDEBUG */
#include "opt_mtrr.h"
#include "opt_multiprocessor.h"
#include "opt_svs.h"
#include "lapic.h"
#include "ioapic.h"
#include "acpica.h"
#include "hpet.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/device.h>
#include <sys/cpu.h>
#include <sys/cpufreq.h>
#include <sys/idle.h>
#include <sys/atomic.h>
#include <sys/reboot.h>
#include <sys/csan.h>
#include <uvm/uvm.h>
#include "acpica.h" /* for NACPICA, for mp_verbose */
#include <x86/machdep.h>
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#if defined(MULTIPROCESSOR)
#include <machine/mpbiosvar.h>
#endif
#include <machine/mpconfig.h> /* for mp_verbose */
#include <machine/pcb.h>
#include <machine/specialreg.h>
#include <machine/segments.h>
#include <machine/gdt.h>
#include <machine/mtrr.h>
#include <machine/pio.h>
#include <machine/cpu_counter.h>
#include <machine/pmap_private.h>
#include <x86/fpu.h>
#if NACPICA > 0
#include <dev/acpi/acpi_srat.h>
#endif
#if NLAPIC > 0
#include <machine/apicvar.h>
#include <machine/i82489reg.h>
#include <machine/i82489var.h>
#endif
#include <dev/ic/mc146818reg.h>
#include <dev/ic/hpetvar.h>
#include <i386/isa/nvram.h>
#include <dev/isa/isareg.h>
#include "tsc.h"
#ifndef XENPV
#include "hyperv.h"
#if NHYPERV > 0
#include <x86/x86/hypervvar.h>
#endif
#endif
#ifdef XEN
#include <xen/hypervisor.h>
#endif
static int cpu_match(device_t, cfdata_t, void *);
static void cpu_attach(device_t, device_t, void *);
static void cpu_defer(device_t);
static int cpu_rescan(device_t, const char *, const int *);
static void cpu_childdetached(device_t, device_t);
static bool cpu_stop(device_t);
static bool cpu_suspend(device_t, const pmf_qual_t *);
static bool cpu_resume(device_t, const pmf_qual_t *);
static bool cpu_shutdown(device_t, int);
struct cpu_softc {
device_t sc_dev; /* device tree glue */
struct cpu_info *sc_info; /* pointer to CPU info */
bool sc_wasonline;
};
#ifdef MULTIPROCESSOR
int mp_cpu_start(struct cpu_info *, paddr_t);
void mp_cpu_start_cleanup(struct cpu_info *);
const struct cpu_functions mp_cpu_funcs = { mp_cpu_start, NULL,
mp_cpu_start_cleanup };
#endif
CFATTACH_DECL2_NEW(cpu, sizeof(struct cpu_softc),
cpu_match, cpu_attach, NULL, NULL, cpu_rescan, cpu_childdetached);
/*
* Statically-allocated CPU info for the primary CPU (or the only
* CPU, on uniprocessors). The CPU info list is initialized to
* point at it.
*/
struct cpu_info cpu_info_primary __aligned(CACHE_LINE_SIZE) = {
.ci_dev = 0,
.ci_self = &cpu_info_primary,
.ci_idepth = -1,
.ci_curlwp = &lwp0,
.ci_curldt = -1,
.ci_kfpu_spl = -1,
};
struct cpu_info *cpu_info_list = &cpu_info_primary;
#ifdef i386
void cpu_set_tss_gates(struct cpu_info *);
#endif
static void cpu_init_idle_lwp(struct cpu_info *);
uint32_t cpu_feature[7] __read_mostly; /* X86 CPUID feature bits */
/* [0] basic features cpuid.1:%edx
* [1] basic features cpuid.1:%ecx (CPUID2_xxx bits)
* [2] extended features cpuid:80000001:%edx
* [3] extended features cpuid:80000001:%ecx
* [4] VIA padlock features
* [5] structured extended features cpuid.7:%ebx
* [6] structured extended features cpuid.7:%ecx
*/
#ifdef MULTIPROCESSOR
bool x86_mp_online;
paddr_t mp_trampoline_paddr = MP_TRAMPOLINE;
#endif
#if NLAPIC > 0
static vaddr_t cmos_data_mapping;
#endif
struct cpu_info *cpu_starting;
#ifdef MULTIPROCESSOR
void cpu_hatch(void *);
static void cpu_boot_secondary(struct cpu_info *ci);
static void cpu_start_secondary(struct cpu_info *ci);
#if NLAPIC > 0
static void cpu_copy_trampoline(paddr_t);
#endif
#endif /* MULTIPROCESSOR */
/*
* Runs once per boot once multiprocessor goo has been detected and
* the local APIC on the boot processor has been mapped.
*
* Called from lapic_boot_init() (from mpbios_scan()).
*/
#if NLAPIC > 0
void
cpu_init_first(void)
{
cpu_info_primary.ci_cpuid = lapic_cpu_number();
cmos_data_mapping = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY);
if (cmos_data_mapping == 0)
panic("No KVA for page 0");
pmap_kenter_pa(cmos_data_mapping, 0, VM_PROT_READ|VM_PROT_WRITE, 0);
pmap_update(pmap_kernel());
}
#endif
static int
cpu_match(device_t parent, cfdata_t match, void *aux)
{
return 1;
}
#ifdef __HAVE_PCPU_AREA
void
cpu_pcpuarea_init(struct cpu_info *ci)
{
struct vm_page *pg;
size_t i, npages;
vaddr_t base, va;
paddr_t pa;
CTASSERT(sizeof(struct pcpu_entry) % PAGE_SIZE == 0);
npages = sizeof(struct pcpu_entry) / PAGE_SIZE;
base = (vaddr_t)&pcpuarea->ent[cpu_index(ci)];
for (i = 0; i < npages; i++) {
pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
if (pg == NULL) {
panic("failed to allocate pcpu PA");
}
va = base + i * PAGE_SIZE;
pa = VM_PAGE_TO_PHYS(pg);
pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0);
}
pmap_update(pmap_kernel());
}
#endif
static void
cpu_vm_init(struct cpu_info *ci)
{
unsigned int ncolors = 2;
/*
* XXX: for APs the cache info has not been initialized yet,
* but that does not matter because uvm only pays attention to
* the maximum. We should fix this once CPUs can have different
* cache sizes.
*/
for (unsigned int i = CAI_ICACHE; i <= CAI_L2CACHE; i++) {
struct x86_cache_info *cai;
unsigned int tcolors;
cai = &ci->ci_cinfo[i];
tcolors = atop(cai->cai_totalsize);
switch (cai->cai_associativity) {
case 0xff:
tcolors = 1; /* fully associative */
break;
case 0:
case 1:
break;
default:
tcolors /= cai->cai_associativity;
}
if (tcolors <= ncolors)
continue;
ncolors = tcolors;
}
/*
* If the desired number of colors is not a power of
* two, it won't be good. Find the greatest power of
* two which is an even divisor of the number of colors,
* to preserve even coloring of pages.
*/
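/*
* Editorial example: with ncolors = 12 the loop below tries the powers
* of two 1, 2, 4 and 8; the largest that divides 12 evenly is 4, so
* ncolors would be reduced to 4.
*/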
if (ncolors & (ncolors - 1) ) {
unsigned int try, picked = 1;
for (try = 1; try < ncolors; try *= 2) {
if (ncolors % try == 0) picked = try;
}
if (picked == 1) {
panic("desired number of cache colors %u is "
" > 1, but not even!", ncolors);
}
ncolors = picked;
}
/*
* Knowing the size of the largest cache on this CPU, potentially
* re-color our pages.
*/
aprint_debug_dev(ci->ci_dev, "%d page colors\n", ncolors);
uvm_page_recolor(ncolors);
pmap_tlb_cpu_init(ci);
#ifndef __HAVE_DIRECT_MAP
pmap_vpage_cpu_init(ci);
#endif
}
static void
cpu_attach(device_t parent, device_t self, void *aux)
{
struct cpu_softc *sc = device_private(self);
struct cpu_attach_args *caa = aux;
struct cpu_info *ci;
uintptr_t ptr;
#if NLAPIC > 0
int cpunum = caa->cpu_number;
#endif
static bool again;
sc->sc_dev = self;
if (ncpu > maxcpus) {
#ifndef _LP64
aprint_error(": too many CPUs, please use NetBSD/amd64\n");
#else
aprint_error(": too many CPUs\n");
#endif
return;
}
/*
* If we're an Application Processor, allocate a cpu_info
* structure, otherwise use the primary's.
*/
if (caa->cpu_role == CPU_ROLE_AP) {
if ((boothowto & RB_MD1) != 0) {
aprint_error(": multiprocessor boot disabled\n");
if (!pmf_device_register(self, NULL, NULL))
aprint_error_dev(self,
"couldn't establish power handler\n");
return;
}
aprint_naive(": Application Processor\n");
ptr = (uintptr_t)uvm_km_alloc(kernel_map,
sizeof(*ci) + CACHE_LINE_SIZE - 1, 0,
UVM_KMF_WIRED|UVM_KMF_ZERO);
ci = (struct cpu_info *)roundup2(ptr, CACHE_LINE_SIZE);
ci->ci_curldt = -1;
} else {
aprint_naive(": %s Processor\n",
caa->cpu_role == CPU_ROLE_SP ? "Single" : "Boot");
ci = &cpu_info_primary;
#if NLAPIC > 0
if (cpunum != lapic_cpu_number()) {
/* XXX should be done earlier. */
uint32_t reg;
aprint_verbose("\n");
aprint_verbose_dev(self, "running CPU at apic %d"
" instead of at expected %d", lapic_cpu_number(),
cpunum);
reg = lapic_readreg(LAPIC_ID);
lapic_writereg(LAPIC_ID, (reg & ~LAPIC_ID_MASK) |
(cpunum << LAPIC_ID_SHIFT));
}
if (cpunum != lapic_cpu_number()) {
aprint_error_dev(self, "unable to reset apic id\n");
}
#endif
}
ci->ci_self = ci;
sc->sc_info = ci;
ci->ci_dev = self;
ci->ci_acpiid = caa->cpu_id;
ci->ci_cpuid = caa->cpu_number;
ci->ci_func = caa->cpu_func;
ci->ci_kfpu_spl = -1;
aprint_normal("\n");
/* Must be before mi_cpu_attach(). */
cpu_vm_init(ci);
if (caa->cpu_role == CPU_ROLE_AP) {
int error;
error = mi_cpu_attach(ci);
if (error != 0) {
aprint_error_dev(self,
"mi_cpu_attach failed with %d\n", error);
return;
}
#ifdef __HAVE_PCPU_AREA
cpu_pcpuarea_init(ci);
#endif
cpu_init_tss(ci);
} else {
KASSERT(ci->ci_data.cpu_idlelwp != NULL);
#if NACPICA > 0
/* Parse out NUMA info for cpu_identify(). */
acpisrat_init();
#endif
}
#ifdef SVS
cpu_svs_init(ci);
#endif
pmap_reference(pmap_kernel());
ci->ci_pmap = pmap_kernel();
ci->ci_tlbstate = TLBSTATE_STALE;
/*
* Boot processor may not be attached first, but the below
* must be done to allow booting other processors.
*/
if (!again) {
/* Make sure DELAY() (likely i8254_delay()) is initialized. */
DELAY(1);
/*
* Basic init. Compute an approximate frequency for the TSC
* using the i8254. If there's a HPET we'll redo it later.
*/
atomic_or_32(&ci->ci_flags, CPUF_PRESENT | CPUF_PRIMARY);
cpu_intr_init(ci);
tsc_setfunc(ci);
cpu_get_tsc_freq(ci);
cpu_init(ci);
#ifdef i386
cpu_set_tss_gates(ci);
#endif
pmap_cpu_init_late(ci);
#if NLAPIC > 0
if (caa->cpu_role != CPU_ROLE_SP) {
/* Enable lapic. */
lapic_enable();
lapic_set_lvt();
if (!vm_guest_is_xenpvh_or_pvhvm())
lapic_calibrate_timer(false);
}
#endif
kcsan_cpu_init(ci);
again = true;
}
/* further PCB init done later. */
switch (caa->cpu_role) {
case CPU_ROLE_SP:
atomic_or_32(&ci->ci_flags, CPUF_SP);
cpu_identify(ci);
x86_errata();
x86_cpu_idle_init();
#ifdef XENPVHVM
xen_hvm_init_cpu(ci);
#endif
break;
case CPU_ROLE_BP:
atomic_or_32(&ci->ci_flags, CPUF_BSP);
cpu_identify(ci);
x86_errata();
x86_cpu_idle_init();
#ifdef XENPVHVM
xen_hvm_init_cpu(ci);
#endif
break;
#ifdef MULTIPROCESSOR
case CPU_ROLE_AP:
/*
* report on an AP
*/
cpu_intr_init(ci);
idt_vec_init_cpu_md(&ci->ci_idtvec, cpu_index(ci));
gdt_alloc_cpu(ci);
#ifdef i386
cpu_set_tss_gates(ci);
#endif
pmap_cpu_init_late(ci);
cpu_start_secondary(ci);
if (ci->ci_flags & CPUF_PRESENT) {
struct cpu_info *tmp;
cpu_identify(ci);
tmp = cpu_info_list;
while (tmp->ci_next)
tmp = tmp->ci_next;
tmp->ci_next = ci;
}
break;
#endif
default:
panic("unknown processor type??\n");
}
pat_init(ci);
if (!pmf_device_register1(self, cpu_suspend, cpu_resume, cpu_shutdown))
aprint_error_dev(self, "couldn't establish power handler\n");
#ifdef MULTIPROCESSOR
if (mp_verbose) {
struct lwp *l = ci->ci_data.cpu_idlelwp;
struct pcb *pcb = lwp_getpcb(l);
aprint_verbose_dev(self,
"idle lwp at %p, idle sp at %p\n",
l,
#ifdef i386
(void *)pcb->pcb_esp
#else
(void *)pcb->pcb_rsp
#endif
);
}
#endif
/*
* Postpone the "cpufeaturebus" scan.
* It is safe to scan the pseudo-bus
* only after all CPUs have attached.
*/
(void)config_defer(self, cpu_defer);
}
static void
cpu_defer(device_t self)
{
cpu_rescan(self, NULL, NULL);
}
static int
cpu_rescan(device_t self, const char *ifattr, const int *locators)
{
struct cpu_softc *sc = device_private(self);
struct cpufeature_attach_args cfaa;
struct cpu_info *ci = sc->sc_info;
/*
* If we booted with RB_MD1 to disable multiprocessor, the
* auto-configuration data still contains the additional
* CPUs. But their initialization was mostly bypassed
* during attach, so we have to make sure we don't look at
* their featurebus info, since it wasn't retrieved.
*/
if (ci == NULL)
return 0;
memset(&cfaa, 0, sizeof(cfaa));
cfaa.ci = ci;
if (ifattr_match(ifattr, "cpufeaturebus")) {
if (ci->ci_frequency == NULL) {
cfaa.name = "frequency";
ci->ci_frequency =
config_found(self, &cfaa, NULL,
CFARGS(.iattr = "cpufeaturebus"));
}
if (ci->ci_padlock == NULL) {
cfaa.name = "padlock";
ci->ci_padlock =
config_found(self, &cfaa, NULL,
CFARGS(.iattr = "cpufeaturebus"));
}
if (ci->ci_temperature == NULL) {
cfaa.name = "temperature";
ci->ci_temperature =
config_found(self, &cfaa, NULL,
CFARGS(.iattr = "cpufeaturebus"));
}
if (ci->ci_vm == NULL) {
cfaa.name = "vm";
ci->ci_vm =
config_found(self, &cfaa, NULL,
CFARGS(.iattr = "cpufeaturebus"));
}
}
return 0;
}
static void
cpu_childdetached(device_t self, device_t child)
{
struct cpu_softc *sc = device_private(self);
struct cpu_info *ci = sc->sc_info;
if (ci->ci_frequency == child)
ci->ci_frequency = NULL;
if (ci->ci_padlock == child)
ci->ci_padlock = NULL;
if (ci->ci_temperature == child)
ci->ci_temperature = NULL;
if (ci->ci_vm == child)
ci->ci_vm = NULL;
}
/*
* Initialize the processor appropriately.
*/
void
cpu_init(struct cpu_info *ci)
{
extern int x86_fpu_save;
uint32_t cr4 = 0;
lcr0(rcr0() | CR0_WP);
/* If global TLB caching is supported, enable it */
if (cpu_feature[0] & CPUID_PGE)
cr4 |= CR4_PGE;
/*
* If we have FXSAVE/FXRESTOR, use them.
*/
if (cpu_feature[0] & CPUID_FXSR) {
cr4 |= CR4_OSFXSR;
/*
* If we have SSE/SSE2, enable XMM exceptions.
*/
if (cpu_feature[0] & (CPUID_SSE|CPUID_SSE2))
cr4 |= CR4_OSXMMEXCPT;
}
/* If xsave is supported, enable it */
if (cpu_feature[1] & CPUID2_XSAVE)
cr4 |= CR4_OSXSAVE;
/* If SMEP is supported, enable it */
if (cpu_feature[5] & CPUID_SEF_SMEP)
cr4 |= CR4_SMEP;
/* If SMAP is supported, enable it */
if (cpu_feature[5] & CPUID_SEF_SMAP)
cr4 |= CR4_SMAP;
#ifdef SVS
/* If PCID is supported, enable it */
if (svs_pcid)
cr4 |= CR4_PCIDE;
#endif
if (cr4) {
cr4 |= rcr4();
lcr4(cr4);
}
/*
* Changing CR4 register may change cpuid values. For example, setting
* CR4_OSXSAVE sets CPUID2_OSXSAVE. The CPUID2_OSXSAVE is in
* ci_feat_val[1], so update it.
* XXX Other than ci_feat_val[1] might be changed.
*/
if (cpuid_level >= 1) {
u_int descs[4];
x86_cpuid(1, descs);
ci->ci_feat_val[1] = descs[2];
}
if (CPU_IS_PRIMARY(ci) &&
x86_fpu_save >= FPU_SAVE_FXSAVE) {
fpuinit_mxcsr_mask();
}
/* If xsave is enabled, enable all fpu features */
if (cr4 & CR4_OSXSAVE)
wrxcr(0, x86_xsave_features & XCR0_FPU);
#ifdef MTRR
/*
* On a P6 or above, initialize MTRR's if the hardware supports them.
*/
if (cpu_feature[0] & CPUID_MTRR) {
if ((ci->ci_flags & CPUF_AP) == 0)
i686_mtrr_init_first();
mtrr_init_cpu(ci);
}
#ifdef i386
if (strcmp((char *)(ci->ci_vendor), "AuthenticAMD") == 0) {
/*
* Must be a K6-2 Step >= 7 or a K6-III.
*/
if (CPUID_TO_FAMILY(ci->ci_signature) == 5) {
if (CPUID_TO_MODEL(ci->ci_signature) > 8 ||
(CPUID_TO_MODEL(ci->ci_signature) == 8 &&
CPUID_TO_STEPPING(ci->ci_signature) >= 7)) {
mtrr_funcs = &k6_mtrr_funcs;
k6_mtrr_init_first();
mtrr_init_cpu(ci);
}
}
}
#endif /* i386 */
#endif /* MTRR */
if (ci != &cpu_info_primary) {
/* Synchronize TSC */
atomic_or_32(&ci->ci_flags, CPUF_RUNNING);
tsc_sync_ap(ci);
} else {
atomic_or_32(&ci->ci_flags, CPUF_RUNNING);
}
}
#ifdef MULTIPROCESSOR
void
cpu_boot_secondary_processors(void)
{
struct cpu_info *ci;
kcpuset_t *cpus;
u_long i;
/* Now that we know the number of CPUs, patch the text segment. */
x86_patch(false);
#if NACPICA > 0
/* Finished with NUMA info for now. */
acpisrat_exit();
#endif
kcpuset_create(&cpus, true);
kcpuset_set(cpus, cpu_index(curcpu()));
for (i = 0; i < maxcpus; i++) {
ci = cpu_lookup(i);
if (ci == NULL)
continue;
if (ci->ci_data.cpu_idlelwp == NULL)
continue;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
continue;
if (ci->ci_flags & (CPUF_BSP|CPUF_SP|CPUF_PRIMARY))
continue;
cpu_boot_secondary(ci);
kcpuset_set(cpus, cpu_index(ci));
}
while (!kcpuset_match(cpus, kcpuset_running))
;
kcpuset_destroy(cpus);
x86_mp_online = true;
/* Now that we know about the TSC, attach the timecounter. */
tsc_tc_init();
}
#endif
static void
cpu_init_idle_lwp(struct cpu_info *ci)
{
struct lwp *l = ci->ci_data.cpu_idlelwp;
struct pcb *pcb = lwp_getpcb(l);
pcb->pcb_cr0 = rcr0();
}
void
cpu_init_idle_lwps(void)
{
struct cpu_info *ci;
u_long i;
for (i = 0; i < maxcpus; i++) {
ci = cpu_lookup(i);
if (ci == NULL)
continue;
if (ci->ci_data.cpu_idlelwp == NULL)
continue;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
continue;
cpu_init_idle_lwp(ci);
}
}
#ifdef MULTIPROCESSOR
void
cpu_start_secondary(struct cpu_info *ci)
{
u_long psl;
int i;
#if NLAPIC > 0
paddr_t mp_pdirpa;
mp_pdirpa = pmap_init_tmp_pgtbl(mp_trampoline_paddr);
cpu_copy_trampoline(mp_pdirpa);
#endif
atomic_or_32(&ci->ci_flags, CPUF_AP);
ci->ci_curlwp = ci->ci_data.cpu_idlelwp;
if (CPU_STARTUP(ci, mp_trampoline_paddr) != 0) {
return;
}
/*
* Wait for it to become ready. Setting cpu_starting opens the
* initial gate and allows the AP to start soft initialization.
*/
KASSERT(cpu_starting == NULL);
cpu_starting = ci;
for (i = 100000; (!(ci->ci_flags & CPUF_PRESENT)) && i > 0; i--) {
delay_func(10);
}
if ((ci->ci_flags & CPUF_PRESENT) == 0) {
aprint_error_dev(ci->ci_dev, "failed to become ready\n");
#if defined(MPDEBUG) && defined(DDB)
printf("dropping into debugger; continue from here to resume boot\n");
Debugger();
#endif
} else {
/*
* Synchronize time stamp counters. Invalidate cache and do
* it twice (in tsc_sync_bp) to minimize possible cache effects.
* Disable interrupts to try and rule out any external
* interference.
*/
psl = x86_read_psl();
x86_disable_intr();
tsc_sync_bp(ci);
x86_write_psl(psl);
}
CPU_START_CLEANUP(ci);
cpu_starting = NULL;
}
void
cpu_boot_secondary(struct cpu_info *ci)
{
int64_t drift;
u_long psl;
int i;
atomic_or_32(&ci->ci_flags, CPUF_GO);
for (i = 100000; (!(ci->ci_flags & CPUF_RUNNING)) && i > 0; i--) {
delay_func(10);
}
if ((ci->ci_flags & CPUF_RUNNING) == 0) {
aprint_error_dev(ci->ci_dev, "failed to start\n");
#if defined(MPDEBUG) && defined(DDB)
printf("dropping into debugger; continue from here to resume boot\n");
Debugger();
#endif
} else {
/* Synchronize TSC again, check for drift. */
drift = ci->ci_data.cpu_cc_skew;
psl = x86_read_psl();
x86_disable_intr();
tsc_sync_bp(ci);
x86_write_psl(psl);
drift -= ci->ci_data.cpu_cc_skew;
aprint_debug_dev(ci->ci_dev, "TSC skew=%lld drift=%lld\n",
(long long)ci->ci_data.cpu_cc_skew, (long long)drift);
tsc_sync_drift(drift);
}
}
/*
* The CPU ends up here when it's ready to run.
* This is called from code in mptramp.s; at this point, we are running
* in the idle pcb/idle stack of the new CPU. When this function returns,
* this processor will enter the idle loop and start looking for work.
*/
void
cpu_hatch(void *v)
{
struct cpu_info *ci = (struct cpu_info *)v;
struct pcb *pcb;
int s, i;
/* ------------------------------------------------------------- */
/*
* This section of code must be compiled with SSP disabled, to
* prevent a race against cpu0. See sys/conf/ssp.mk.
*/
/*
* Initialize MSRs on this CPU:
*
* - On amd64: Enables SYSCALL/SYSRET.
*
* - On amd64: Sets up %fs and %gs so that %gs points to the
* current struct cpu_info as needed for CPUVAR(...),
* curcpu(), and curlwp.
*
* (On i386, CPUVAR(...), curcpu(), and curlwp are made to
* work first by the configuration of segment descriptors in
* the Global Descriptor Table (GDT) in initgdt.)
*
* - Enables the no-execute bit if supported.
*
* Thus, after this point, CPUVAR(...), curcpu(), and curlwp
* will work on this CPU.
*
* Note: The call to cpu_init_msrs for cpu0 happens in
* init386/init_x86_64.
*/
cpu_init_msrs(ci, true);
cpu_probe(ci);
cpu_speculation_init(ci);
#if NHYPERV > 0
hyperv_init_cpu(ci);
#endif
ci->ci_data.cpu_cc_freq = cpu_info_primary.ci_data.cpu_cc_freq;
/* cpu_get_tsc_freq(ci); */
KDASSERT((ci->ci_flags & CPUF_PRESENT) == 0);
/*
* Synchronize the TSC for the first time. Note that interrupts are
* off at this point.
*/
atomic_or_32(&ci->ci_flags, CPUF_PRESENT);
tsc_sync_ap(ci);
/* ------------------------------------------------------------- */
/*
* Wait to be brought online.
*
* Use MONITOR/MWAIT if available. These instructions put the CPU in
* a low consumption mode (C-state), and if the TSC is not invariant,
* this causes the TSC to drift. We want this to happen, so that we
* can later detect (in tsc_tc_init) any abnormal drift with invariant
* TSCs. That's just for safety; by definition such drifts should
* never occur with invariant TSCs.
*
* If not available, try PAUSE. We'd like to use HLT, but we have
* interrupts off.
*/
while ((ci->ci_flags & CPUF_GO) == 0) {
if ((cpu_feature[1] & CPUID2_MONITOR) != 0) {
x86_monitor(&ci->ci_flags, 0, 0);
if ((ci->ci_flags & CPUF_GO) != 0) {
continue;
}
x86_mwait(0, 0);
} else {
/*
* XXX The loop repetition count could be a lot higher, but
* XXX currently the qemu emulator takes a _very_long_time_ to
* XXX execute the pause instruction. So for now, use a low
* XXX value to allow the cpu to hatch before timing out.
*/
for (i = 50; i != 0; i--) {
x86_pause();
}
}
}
/* Because the text may have been patched in x86_patch(). */
wbinvd();
x86_flush();
tlbflushg();
KASSERT((ci->ci_flags & CPUF_RUNNING) == 0);
#ifdef PAE
pd_entry_t * l3_pd = ci->ci_pae_l3_pdir;
for (i = 0 ; i < PDP_SIZE; i++) {
l3_pd[i] = pmap_kernel()->pm_pdirpa[i] | PTE_P;
}
lcr3(ci->ci_pae_l3_pdirpa);
#else
lcr3(pmap_pdirpa(pmap_kernel(), 0));
#endif
pcb = lwp_getpcb(curlwp);
pcb->pcb_cr3 = rcr3();
pcb = lwp_getpcb(ci->ci_data.cpu_idlelwp);
lcr0(pcb->pcb_cr0);
cpu_init_idt(ci);
gdt_init_cpu(ci);
#if NLAPIC > 0
lapic_enable();
lapic_set_lvt();
#endif
fpuinit(ci);
lldt(GSYSSEL(GLDT_SEL, SEL_KPL));
ltr(ci->ci_tss_sel);
/*
* cpu_init will re-synchronize the TSC, and will detect any abnormal
* drift that would have been caused by the use of MONITOR/MWAIT
* above.
*/
cpu_init(ci);
#ifdef XENPVHVM
xen_hvm_init_cpu(ci);
#endif
(*x86_initclock_func)();
cpu_get_tsc_freq(ci);
s = splhigh();
#if NLAPIC > 0
lapic_write_tpri(0);
#endif
x86_enable_intr();
splx(s);
x86_errata();
aprint_debug_dev(ci->ci_dev, "running\n");
kcsan_cpu_init(ci);
idle_loop(NULL);
KASSERT(false);
}
#endif
#if defined(DDB)
#include <ddb/db_output.h>
#include <machine/db_machdep.h>
/*
* Dump CPU information from ddb.
*/
void
cpu_debug_dump(void)
{
struct cpu_info *ci;
CPU_INFO_ITERATOR cii;
const char sixtyfour64space[] =
#ifdef _LP64
" "
#endif
"";
db_printf("addr %sdev id flags ipis spl curlwp "
"\n", sixtyfour64space);
for (CPU_INFO_FOREACH(cii, ci)) {
db_printf("%p %s %ld %x %x %d %10p\n",
ci,
ci->ci_dev == NULL ? "BOOT" : device_xname(ci->ci_dev),
(long)ci->ci_cpuid,
ci->ci_flags, ci->ci_ipis, ci->ci_ilevel,
ci->ci_curlwp);
}
}
#endif
#ifdef MULTIPROCESSOR
#if NLAPIC > 0
static void
cpu_copy_trampoline(paddr_t pdir_pa)
{
extern uint32_t nox_flag;
extern u_char cpu_spinup_trampoline[];
extern u_char cpu_spinup_trampoline_end[];
vaddr_t mp_trampoline_vaddr;
struct {
uint32_t large;
uint32_t nox;
uint32_t pdir;
} smp_data;
CTASSERT(sizeof(smp_data) == 3 * 4);
smp_data.large = (pmap_largepages != 0);
smp_data.nox = nox_flag;
smp_data.pdir = (uint32_t)(pdir_pa & 0xFFFFFFFF);
/* Enter the physical address */
mp_trampoline_vaddr = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
UVM_KMF_VAONLY);
pmap_kenter_pa(mp_trampoline_vaddr, mp_trampoline_paddr,
VM_PROT_READ | VM_PROT_WRITE, 0);
pmap_update(pmap_kernel());
/* Copy boot code */
memcpy((void *)mp_trampoline_vaddr,
cpu_spinup_trampoline,
cpu_spinup_trampoline_end - cpu_spinup_trampoline);
/* Copy smp_data at the end */
memcpy((void *)(mp_trampoline_vaddr + PAGE_SIZE - sizeof(smp_data)),
&smp_data, sizeof(smp_data));
pmap_kremove(mp_trampoline_vaddr, PAGE_SIZE);
pmap_update(pmap_kernel());
uvm_km_free(kernel_map, mp_trampoline_vaddr, PAGE_SIZE, UVM_KMF_VAONLY);
}
#endif
int
mp_cpu_start(struct cpu_info *ci, paddr_t target)
{
int error;
/*
* Bootstrap code must be addressable in real mode
* and it must be page aligned.
*/
KASSERT(target < 0x10000 && target % PAGE_SIZE == 0);
/*
* "The BSP must initialize CMOS shutdown code to 0Ah ..."
*/
outb(IO_RTC, NVRAM_RESET);
outb(IO_RTC+1, NVRAM_RESET_JUMP);
#if NLAPIC > 0
/*
* "and the warm reset vector (DWORD based at 40:67) to point
* to the AP startup code ..."
*/
unsigned short dwordptr[2];
dwordptr[0] = 0;
dwordptr[1] = target >> 4;
memcpy((uint8_t *)cmos_data_mapping + 0x467, dwordptr, 4);
#endif
if ((cpu_feature[0] & CPUID_APIC) == 0) {
aprint_error("mp_cpu_start: CPU does not have APIC\n");
return ENODEV;
}
/*
* ... prior to executing the following sequence:". We'll also add in
* a local cache flush, in case the BIOS has left the AP with its cache
* disabled. It may not be able to cope with MP coherency.
*/
wbinvd();
if (ci->ci_flags & CPUF_AP) {
error = x86_ipi_init(ci->ci_cpuid);
if (error != 0) {
aprint_error_dev(ci->ci_dev, "%s: IPI not taken (1)\n",
__func__);
return error;
}
delay_func(10000);
error = x86_ipi_startup(ci->ci_cpuid, target / PAGE_SIZE);
if (error != 0) {
aprint_error_dev(ci->ci_dev, "%s: IPI not taken (2)\n",
__func__);
return error;
}
delay_func(200);
error = x86_ipi_startup(ci->ci_cpuid, target / PAGE_SIZE);
if (error != 0) {
aprint_error_dev(ci->ci_dev, "%s: IPI not taken (3)\n",
__func__);
return error;
}
delay_func(200);
}
return 0;
}
void
mp_cpu_start_cleanup(struct cpu_info *ci)
{
/*
* Ensure the NVRAM reset byte contains something vaguely sane.
*/
outb(IO_RTC, NVRAM_RESET);
outb(IO_RTC+1, NVRAM_RESET_RST);
}
#endif
#ifdef __x86_64__
typedef void (vector)(void);
extern vector Xsyscall, Xsyscall32, Xsyscall_svs;
#endif
/*
* cpu_init_msrs(ci, full)
*
* Initialize some Model-Specific Registers (MSRs) on the current
* CPU, whose struct cpu_info pointer is ci, for:
*
* - SYSCALL/SYSRET.
* - %fs/%gs on amd64 if `full' is true; needed to make
* CPUVAR(...), curcpu(), and curlwp work. (We do this at boot,
* but skip it on ACPI wakeup.)
* - No-execute bit, if supported.
*
* References:
*
* - Intel 64 and IA-32 Architectures Software Developer's Manual,
* Volume 3: System Programming Guide, Order Number 325384,
* April 2022, Sec. 5.8.8 `Fast System Calls in 64-Bit Mode',
* pp. 5-22 through 5-23.
*
* - Intel 64 and IA-32 Architectures Software Developer's Manual,
* Volume 4: Model-Specific Registers, Order Number 335592,
* April 2022, Sec. 2.1 `Architectural MSRs', Table 2-2,
* pp. 2-60 through 2-61.
*/
void
cpu_init_msrs(struct cpu_info *ci, bool full)
{
#ifdef __x86_64__
/*
* On amd64, set up the syscall target address registers
* for SYSCALL/SYSRET:
*
* - IA32_STAR, c000_0081h (MSR_STAR): System Call Target
* Address. Code and stack segment selectors for SYSRET
* (bits 48:63) and SYSCALL (bits 32:47).
*
* - IA32_LSTAR, c000_0082h (MSR_LSTAR): IA-32e Mode System
* Call Target Address. Target rip for SYSCALL when executed
* in 64-bit mode.
*
* - IA32_CSTAR, c000_0083h (MSR_CSTAR): IA-32e Mode System
* Call Target Address. Target rip for SYSCALL when executed
* in compatibility mode. (XXX Manual says this is `[n]ot
* used, as the SYSCALL instruction is not recognized in
* compatibility mode', so why do we set it?)
*
* - IA32_FMASK, c000_0084h (MSR_SFMASK): System Call Flag
* Mask. Mask for the RFLAGS register on SYSCALL.
*/
wrmsr(MSR_STAR,
((uint64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
((uint64_t)LSEL(LSYSRETBASE_SEL, SEL_UPL) << 48));
wrmsr(MSR_LSTAR, (uint64_t)Xsyscall);
wrmsr(MSR_CSTAR, (uint64_t)Xsyscall32);
wrmsr(MSR_SFMASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_AC);
#ifdef SVS
if (svs_enabled)
wrmsr(MSR_LSTAR, (uint64_t)Xsyscall_svs);
#endif
/*
* On amd64 if `full' is true -- used at boot, but not on ACPI
* wakeup -- then additionally set up %fs and %gs:
*
* - IA32_FS_BASE, c000_0100h (MSR_FSBASE): Base address of
* %fs. Not used in NetBSD kernel, so zero it.
*
* - IA32_GS_BASE, c000_0101h (MSR_GSBASE): Base address of
* %gs. Used in NetBSD kernel by CPUVAR(...), curcpu(), and
* curlwp for access to the CPU-local area, so set it to ci.
*
* - IA32_KERNEL_GS_BASE, c000_0102h (MSR_KERNELGSBASE): Base
* address of what swapgs will leave in %gs when switching to
* userland. Zero for now; will be set to pcb->pcb_gs in
* cpu_switchto for user threads.
*/
if (full) {
wrmsr(MSR_FSBASE, 0);
wrmsr(MSR_GSBASE, (uint64_t)ci);
wrmsr(MSR_KERNELGSBASE, 0);
}
#endif /* __x86_64__ */
/*
* If the no-execute bit is supported, enable it in:
*
* - IA32_EFER, c000_0080h (MSR_EFER): Extended Feature
* Enables.
*/
if (cpu_feature[2] & CPUID_NOX)
wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NXE);
}
void
cpu_offline_md(void)
{
return;
}
/* XXX joerg restructure and restart CPUs individually */
static bool
cpu_stop(device_t dv)
{
struct cpu_softc *sc = device_private(dv);
struct cpu_info *ci = sc->sc_info;
int err;
KASSERT((ci->ci_flags & CPUF_PRESENT) != 0);
if (CPU_IS_PRIMARY(ci))
return true;
if (ci->ci_data.cpu_idlelwp == NULL)
return true;
sc->sc_wasonline = !(ci->ci_schedstate.spc_flags & SPCF_OFFLINE);
if (sc->sc_wasonline) {
mutex_enter(&cpu_lock);
err = cpu_setstate(ci, false);
mutex_exit(&cpu_lock);
if (err != 0)
return false;
}
return true;
}
static bool
cpu_suspend(device_t dv, const pmf_qual_t *qual)
{
struct cpu_softc *sc = device_private(dv);
struct cpu_info *ci = sc->sc_info;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
return true;
else {
cpufreq_suspend(ci);
}
return cpu_stop(dv);
}
static bool
cpu_resume(device_t dv, const pmf_qual_t *qual)
{
struct cpu_softc *sc = device_private(dv);
struct cpu_info *ci = sc->sc_info;
int err = 0;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
return true;
if (CPU_IS_PRIMARY(ci))
goto out;
if (ci->ci_data.cpu_idlelwp == NULL)
goto out;
if (sc->sc_wasonline) {
mutex_enter(&cpu_lock);
err = cpu_setstate(ci, true);
mutex_exit(&cpu_lock);
}
out:
if (err != 0)
return false;
cpufreq_resume(ci);
return true;
}
static bool
cpu_shutdown(device_t dv, int how)
{
struct cpu_softc *sc = device_private(dv);
struct cpu_info *ci = sc->sc_info;
if ((ci->ci_flags & CPUF_BSP) != 0)
return false;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
return true;
return cpu_stop(dv);
}
/* Get the TSC frequency and set it to ci->ci_data.cpu_cc_freq. */
void
cpu_get_tsc_freq(struct cpu_info *ci)
{
uint64_t freq = 0, freq_from_cpuid, t0, t1;
int64_t overhead;
if (CPU_IS_PRIMARY(ci) && cpu_hascounter()) {
/*
* If it's the first call of this function, try to get TSC
* freq from CPUID by calling cpu_tsc_freq_cpuid().
* The function also sets the lapic_per_second variable if it's
* known. This is required for Intel's Comet Lake and newer
* processors to set LAPIC timer correctly.
*/
if (ci->ci_data.cpu_cc_freq == 0)
freq = freq_from_cpuid = cpu_tsc_freq_cpuid(ci);
if (freq != 0)
aprint_debug_dev(ci->ci_dev, "TSC freq "
"from CPUID %" PRIu64 " Hz\n", freq);
#if NHPET > 0
if (freq == 0) {
freq = hpet_tsc_freq();
if (freq != 0)
aprint_debug_dev(ci->ci_dev, "TSC freq "
"from HPET %" PRIu64 " Hz\n", freq);
}
#endif
if (freq == 0) {
/*
* Work out the approximate overhead involved below.
* Discard the result of the first go around the
* loop.
*/
overhead = 0;
for (int i = 0; i <= 8; i++) {
const int s = splhigh();
t0 = cpu_counter();
delay_func(0);
t1 = cpu_counter();
splx(s);
if (i > 0) {
overhead += (t1 - t0);
}
}
overhead >>= 3;
/*
* Now do the calibration.
*/
freq = 0;
for (int i = 0; i < 1000; i++) {
const int s = splhigh();
t0 = cpu_counter();
delay_func(100);
t1 = cpu_counter();
splx(s);
freq += t1 - t0 - overhead;
}
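/*
* Editorial note: 1000 iterations of delay_func(100) accumulate
* cycle counts over roughly 0.1 seconds, so multiplying the sum
* by 10 converts it to cycles per second (Hz).
*/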
freq = freq * 10;
aprint_debug_dev(ci->ci_dev, "TSC freq "
"from delay %" PRIu64 " Hz\n", freq);
}
if (ci->ci_data.cpu_cc_freq != 0) {
freq_from_cpuid = cpu_tsc_freq_cpuid(ci);
if ((freq_from_cpuid != 0)
&& (freq != freq_from_cpuid))
aprint_verbose_dev(ci->ci_dev, "TSC freq "
"calibrated %" PRIu64 " Hz\n", freq);
}
} else {
freq = cpu_info_primary.ci_data.cpu_cc_freq;
}
ci->ci_data.cpu_cc_freq = freq;
}
void
x86_cpu_idle_mwait(void)
{
struct cpu_info *ci = curcpu();
KASSERT(ci->ci_ilevel == IPL_NONE);
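/*
* Editorial note: MONITOR is armed on ci_want_resched before the flag
* is re-checked, so a wakeup store that lands between the check and
* MWAIT still causes MWAIT to return promptly (no lost wakeup).
*/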
x86_monitor(&ci->ci_want_resched, 0, 0);
if (__predict_false(ci->ci_want_resched)) {
return;
}
x86_mwait(0, 0);
}
void
x86_cpu_idle_halt(void)
{
struct cpu_info *ci = curcpu();
KASSERT(ci->ci_ilevel == IPL_NONE);
x86_disable_intr();
if (!__predict_false(ci->ci_want_resched)) {
x86_stihlt();
} else {
x86_enable_intr();
}
}
/*
* Loads pmap for the current CPU.
*/
void
cpu_load_pmap(struct pmap *pmap, struct pmap *oldpmap)
{
KASSERT(kpreempt_disabled());
#ifdef SVS
if (svs_enabled && pmap_is_user(pmap)) {
svs_pdir_switch(pmap);
}
#endif
#ifdef PAE
struct cpu_info *ci = curcpu();
bool interrupts_enabled;
pd_entry_t *l3_pd = ci->ci_pae_l3_pdir;
int i;
/*
* Disable interrupts to block TLB shootdowns, which can reload cr3.
* While this doesn't block NMIs, that's probably OK, as NMIs are
* unlikely to reload cr3.
*/
interrupts_enabled = (x86_read_flags() & PSL_I) != 0;
if (interrupts_enabled)
x86_disable_intr();
for (i = 0 ; i < PDP_SIZE; i++) {
l3_pd[i] = pmap->pm_pdirpa[i] | PTE_P;
}
if (interrupts_enabled)
x86_enable_intr();
tlbflush();
#else
lcr3(pmap_pdirpa(pmap, 0));
#endif
}
/*
* Notify all other cpus to halt.
*/
void
cpu_broadcast_halt(void)
{
x86_broadcast_ipi(X86_IPI_HALT);
}
/*
* Send a dummy ipi to a cpu to force it to run splraise()/spllower(),
* and trigger an AST on the running LWP.
*/
void
cpu_kick(struct cpu_info *ci)
{
x86_send_ipi(ci, X86_IPI_AST);
}
/* $NetBSD: tmpfs_mem.c,v 1.14 2023/04/29 06:29:55 riastradh Exp $ */
/*
* Copyright (c) 2010, 2011, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* tmpfs memory allocation routines.
* Implements memory usage accounting and limiting.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tmpfs_mem.c,v 1.14 2023/04/29 06:29:55 riastradh Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <fs/tmpfs/tmpfs.h>
extern struct pool tmpfs_dirent_pool;
extern struct pool tmpfs_node_pool;
void
tmpfs_mntmem_init(struct tmpfs_mount *mp, uint64_t memlimit)
{
mutex_init(&mp->tm_acc_lock, MUTEX_DEFAULT, IPL_NONE);
mp->tm_mem_limit = memlimit;
mp->tm_bytes_used = 0;
}
void
tmpfs_mntmem_destroy(struct tmpfs_mount *mp)
{
KASSERT(mp->tm_bytes_used == 0);
mutex_destroy(&mp->tm_acc_lock);
}
int
tmpfs_mntmem_set(struct tmpfs_mount *mp, uint64_t memlimit)
{
int error;
mutex_enter(&mp->tm_acc_lock);
if (round_page(mp->tm_bytes_used) >= memlimit)
error = EBUSY;
else {
error = 0;
mp->tm_mem_limit = memlimit;
}
mutex_exit(&mp->tm_acc_lock);
return error;
}
/*
* tmpfs_mem_info: return the number of available memory pages.
*
* => If 'total' is true, then return the _total_ amount of pages.
* => If false, then return the amount of _free_ memory pages.
*
* Remember to remove uvmexp.freetarg from the returned value to avoid
* excessive memory usage.
*/
size_t
tmpfs_mem_info(bool total)
{
size_t size = 0;
size += uvmexp.swpgavail;
if (!total) {
size -= uvmexp.swpgonly;
}
size += uvm_availmem(true);
size += uvmexp.filepages;
if (size > uvmexp.wired) {
size -= uvmexp.wired;
} else {
size = 0;
}
return size;
}
uint64_t
tmpfs_bytes_max(struct tmpfs_mount *mp)
{
psize_t freepages = tmpfs_mem_info(false);
int freetarg = uvmexp.freetarg; // XXX unlocked
uint64_t avail_mem;
if (freepages < freetarg) {
freepages = 0;
} else {
freepages -= freetarg;
}
avail_mem = round_page(mp->tm_bytes_used) + (freepages << PAGE_SHIFT);
return MIN(mp->tm_mem_limit, avail_mem);
}
size_t
tmpfs_pages_avail(struct tmpfs_mount *mp)
{
return (tmpfs_bytes_max(mp) - mp->tm_bytes_used) >> PAGE_SHIFT;
}
bool
tmpfs_mem_incr(struct tmpfs_mount *mp, size_t sz)
{
uint64_t lim;
mutex_enter(&mp->tm_acc_lock);
lim = tmpfs_bytes_max(mp);
if (mp->tm_bytes_used + sz >= lim) {
mutex_exit(&mp->tm_acc_lock);
return false;
}
mp->tm_bytes_used += sz;
mutex_exit(&mp->tm_acc_lock);
return true;
}
void
tmpfs_mem_decr(struct tmpfs_mount *mp, size_t sz)
{
mutex_enter(&mp->tm_acc_lock);
KASSERT(mp->tm_bytes_used >= sz);
mp->tm_bytes_used -= sz;
mutex_exit(&mp->tm_acc_lock);
}
struct tmpfs_dirent *
tmpfs_dirent_get(struct tmpfs_mount *mp)
{
if (!tmpfs_mem_incr(mp, sizeof(struct tmpfs_dirent))) {
return NULL;
}
return pool_get(&tmpfs_dirent_pool, PR_WAITOK);
}
void
tmpfs_dirent_put(struct tmpfs_mount *mp, struct tmpfs_dirent *de)
{
tmpfs_mem_decr(mp, sizeof(struct tmpfs_dirent));
pool_put(&tmpfs_dirent_pool, de);
}
struct tmpfs_node *
tmpfs_node_get(struct tmpfs_mount *mp)
{
if (atomic_inc_uint_nv(&mp->tm_nodes_cnt) >= mp->tm_nodes_max) {
atomic_dec_uint(&mp->tm_nodes_cnt);
return NULL;
}
if (!tmpfs_mem_incr(mp, sizeof(struct tmpfs_node))) {
atomic_dec_uint(&mp->tm_nodes_cnt);
return NULL;
}
return pool_get(&tmpfs_node_pool, PR_WAITOK);
}
void
tmpfs_node_put(struct tmpfs_mount *mp, struct tmpfs_node *tn)
{
atomic_dec_uint(&mp->tm_nodes_cnt);
tmpfs_mem_decr(mp, sizeof(struct tmpfs_node));
pool_put(&tmpfs_node_pool, tn);
}
/*
* Quantum size to round up the tmpfs names in order to reduce re-allocations.
*/
#define TMPFS_NAME_QUANTUM (32)
char *
tmpfs_strname_alloc(struct tmpfs_mount *mp, size_t len)
{
const size_t sz = roundup2(len, TMPFS_NAME_QUANTUM);
KASSERT(sz > 0 && sz <= 1024);
if (!tmpfs_mem_incr(mp, sz)) {
return NULL;
}
return kmem_alloc(sz, KM_SLEEP);
}
void
tmpfs_strname_free(struct tmpfs_mount *mp, char *str, size_t len)
{
const size_t sz = roundup2(len, TMPFS_NAME_QUANTUM);
KASSERT(sz > 0 && sz <= 1024);
tmpfs_mem_decr(mp, sz);
kmem_free(str, sz);
}
bool
tmpfs_strname_neqlen(struct componentname *fcnp, struct componentname *tcnp)
{
const size_t fln = fcnp->cn_namelen;
const size_t tln = tcnp->cn_namelen;
return (fln != tln) || memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fln);
}
/* $NetBSD: vfs_subr.c,v 1.500 2023/04/30 08:46:11 riastradh Exp $ */
/*-
* Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, by Andrew Doran,
* by Marshall Kirk McKusick and Greg Ganger at the University of Michigan.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.500 2023/04/30 08:46:11 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_43.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode_impl.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm_ddb.h>
SDT_PROBE_DEFINE3(vfs, syncer, worklist, vnode__add,
"struct vnode *"/*vp*/,
"int"/*delayx*/,
"int"/*slot*/);
SDT_PROBE_DEFINE4(vfs, syncer, worklist, vnode__update,
"struct vnode *"/*vp*/,
"int"/*delayx*/,
"int"/*oslot*/,
"int"/*nslot*/);
SDT_PROBE_DEFINE1(vfs, syncer, worklist, vnode__remove,
"struct vnode *"/*vp*/);
SDT_PROBE_DEFINE3(vfs, syncer, worklist, mount__add,
"struct mount *"/*mp*/,
"int"/*vdelay*/,
"int"/*slot*/);
SDT_PROBE_DEFINE4(vfs, syncer, worklist, mount__update,
"struct mount *"/*vp*/,
"int"/*vdelay*/,
"int"/*oslot*/,
"int"/*nslot*/);
SDT_PROBE_DEFINE1(vfs, syncer, worklist, mount__remove,
"struct mount *"/*mp*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, start,
"int"/*starttime*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, mount__start,
"struct mount *"/*mp*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, mount__done,
"struct mount *"/*mp*/,
"int"/*error*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, mount__skip,
"struct mount *"/*mp*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, vnode__start,
"struct vnode *"/*vp*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__done,
"struct vnode *"/*vp*/,
"int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__fail__lock,
"struct vnode *"/*vp*/,
"int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__fail__vget,
"struct vnode *"/*vp*/,
"int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, done,
"int"/*starttime*/,
"int"/*endtime*/);
const enum vtype iftovt_tab[16] = {
VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
const int vttoif_tab[9] = {
0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
S_IFSOCK, S_IFIFO, S_IFMT,
};
/*
* Insq/Remq for the vnode usage lists.
*/
#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define bufremvn(bp) { \
LIST_REMOVE(bp, b_vnbufs); \
(bp)->b_vnbufs.le_next = NOLIST; \
}
int doforce = 1; /* 1 => permit forcible unmounting */
/*
* Local declarations.
*/
static void vn_initialize_syncerd(void);
/*
* Initialize the vnode management data structures.
*/
void
vntblinit(void)
{
vn_initialize_syncerd();
vfs_mount_sysinit();
vfs_vnode_sysinit();
}
/*
* Flush out and invalidate all buffers associated with a vnode.
* Called with the underlying vnode locked, which should prevent new dirty
* buffers from being queued.
*/
int
vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
bool catch_p, int slptimeo)
{
struct buf *bp, *nbp;
int error;
int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
(flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);
/* XXXUBC this doesn't look at flags or slp* */
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, 0, 0, flushflags);
if (error) {
return error;
}
if (flags & V_SAVE) {
error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
if (error)
return (error);
KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
}
mutex_enter(&bufcache_lock);
restart:
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return (error);
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return (error);
}
/*
* XXX Since there are no node locks for NFS, I believe
* there is a slight chance that a delayed write will
* occur while sleeping just above, so check for it.
*/
if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
printf("buffer still DELWRI\n");
#endif
bp->b_cflags |= BC_BUSY | BC_VFLUSH;
mutex_exit(&bufcache_lock);
VOP_BWRITE(bp->b_vp, bp);
mutex_enter(&bufcache_lock);
goto restart;
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
#ifdef DIAGNOSTIC
if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
panic("vinvalbuf: flush failed, vp %p", vp);
#endif
mutex_exit(&bufcache_lock);
return (0);
}
/*
* Destroy any in core blocks past the truncation length.
* Called with the underlying vnode locked, which should prevent new dirty
* buffers from being queued.
*/
int
vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch_p, int slptimeo)
{
struct buf *bp, *nbp;
int error;
voff_t off;
off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
if (error) {
return error;
}
mutex_enter(&bufcache_lock);
restart:
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_lblkno < lbn)
continue;
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return (error);
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_lblkno < lbn)
continue;
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return (error);
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
mutex_exit(&bufcache_lock);
return (0);
}
/*
* Flush all dirty buffers from a vnode.
* Called with the underlying vnode locked, which should prevent new dirty
* buffers from being queued.
*/
int
vflushbuf(struct vnode *vp, int flags)
{
struct buf *bp, *nbp;
int error, pflags;
bool dirty, sync;
sync = (flags & FSYNC_WAIT) != 0;
pflags = PGO_CLEANIT | PGO_ALLPAGES |
(sync ? PGO_SYNCIO : 0) |
((flags & FSYNC_LAZY) ? PGO_LAZY : 0);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
(void) VOP_PUTPAGES(vp, 0, 0, pflags);
loop:
mutex_enter(&bufcache_lock);
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
if ((bp->b_cflags & BC_BUSY))
continue;
if ((bp->b_oflags & BO_DELWRI) == 0)
panic("vflushbuf: not dirty, bp %p", bp);
bp->b_cflags |= BC_BUSY | BC_VFLUSH;
mutex_exit(&bufcache_lock);
/*
* Wait for I/O associated with indirect blocks to complete,
* since there is no way to quickly wait for them below.
*/
if (bp->b_vp == vp || !sync)
(void) bawrite(bp);
else {
error = bwrite(bp);
if (error)
return error;
}
goto loop;
}
mutex_exit(&bufcache_lock);
if (!sync)
return 0;
mutex_enter(vp->v_interlock);
while (vp->v_numoutput != 0)
cv_wait(&vp->v_cv, vp->v_interlock);
dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
mutex_exit(vp->v_interlock);
if (dirty) {
vprint("vflushbuf: dirty", vp);
goto loop;
}
return 0;
}
/*
* Create a vnode for a block device.
* Used for root filesystem and swap areas.
* Also used for memory file system special devices.
*/
int
bdevvp(dev_t dev, vnode_t **vpp)
{
struct vattr va;
vattr_null(&va);
va.va_type = VBLK;
va.va_rdev = dev;
return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp);
}
/*
* Create a vnode for a character device.
* Used for kernfs and some console handling.
*/
int
cdevvp(dev_t dev, vnode_t **vpp)
{
struct vattr va;
vattr_null(&va);
va.va_type = VCHR;
va.va_rdev = dev;
return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp);
}
/*
* Associate a buffer with a vnode. There must already be a hold on
* the vnode.
*/
void
bgetvp(struct vnode *vp, struct buf *bp)
{
KASSERT(bp->b_vp == NULL);
KASSERT(bp->b_objlock == &buffer_lock);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT(mutex_owned(&bufcache_lock));
KASSERT((bp->b_cflags & BC_BUSY) != 0);
KASSERT(!cv_has_waiters(&bp->b_done));
vholdl(vp);
bp->b_vp = vp;
if (vp->v_type == VBLK || vp->v_type == VCHR)
bp->b_dev = vp->v_rdev;
else
bp->b_dev = NODEV;
/*
* Insert onto list for new vnode.
*/
bufinsvn(bp, &vp->v_cleanblkhd);
bp->b_objlock = vp->v_interlock;
}
/*
* Disassociate a buffer from a vnode.
*/
void
brelvp(struct buf *bp)
{
struct vnode *vp = bp->b_vp;
KASSERT(vp != NULL);
KASSERT(bp->b_objlock == vp->v_interlock);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT(mutex_owned(&bufcache_lock));
KASSERT((bp->b_cflags & BC_BUSY) != 0);
KASSERT(!cv_has_waiters(&bp->b_done));
/*
* Delete from old vnode list, if on one.
*/
if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
bufremvn(bp);
if ((vp->v_iflag & (VI_ONWORKLST | VI_PAGES)) == VI_ONWORKLST &&
LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
vn_syncer_remove_from_worklist(vp);
bp->b_objlock = &buffer_lock;
bp->b_vp = NULL;
holdrelel(vp);
}
/*
* Reassign a buffer from one vnode list to another.
* The list reassignment must be within the same vnode.
* Used to assign file specific control information
* (indirect blocks) to the list to which they belong.
*/
void
reassignbuf(struct buf *bp, struct vnode *vp)
{
struct buflists *listheadp;
int delayx;
KASSERT(mutex_owned(&bufcache_lock));
KASSERT(bp->b_objlock == vp->v_interlock);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT((bp->b_cflags & BC_BUSY) != 0);
/*
* Delete from old vnode list, if on one.
*/
if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
bufremvn(bp);
/*
* If dirty, put on list of dirty buffers;
* otherwise insert onto list of clean buffers.
*/
if ((bp->b_oflags & BO_DELWRI) == 0) {
listheadp = &vp->v_cleanblkhd;
if ((vp->v_iflag & (VI_ONWORKLST | VI_PAGES)) == VI_ONWORKLST &&
LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
vn_syncer_remove_from_worklist(vp);
} else {
listheadp = &vp->v_dirtyblkhd;
if ((vp->v_iflag & VI_ONWORKLST) == 0) {
switch (vp->v_type) {
case VDIR:
delayx = dirdelay;
break;
case VBLK:
if (spec_node_getmountedfs(vp) != NULL) {
delayx = metadelay;
break;
}
/* fall through */
default:
delayx = filedelay;
break;
}
if (!vp->v_mount ||
(vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
vn_syncer_add_to_worklist(vp, delayx);
}
}
bufinsvn(bp, listheadp);
}
/*
* Lookup a vnode by device number and return it referenced.
*/
int
vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
{
return (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, vpp) == 0);
}
/*
* Revoke all the vnodes corresponding to the specified minor number
* range (endpoints inclusive) of the specified major.
*/
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
vnode_t *vp;
dev_t dev;
int mn;
for (mn = minl; mn <= minh; mn++) {
dev = makedev(maj, mn);
/*
* Notify anyone trying to get at this device that it
* has been detached, and then revoke it.
*/
switch (type) {
case VBLK:
bdev_detached(dev);
break;
case VCHR:
cdev_detached(dev);
break;
default:
panic("invalid specnode type: %d", type);
}
/*
* Passing 0 as flags, instead of VDEAD_NOWAIT, means
* spec_node_lookup_by_dev will wait for vnodes it
* finds concurrently being revoked before returning.
*/
while (spec_node_lookup_by_dev(type, dev, 0, &vp) == 0) {
VOP_REVOKE(vp, REVOKEALL);
vrele(vp);
}
}
}
/*
* The filesystem synchronizer mechanism - syncer.
*
* It is useful to delay writes of file data and filesystem metadata for
* a certain amount of time so that quickly created and deleted files need
* not waste disk bandwidth being created and removed. To implement this,
* vnodes are appended to a "workitem" queue.
*
* Most pending metadata should not wait for more than ten seconds. Thus,
* metadata for filesystems mounted on block devices is delayed only about
* a third of the time that file data is delayed (see metadelay below).
* Similarly, directory updates are more critical, so they are delayed only
* about half the time that file data is delayed (see dirdelay below).
*
* There are SYNCER_MAXDELAY queues that are processed in a round-robin
* manner at a rate of one each second (driven off the filesystem syncer
* thread). The syncer_delayno variable indicates the next queue that is
* to be processed. Items that need to be processed soon are placed in
* this queue:
*
* syncer_workitem_pending[syncer_delayno]
*
* A delay of e.g. fifteen seconds is done by placing the request fifteen
* entries later in the queue:
*
* syncer_workitem_pending[(syncer_delayno + 15) % syncer_last]
*
* The flag VI_ONWORKLST indicates that the vnode is currently on one of
* these queues.
*/
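/*
 * Illustrative restatement (added for clarity, not original code): with the
 * variables defined below, queueing a vnode "delayx" seconds into the
 * future amounts to
 *
 *	slot = (syncer_delayno + delayx) % syncer_last;
 *	TAILQ_INSERT_TAIL(&syncer_workitem_pending[slot], vip, vi_synclist);
 *
 * which is what sync_delay_slot() and vn_syncer_add1() implement; the
 * syncer thread in sched_sync() then drains one slot per second.
 */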
#define SYNCER_MAXDELAY 32
typedef TAILQ_HEAD(synclist, vnode_impl) synclist_t;
static void vn_syncer_add1(struct vnode *, int);
static void sysctl_vfs_syncfs_setup(struct sysctllog **);
/*
* Defines and variables for the syncer process.
*/
int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
time_t syncdelay = 30; /* max time to delay syncing data */
time_t filedelay = 30; /* time to delay syncing files */
time_t dirdelay = 15; /* time to delay syncing directories */
time_t metadelay = 10; /* time to delay syncing metadata */
time_t lockdelay = 1; /* time to delay if locking fails */
static kmutex_t syncer_data_lock; /* short term lock on data structs */
static int syncer_delayno = 0;
static long syncer_last;
static synclist_t * syncer_workitem_pending;
static void
vn_initialize_syncerd(void)
{
int i;
syncer_last = SYNCER_MAXDELAY + 2;
sysctl_vfs_syncfs_setup(NULL);
syncer_workitem_pending =
kmem_alloc(syncer_last * sizeof (struct synclist), KM_SLEEP);
for (i = 0; i < syncer_last; i++)
TAILQ_INIT(&syncer_workitem_pending[i]);
mutex_init(&syncer_data_lock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* Return delay factor appropriate for the given file system. For
* WAPBL we use the sync vnode to burst out metadata updates: sync
* those file systems more frequently.
*/
static inline int
sync_delay(struct mount *mp)
{
return mp->mnt_wapbl != NULL ? metadelay : syncdelay;
}
/*
* Compute the next slot index from delay.
*/
static inline int
sync_delay_slot(int delayx)
{
if (delayx > syncer_maxdelay - 2)
delayx = syncer_maxdelay - 2;
return (syncer_delayno + delayx) % syncer_last;
}
/*
* Add an item to the syncer work queue.
*/
static void
vn_syncer_add1(struct vnode *vp, int delayx)
{
synclist_t *slp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERT(mutex_owned(&syncer_data_lock));
if (vp->v_iflag & VI_ONWORKLST) {
/*
* Remove in order to adjust the position of the vnode.
* Note: called from sched_sync(), which will not hold
* interlock, therefore we cannot modify v_iflag here.
*/
slp = &syncer_workitem_pending[vip->vi_synclist_slot];
TAILQ_REMOVE(slp, vip, vi_synclist);
} else {
KASSERT(mutex_owned(vp->v_interlock));
vp->v_iflag |= VI_ONWORKLST;
}
vip->vi_synclist_slot = sync_delay_slot(delayx);
slp = &syncer_workitem_pending[vip->vi_synclist_slot];
TAILQ_INSERT_TAIL(slp, vip, vi_synclist);
}
void
vn_syncer_add_to_worklist(struct vnode *vp, int delayx)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERT(mutex_owned(vp->v_interlock));
mutex_enter(&syncer_data_lock);
vn_syncer_add1(vp, delayx);
SDT_PROBE3(vfs, syncer, worklist, vnode__add,
vp, delayx, vip->vi_synclist_slot);
mutex_exit(&syncer_data_lock);
}
/*
* Remove an item from the syncer work queue.
*/
void
vn_syncer_remove_from_worklist(struct vnode *vp)
{
synclist_t *slp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_iflag & VI_ONWORKLST) {
mutex_enter(&syncer_data_lock);
SDT_PROBE1(vfs, syncer, worklist, vnode__remove, vp);
vp->v_iflag &= ~VI_ONWORKLST;
slp = &syncer_workitem_pending[vip->vi_synclist_slot];
TAILQ_REMOVE(slp, vip, vi_synclist);
mutex_exit(&syncer_data_lock);
}
}
/*
* Add this mount point to the syncer.
*/
void
vfs_syncer_add_to_worklist(struct mount *mp)
{
static int start, incr, next;
int vdelay;
KASSERT(mutex_owned(mp->mnt_updating));
KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) == 0);
/*
* We attempt to scatter the mount points on the list
* so that they will go off at evenly distributed times
* even if all the filesystems are mounted at once.
*/
next += incr;
if (next == 0 || next > syncer_maxdelay) {
start /= 2;
incr /= 2;
if (start == 0) {
start = syncer_maxdelay / 2;
incr = syncer_maxdelay;
}
next = start;
}
mp->mnt_iflag |= IMNT_ONWORKLIST;
vdelay = sync_delay(mp);
mp->mnt_synclist_slot = vdelay > 0 ? next % vdelay : 0;
SDT_PROBE3(vfs, syncer, worklist, mount__add,
mp, vdelay, mp->mnt_synclist_slot);
}
/*
* Remove the mount point from the syncer.
*/
void
vfs_syncer_remove_from_worklist(struct mount *mp)
{
KASSERT(mutex_owned(mp->mnt_updating));
KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) != 0);
SDT_PROBE1(vfs, syncer, worklist, mount__remove, mp);
mp->mnt_iflag &= ~IMNT_ONWORKLIST;
}
/*
* Try lazy sync, return true on success.
*/
static bool
lazy_sync_vnode(struct vnode *vp)
{
bool synced;
int error;
KASSERT(mutex_owned(&syncer_data_lock));
synced = false;
if ((error = vcache_tryvget(vp)) == 0) {
mutex_exit(&syncer_data_lock);
if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT)) == 0) {
synced = true;
SDT_PROBE1(vfs, syncer, sync, vnode__start, vp);
error = VOP_FSYNC(vp, curlwp->l_cred,
FSYNC_LAZY, 0, 0);
SDT_PROBE2(vfs, syncer, sync, vnode__done, vp, error);
vput(vp);
} else {
SDT_PROBE2(vfs, syncer, sync, vnode__fail__lock,
vp, error);
vrele(vp);
}
mutex_enter(&syncer_data_lock);
} else {
SDT_PROBE2(vfs, syncer, sync, vnode__fail__vget, vp, error);
}
return synced;
}
/*
* System filesystem synchronizer daemon.
*/
void
sched_sync(void *arg)
{
mount_iterator_t *iter;
synclist_t *slp;
struct vnode_impl *vi;
struct vnode *vp;
struct mount *mp;
time_t starttime, endtime;
int vdelay, oslot, nslot, delayx;
bool synced;
int error;
for (;;) {
starttime = time_second;
SDT_PROBE1(vfs, syncer, sync, start, starttime);
/*
* Sync mounts whose dirty time has expired.
*/
mountlist_iterator_init(&iter);
while ((mp = mountlist_iterator_trynext(iter)) != NULL) {
if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0 ||
mp->mnt_synclist_slot != syncer_delayno) {
SDT_PROBE1(vfs, syncer, sync, mount__skip,
mp);
continue;
}
vdelay = sync_delay(mp);
oslot = mp->mnt_synclist_slot;
nslot = sync_delay_slot(vdelay);
mp->mnt_synclist_slot = nslot;
SDT_PROBE4(vfs, syncer, worklist, mount__update,
mp, vdelay, oslot, nslot);
SDT_PROBE1(vfs, syncer, sync, mount__start, mp);
error = VFS_SYNC(mp, MNT_LAZY, curlwp->l_cred);
SDT_PROBE2(vfs, syncer, sync, mount__done,
mp, error);
}
mountlist_iterator_destroy(iter);
mutex_enter(&syncer_data_lock);
/*
* Push files whose dirty time has expired.
*/
slp = &syncer_workitem_pending[syncer_delayno];
syncer_delayno += 1;
if (syncer_delayno >= syncer_last)
syncer_delayno = 0;
while ((vi = TAILQ_FIRST(slp)) != NULL) {
vp = VIMPL_TO_VNODE(vi);
synced = lazy_sync_vnode(vp);
/*
* XXX The vnode may have been recycled, in which
* case it may have a new identity.
*/
vi = TAILQ_FIRST(slp);
if (vi != NULL && VIMPL_TO_VNODE(vi) == vp) {
/*
* Put us back on the worklist. The worklist
* routine will remove us from our current
* position and then add us back in at a later
* position.
*
* Try again sooner rather than later if
* we were unable to lock the vnode. Lock
* failure should not prevent us from doing
* the sync "soon".
*
* If we did lock it but still arrive here, it's
* likely that lazy sync is in progress and
* so the vnode still has dirty metadata.
* syncdelay is mainly to get this vnode out
* of the way so we do not consider it again
* "soon" in this loop, so the delay time is
* not critical as long as it is not "soon".
* While write-back strategy is the file
* system's domain, we expect write-back to
* occur no later than syncdelay seconds
* into the future.
*/
delayx = synced ? syncdelay : lockdelay;
oslot = vi->vi_synclist_slot;
vn_syncer_add1(vp, delayx);
nslot = vi->vi_synclist_slot;
SDT_PROBE4(vfs, syncer, worklist,
vnode__update,
vp, delayx, oslot, nslot);
}
}
endtime = time_second;
SDT_PROBE2(vfs, syncer, sync, done, starttime, endtime);
/*
* If it has taken us less than a second to process the
* current work, then wait. Otherwise start right over
* again. We can still lose time if any single round
* takes more than two seconds, but it does not really
* matter as we are just trying to generally pace the
* filesystem activity.
*/
if (endtime == starttime) {
kpause("syncer", false, hz, &syncer_data_lock);
}
mutex_exit(&syncer_data_lock);
}
}
static void
sysctl_vfs_syncfs_setup(struct sysctllog **clog)
{
const struct sysctlnode *rnode, *cnode;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sync",
SYSCTL_DESCR("syncer options"),
NULL, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_QUAD, "delay",
SYSCTL_DESCR("max time to delay syncing data"),
NULL, 0, &syncdelay, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_QUAD, "filedelay",
SYSCTL_DESCR("time to delay syncing files"),
NULL, 0, &filedelay, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_QUAD, "dirdelay",
SYSCTL_DESCR("time to delay syncing directories"),
NULL, 0, &dirdelay, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_QUAD, "metadelay",
SYSCTL_DESCR("time to delay syncing metadata"),
NULL, 0, &metadelay, 0,
CTL_CREATE, CTL_EOL);
}
/*
* sysctl helper routine to return list of supported fstypes
*/
int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
char *where = oldp;
struct vfsops *v;
size_t needed, left, slen;
int error, first;
if (newp != NULL)
return (EPERM);
if (namelen != 0)
return (EINVAL);
first = 1;
error = 0;
needed = 0;
left = *oldlenp;
sysctl_unlock();
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (where == NULL)
needed += strlen(v->vfs_name) + 1;
else {
memset(bf, 0, sizeof(bf));
if (first) {
strncpy(bf, v->vfs_name, sizeof(bf));
first = 0;
} else {
bf[0] = ' ';
strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
}
bf[sizeof(bf)-1] = '\0';
slen = strlen(bf);
if (left < slen + 1)
break;
v->vfs_refcount++;
mutex_exit(&vfs_list_lock);
/* +1 to copy out the trailing NUL byte */
error = copyout(bf, where, slen + 1);
mutex_enter(&vfs_list_lock);
v->vfs_refcount--;
if (error)
break;
where += slen;
needed += slen;
left -= slen;
}
}
mutex_exit(&vfs_list_lock);
sysctl_relock();
*oldlenp = needed;
return (error);
}
int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP 10
/*
* Dump vnode list (via sysctl).
* Copyout address of vnode followed by vnode.
*/
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
char *where = oldp;
size_t *sizep = oldlenp;
struct mount *mp;
vnode_t *vp, vbuf;
mount_iterator_t *iter;
struct vnode_iterator *marker;
char *bp = where;
char *ewhere;
int error;
if (namelen != 0)
return (EOPNOTSUPP);
if (newp != NULL)
return (EPERM);
#define VPTRSZ sizeof(vnode_t *)
#define VNODESZ sizeof(vnode_t)
if (where == NULL) {
*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
return (0);
}
ewhere = where + *sizep;
sysctl_unlock();
mountlist_iterator_init(&iter);
while ((mp = mountlist_iterator_next(iter)) != NULL) {
vfs_vnode_iterator_init(mp, &marker);
while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) {
if (bp + VPTRSZ + VNODESZ > ewhere) {
vrele(vp);
vfs_vnode_iterator_destroy(marker);
mountlist_iterator_destroy(iter);
sysctl_relock();
*sizep = bp - where;
return (ENOMEM);
}
memcpy(&vbuf, vp, VNODESZ);
if ((error = copyout(&vp, bp, VPTRSZ)) ||
(error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
vrele(vp);
vfs_vnode_iterator_destroy(marker);
mountlist_iterator_destroy(iter);
sysctl_relock();
return (error);
}
vrele(vp);
bp += VPTRSZ + VNODESZ;
}
vfs_vnode_iterator_destroy(marker);
}
mountlist_iterator_destroy(iter);
sysctl_relock();
*sizep = bp - where;
return (0);
}
/*
* Set vnode attributes to VNOVAL
*/
void
vattr_null(struct vattr *vap)
{
memset(vap, 0, sizeof(*vap));
vap->va_type = VNON;
/*
* Assign individually so that it is safe even if size and
* sign of each member are varied.
*/
vap->va_mode = VNOVAL;
vap->va_nlink = VNOVAL;
vap->va_uid = VNOVAL;
vap->va_gid = VNOVAL;
vap->va_fsid = VNOVAL;
vap->va_fileid = VNOVAL;
vap->va_size = VNOVAL;
vap->va_blocksize = VNOVAL;
vap->va_atime.tv_sec =
vap->va_mtime.tv_sec =
vap->va_ctime.tv_sec =
vap->va_birthtime.tv_sec = VNOVAL;
vap->va_atime.tv_nsec =
vap->va_mtime.tv_nsec =
vap->va_ctime.tv_nsec =
vap->va_birthtime.tv_nsec = VNOVAL;
vap->va_gen = VNOVAL;
vap->va_flags = VNOVAL;
vap->va_rdev = VNOVAL;
vap->va_bytes = VNOVAL;
}
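/*
 * Illustrative sketch (added, not original code): callers normally null the
 * attributes and then set only the fields they want changed before calling
 * VOP_SETATTR(), e.g. truncating a file on open:
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);
 *	va.va_size = 0;
 *	error = VOP_SETATTR(vp, &va, cred);
 *
 * Fields left at VNOVAL are ignored by the filesystem.
 */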
/*
* Vnode state to string.
*/
const char *
vstate_name(enum vnode_state state)
{
switch (state) {
case VS_ACTIVE:
return "ACTIVE";
case VS_MARKER:
return "MARKER";
case VS_LOADING:
return "LOADING";
case VS_LOADED:
return "LOADED";
case VS_BLOCKED:
return "BLOCKED";
case VS_RECLAIMING:
return "RECLAIMING";
case VS_RECLAIMED:
return "RECLAIMED";
default:
return "ILLEGAL";
}
}
/*
* Print a description of a vnode (common part).
*/
static void
vprint_common(struct vnode *vp, const char *prefix,
void (*pr)(const char *, ...) __printflike(1, 2))
{
int n;
char bf[96];
const uint8_t *cp;
vnode_impl_t *vip;
const char * const vnode_tags[] = { VNODE_TAGS };
const char * const vnode_types[] = { VNODE_TYPES };
const char vnode_flagbits[] = VNODE_FLAGBITS;
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
#define ARRAY_PRINT(idx, arr) \
((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")
vip = VNODE_TO_VIMPL(vp);
snprintb(bf, sizeof(bf),
vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag);
(*pr)("vnode %p flags %s\n", vp, bf);
(*pr)("%stag %s(%d) type %s(%d) mount %p typedata %p\n", prefix,
ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
vp->v_mount, vp->v_mountedhere);
(*pr)("%susecount %d writecount %d holdcount %d\n", prefix,
vrefcnt(vp), vp->v_writecount, vp->v_holdcnt);
(*pr)("%ssize %" PRIx64 " writesize %" PRIx64 " numoutput %d\n",
prefix, vp->v_size, vp->v_writesize, vp->v_numoutput);
(*pr)("%sdata %p lock %p\n", prefix, vp->v_data, &vip->vi_lock);
(*pr)("%sstate %s key(%p %zd)", prefix, vstate_name(vip->vi_state),
vip->vi_key.vk_mount, vip->vi_key.vk_key_len);
n = vip->vi_key.vk_key_len;
cp = vip->vi_key.vk_key;
while (n-- > 0)
(*pr)(" %02x", *cp++);
(*pr)("\n");
(*pr)("%slrulisthd %p\n", prefix, vip->vi_lrulisthd);
#undef ARRAY_PRINT
#undef ARRAY_SIZE
}
/*
* Print out a description of a vnode.
*/
void
vprint(const char *label, struct vnode *vp)
{
if (label != NULL)
printf("%s: ", label);
vprint_common(vp, "\t", printf);
if (vp->v_data != NULL) {
printf("\t");
VOP_PRINT(vp);
}
}
/*
* Given a file system name, look up the vfsops for that
* file system, or return NULL if file system isn't present
* in the kernel.
*/
struct vfsops *
vfs_getopsbyname(const char *name)
{
struct vfsops *v;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (strcmp(v->vfs_name, name) == 0)
break;
}
if (v != NULL)
v->vfs_refcount++;
mutex_exit(&vfs_list_lock);
return (v);
}
void
copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
{
const struct statvfs *mbp;
if (sbp == (mbp = &mp->mnt_stat))
return;
(void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
sbp->f_fsid = mbp->f_fsid;
sbp->f_owner = mbp->f_owner;
sbp->f_flag = mbp->f_flag;
sbp->f_syncwrites = mbp->f_syncwrites;
sbp->f_asyncwrites = mbp->f_asyncwrites;
sbp->f_syncreads = mbp->f_syncreads;
sbp->f_asyncreads = mbp->f_asyncreads;
(void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
sizeof(sbp->f_fstypename));
(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
sizeof(sbp->f_mntonname));
(void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
sizeof(sbp->f_mntfromname));
(void)memcpy(sbp->f_mntfromlabel, mp->mnt_stat.f_mntfromlabel,
sizeof(sbp->f_mntfromlabel));
sbp->f_namemax = mbp->f_namemax;
}
int
set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
const char *vfsname, struct mount *mp, struct lwp *l)
{
int error;
size_t size;
struct statvfs *sfs = &mp->mnt_stat;
int (*fun)(const void *, void *, size_t, size_t *);
(void)strlcpy(mp->mnt_stat.f_fstypename, vfsname,
sizeof(mp->mnt_stat.f_fstypename));
if (onp) {
struct cwdinfo *cwdi = l->l_proc->p_cwdi;
fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
if (cwdi->cwdi_rdir != NULL) {
size_t len;
char *bp;
char *path = PNBUF_GET();
bp = path + MAXPATHLEN;
*--bp = '\0';
rw_enter(&cwdi->cwdi_lock, RW_READER);
error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
path, MAXPATHLEN / 2, 0, l);
rw_exit(&cwdi->cwdi_lock);
if (error) {
PNBUF_PUT(path);
return error;
}
len = strlen(bp);
if (len > sizeof(sfs->f_mntonname) - 1)
len = sizeof(sfs->f_mntonname) - 1;
(void)strncpy(sfs->f_mntonname, bp, len);
PNBUF_PUT(path);
if (len < sizeof(sfs->f_mntonname) - 1) {
error = (*fun)(onp, &sfs->f_mntonname[len],
sizeof(sfs->f_mntonname) - len - 1, &size);
if (error)
return error;
size += len;
} else {
size = len;
}
} else {
error = (*fun)(onp, &sfs->f_mntonname,
sizeof(sfs->f_mntonname) - 1, &size);
if (error)
return error;
}
(void)memset(sfs->f_mntonname + size, 0,
sizeof(sfs->f_mntonname) - size);
}
if (fromp) {
fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
error = (*fun)(fromp, sfs->f_mntfromname,
sizeof(sfs->f_mntfromname) - 1, &size);
if (error)
return error;
(void)memset(sfs->f_mntfromname + size, 0,
sizeof(sfs->f_mntfromname) - size);
}
return 0;
}
/*
* Knob to control the precision of file timestamps:
*
* 0 = seconds only; nanoseconds zeroed.
* 1 = seconds and nanoseconds, accurate within 1/HZ.
* 2 = seconds and nanoseconds, truncated to microseconds.
* >=3 = seconds and nanoseconds, maximum precision.
*/
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
int vfs_timestamp_precision __read_mostly = TSP_NSEC;
void
vfs_timestamp(struct timespec *tsp)
{
struct timeval tv;
switch (vfs_timestamp_precision) {
case TSP_SEC:
tsp->tv_sec = time_second;
tsp->tv_nsec = 0;
break;
case TSP_HZ:
getnanotime(tsp);
break;
case TSP_USEC:
microtime(&tv);
TIMEVAL_TO_TIMESPEC(&tv, tsp);
break;
case TSP_NSEC:
default:
nanotime(tsp);
break;
}
}
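/*
 * Illustrative sketch (added, not original code): a filesystem updating an
 * inode's modification time at the configured precision would do something
 * like the following ("ip" and its fields are hypothetical, fs-specific
 * names):
 *
 *	struct timespec ts;
 *
 *	vfs_timestamp(&ts);
 *	ip->i_mtime = ts.tv_sec;
 *	ip->i_mtimensec = ts.tv_nsec;
 */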
/*
* The purpose of this routine is to remove granularity from accmode_t,
* reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
* VADMIN and VAPPEND.
*
* If it returns 0, the caller is supposed to continue with the usual
* access checks using 'accmode' as modified by this routine. If it
* returns nonzero value, the caller is supposed to return that value
* as errno.
*
* Note that after this routine runs, accmode may be zero.
*/
int
vfs_unixify_accmode(accmode_t *accmode)
{
/*
* There is no way to specify explicit "deny" rule using
* file mode or POSIX.1e ACLs.
*/
if (*accmode & VEXPLICIT_DENY) {
*accmode = 0;
return (0);
}
/*
* None of these can be translated into usual access bits.
* Also, the common case for NFSv4 ACLs is to not contain
* either of these bits. Caller should check for VWRITE
* on the containing directory instead.
*/
if (*accmode & (VDELETE_CHILD | VDELETE))
return (EPERM);
if (*accmode & VADMIN_PERMS) {
*accmode &= ~VADMIN_PERMS;
*accmode |= VADMIN;
}
/*
* There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
* or VSYNCHRONIZE using file mode or POSIX.1e ACL.
*/
*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
return (0);
}
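/*
 * Illustrative sketch (added, not original code) of the calling convention
 * described above, as seen from an access-check routine:
 *
 *	error = vfs_unixify_accmode(&accmode);
 *	if (error != 0)
 *		return error;
 *	if (accmode == 0)
 *		return 0;
 *	.. continue with the usual VEXEC/VREAD/VWRITE/VADMIN checks ..
 */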
time_t rootfstime; /* recorded root fs time, if known */
void
setrootfstime(time_t t)
{
rootfstime = t;
}
static const uint8_t vttodt_tab[ ] = {
[VNON] = DT_UNKNOWN,
[VREG] = DT_REG,
[VDIR] = DT_DIR,
[VBLK] = DT_BLK,
[VCHR] = DT_CHR,
[VLNK] = DT_LNK,
[VSOCK] = DT_SOCK,
[VFIFO] = DT_FIFO,
[VBAD] = DT_UNKNOWN
};
uint8_t
vtype2dt(enum vtype vt)
{
CTASSERT(VBAD == __arraycount(vttodt_tab) - 1);
return vttodt_tab[vt];
}
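/*
 * Illustrative sketch (added, not original code): a readdir implementation
 * that has the vnode type at hand can fill in the dirent type with
 *
 *	dp->d_type = vtype2dt(vp->v_type);
 *
 * where "dp" is the struct dirent being assembled.
 */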
int
VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c)
{
int mpsafe = mp->mnt_iflag & IMNT_MPSAFE;
int error;
/*
* Note: The first time through, the vfs_mount function may set
* IMNT_MPSAFE, so we have to cache it on entry in order to
* avoid leaking a kernel lock.
*
* XXX Maybe the MPSAFE bit should be set in struct vfsops and
* not in struct mount.
*/
if (!mpsafe) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c);
if (!mpsafe) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_START(struct mount *mp, int a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_start))(mp, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_UNMOUNT(struct mount *mp, int a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_unmount))(mp, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_ROOT(struct mount *mp, int lktype, struct vnode **a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_root))(mp, lktype, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_QUOTACTL(struct mount *mp, struct quotactl_args *args)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_quotactl))(mp, args);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_STATVFS(struct mount *mp, struct statvfs *a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_statvfs))(mp, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_sync))(mp, a, b);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_FHTOVP(struct mount *mp, struct fid *a, int b, struct vnode **c)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b, c);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b)
{
int error;
if ((vp->v_vflag & VV_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b);
if ((vp->v_vflag & VV_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const char *d)
{
int error;
KERNEL_LOCK(1, NULL); /* XXXSMP check ffs */
error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d);
KERNEL_UNLOCK_ONE(NULL); /* XXX */
return error;
}
int
VFS_SUSPENDCTL(struct mount *mp, int a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_suspendctl))(mp, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
#if defined(DDB) || defined(DEBUGPRINT)
static const char buf_flagbits[] = BUF_FLAGBITS;
void
vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...))
{
char bf[1024];
(*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%"
PRIx64 " dev 0x%x\n",
bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev);
snprintb(bf, sizeof(bf),
buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags);
(*pr)(" error %d flags %s\n", bp->b_error, bf);
(*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
bp->b_bufsize, bp->b_bcount, bp->b_resid);
(*pr)(" data %p saveaddr %p\n",
bp->b_data, bp->b_saveaddr);
(*pr)(" iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock);
}
void
vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...))
{
uvm_object_printit(&vp->v_uobj, full, pr);
(*pr)("\n");
vprint_common(vp, "", pr);
if (full) {
struct buf *bp;
(*pr)("clean bufs:\n");
LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
(*pr)(" bp %p\n", bp);
vfs_buf_print(bp, full, pr);
}
(*pr)("dirty bufs:\n");
LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
(*pr)(" bp %p\n", bp);
vfs_buf_print(bp, full, pr);
}
}
}
void
vfs_vnode_lock_print(void *vlock, int full, void (*pr)(const char *, ...))
{
struct mount *mp;
vnode_impl_t *vip;
for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
if (&vip->vi_lock == vlock ||
VIMPL_TO_VNODE(vip)->v_interlock == vlock)
vfs_vnode_print(VIMPL_TO_VNODE(vip), full, pr);
}
}
}
void
vfs_mount_print_all(int full, void (*pr)(const char *, ...))
{
struct mount *mp;
for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
vfs_mount_print(mp, full, pr);
}
void
vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...))
{
char sbuf[256];
(*pr)("vnodecovered = %p data = %p\n",
mp->mnt_vnodecovered, mp->mnt_data);
(*pr)("fs_bshift %d dev_bshift = %d\n",
mp->mnt_fs_bshift, mp->mnt_dev_bshift);
snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag);
(*pr)("flag = %s\n", sbuf);
snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag);
(*pr)("iflag = %s\n", sbuf);
(*pr)("refcnt = %d updating @ %p\n", mp->mnt_refcnt, mp->mnt_updating);
(*pr)("statvfs cache:\n");
(*pr)("\tbsize = %lu\n", mp->mnt_stat.f_bsize);
(*pr)("\tfrsize = %lu\n", mp->mnt_stat.f_frsize);
(*pr)("\tiosize = %lu\n", mp->mnt_stat.f_iosize);
(*pr)("\tblocks = %"PRIu64"\n", mp->mnt_stat.f_blocks);
(*pr)("\tbfree = %"PRIu64"\n", mp->mnt_stat.f_bfree);
(*pr)("\tbavail = %"PRIu64"\n", mp->mnt_stat.f_bavail);
(*pr)("\tbresvd = %"PRIu64"\n", mp->mnt_stat.f_bresvd);
(*pr)("\tfiles = %"PRIu64"\n", mp->mnt_stat.f_files);
(*pr)("\tffree = %"PRIu64"\n", mp->mnt_stat.f_ffree);
(*pr)("\tfavail = %"PRIu64"\n", mp->mnt_stat.f_favail);
(*pr)("\tfresvd = %"PRIu64"\n", mp->mnt_stat.f_fresvd);
(*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
mp->mnt_stat.f_fsidx.__fsid_val[0],
mp->mnt_stat.f_fsidx.__fsid_val[1]);
(*pr)("\towner = %"PRIu32"\n", mp->mnt_stat.f_owner);
(*pr)("\tnamemax = %lu\n", mp->mnt_stat.f_namemax);
snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag);
(*pr)("\tflag = %s\n", sbuf);
(*pr)("\tsyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_syncwrites);
(*pr)("\tasyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_asyncwrites);
(*pr)("\tsyncreads = %" PRIu64 "\n", mp->mnt_stat.f_syncreads);
(*pr)("\tasyncreads = %" PRIu64 "\n", mp->mnt_stat.f_asyncreads);
(*pr)("\tfstypename = %s\n", mp->mnt_stat.f_fstypename);
(*pr)("\tmntonname = %s\n", mp->mnt_stat.f_mntonname);
(*pr)("\tmntfromname = %s\n", mp->mnt_stat.f_mntfromname);
{
int cnt = 0;
vnode_t *vp;
vnode_impl_t *vip;
(*pr)("locked vnodes =");
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
vp = VIMPL_TO_VNODE(vip);
if (VOP_ISLOCKED(vp)) {
if ((++cnt % 6) == 0) {
(*pr)(" %p,\n\t", vp);
} else {
(*pr)(" %p,", vp);
}
}
}
(*pr)("\n");
}
if (full) {
int cnt = 0;
vnode_t *vp;
vnode_impl_t *vip;
(*pr)("all vnodes =");
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
vp = VIMPL_TO_VNODE(vip);
if (!TAILQ_NEXT(vip, vi_mntvnodes)) {
(*pr)(" %p", vp);
} else if ((++cnt % 6) == 0) {
(*pr)(" %p,\n\t", vp);
} else {
(*pr)(" %p,", vp);
}
}
(*pr)("\n");
}
}
/*
* List all of the locked vnodes in the system.
*/
void printlockedvnodes(void);
void
printlockedvnodes(void)
{
struct mount *mp;
vnode_t *vp;
vnode_impl_t *vip;
printf("Locked vnodes\n");
for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
vp = VIMPL_TO_VNODE(vip);
if (VOP_ISLOCKED(vp))
vprint(NULL, vp);
}
}
}
#endif /* DDB || DEBUGPRINT */
/* $NetBSD: sys_ptrace_common.c,v 1.92 2021/08/09 20:49:10 andvar Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93
*/
/*-
* Copyright (c) 1993 Jan-Simon Pendry.
* Copyright (c) 1994 Christopher G. Demetriou. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_ptrace_common.c,v 1.92 2021/08/09 20:49:10 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_ptrace.h"
#include "opt_ktrace.h"
#include "opt_pax.h"
#include "opt_compat_netbsd32.h"
#endif
#if defined(__HAVE_COMPAT_NETBSD32) && !defined(COMPAT_NETBSD32) \
&& !defined(_RUMPKERNEL)
#define COMPAT_NETBSD32
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/exec.h>
#include <sys/pax.h>
#include <sys/ptrace.h>
#include <sys/uio.h>
#include <sys/ras.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/module.h>
#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/compat_stub.h>
#include <uvm/uvm_extern.h>
#include <machine/reg.h>
# ifdef PTRACE_DEBUG
# define DPRINTF(a) uprintf a
# else
# define DPRINTF(a)
# endif
static kauth_listener_t ptrace_listener;
static int process_auxv_offset(struct proc *, struct uio *);
extern int user_va0_disable;
#if 0
static int ptrace_cbref;
static kmutex_t ptrace_mtx;
static kcondvar_t ptrace_cv;
#endif
#ifdef PT_GETREGS
# define case_PT_GETREGS case PT_GETREGS:
#else
# define case_PT_GETREGS
#endif
#ifdef PT_SETREGS
# define case_PT_SETREGS case PT_SETREGS:
#else
# define case_PT_SETREGS
#endif
#ifdef PT_GETFPREGS
# define case_PT_GETFPREGS case PT_GETFPREGS:
#else
# define case_PT_GETFPREGS
#endif
#ifdef PT_SETFPREGS
# define case_PT_SETFPREGS case PT_SETFPREGS:
#else
# define case_PT_SETFPREGS
#endif
#ifdef PT_GETDBREGS
# define case_PT_GETDBREGS case PT_GETDBREGS:
#else
# define case_PT_GETDBREGS
#endif
#ifdef PT_SETDBREGS
# define case_PT_SETDBREGS case PT_SETDBREGS:
#else
# define case_PT_SETDBREGS
#endif
static int
ptrace_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result;
#ifdef PT_SETDBREGS
extern int user_set_dbregs;
#endif
result = KAUTH_RESULT_DEFER;
p = arg0;
#if 0
mutex_enter(&ptrace_mtx);
ptrace_cbref++;
mutex_exit(&ptrace_mtx);
#endif
if (action != KAUTH_PROCESS_PTRACE)
goto out;
switch ((u_long)arg1) {
#ifdef PT_SETDBREGS
case_PT_SETDBREGS
if (kauth_cred_getuid(cred) != 0 && user_set_dbregs == 0) {
result = KAUTH_RESULT_DENY;
break;
}
#endif
/* FALLTHROUGH */
case PT_TRACE_ME:
case PT_ATTACH:
case PT_WRITE_I:
case PT_WRITE_D:
case PT_READ_I:
case PT_READ_D:
case PT_IO:
case_PT_GETREGS
case_PT_SETREGS
case_PT_GETFPREGS
case_PT_SETFPREGS
case_PT_GETDBREGS
case PT_SET_EVENT_MASK:
case PT_GET_EVENT_MASK:
case PT_GET_PROCESS_STATE:
case PT_SET_SIGINFO:
case PT_GET_SIGINFO:
#ifdef __HAVE_PTRACE_MACHDEP
PTRACE_MACHDEP_REQUEST_CASES
#endif
if (kauth_cred_getuid(cred) != kauth_cred_getuid(p->p_cred) ||
ISSET(p->p_flag, PK_SUGID)) {
break;
}
result = KAUTH_RESULT_ALLOW;
break;
#ifdef PT_STEP
case PT_STEP:
case PT_SETSTEP:
case PT_CLEARSTEP:
#endif
case PT_CONTINUE:
case PT_KILL:
case PT_DETACH:
case PT_LWPINFO:
case PT_SYSCALL:
case PT_SYSCALLEMU:
case PT_DUMPCORE:
case PT_RESUME:
case PT_SUSPEND:
case PT_STOP:
case PT_LWPSTATUS:
case PT_LWPNEXT:
case PT_SET_SIGPASS:
case PT_GET_SIGPASS:
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
out:
#if 0
mutex_enter(&ptrace_mtx);
if (--ptrace_cbref == 0)
cv_broadcast(&ptrace_cv);
mutex_exit(&ptrace_mtx);
#endif
return result;
}
static struct proc *
ptrace_find(struct lwp *l, int req, pid_t pid)
{
struct proc *t;
/* "A foolish consistency..." XXX */
if (req == PT_TRACE_ME) {
t = l->l_proc;
mutex_enter(t->p_lock);
return t;
}
/* Find the process we're supposed to be operating on. */
t = proc_find(pid);
if (t == NULL)
return NULL;
/* XXX-elad */
mutex_enter(t->p_lock);
int error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE,
t, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
if (error) {
mutex_exit(t->p_lock);
return NULL;
}
return t;
}
static int
ptrace_allowed(struct lwp *l, int req, struct proc *t, struct proc *p,
bool *locked)
{
*locked = false;
/*
* Grab a reference on the process to prevent it from execing or
* exiting.
*/
if (!rw_tryenter(&t->p_reflock, RW_READER))
return EBUSY;
*locked = true;
/* Make sure we can operate on it. */
switch (req) {
case PT_TRACE_ME:
/*
* A process can't ask its parent to start tracing it if:
* (1) the parent is initproc,
*/
if (p->p_pptr == initproc)
return EPERM;
/*
* (2) the process is initproc, or
*/
if (p == initproc)
return EPERM;
/*
* (3) the child is already traced.
*/
if (ISSET(p->p_slflag, PSL_TRACED))
return EBUSY;
return 0;
case PT_ATTACH:
/*
* You can't attach to a process if:
* (1) it's the process that's doing the attaching,
*/
if (t == p)
return EINVAL;
/*
* (2) it's a system process,
*/
if (t->p_flag & PK_SYSTEM)
return EPERM;
/*
* (3) the tracer is initproc,
*/
if (p == initproc)
return EPERM;
/*
* (4) it's already being traced,
*/
if (ISSET(t->p_slflag, PSL_TRACED))
return EBUSY;
/*
* (5) it's a vfork(2)ed parent of the current process, or
*/
if (ISSET(p->p_lflag, PL_PPWAIT) && p->p_pptr == t)
return EPERM;
/*
* (6) the tracer is chrooted, and its root directory is
* not at or above the root directory of the tracee
*/
mutex_exit(t->p_lock); /* XXXSMP */
int tmp = proc_isunder(t, l);
mutex_enter(t->p_lock); /* XXXSMP */
if (!tmp)
return EPERM;
return 0;
case PT_READ_I:
case PT_READ_D:
case PT_WRITE_I:
case PT_WRITE_D:
case PT_IO:
case PT_SET_SIGINFO:
case PT_GET_SIGINFO:
case_PT_GETREGS
case_PT_SETREGS
case_PT_GETFPREGS
case_PT_SETFPREGS
case_PT_GETDBREGS
case_PT_SETDBREGS
#ifdef __HAVE_PTRACE_MACHDEP
PTRACE_MACHDEP_REQUEST_CASES
#endif
/*
* You can't read/write the memory or registers of a process
* if the tracer is chrooted, and its root directory is not at
* or above the root directory of the tracee.
*/
mutex_exit(t->p_lock); /* XXXSMP */
tmp = proc_isunder(t, l);
mutex_enter(t->p_lock); /* XXXSMP */
if (!tmp)
return EPERM;
/*FALLTHROUGH*/
case PT_CONTINUE:
case PT_KILL:
case PT_DETACH:
case PT_LWPINFO:
case PT_SYSCALL:
case PT_SYSCALLEMU:
case PT_DUMPCORE:
#ifdef PT_STEP
case PT_STEP:
case PT_SETSTEP:
case PT_CLEARSTEP:
#endif
case PT_SET_EVENT_MASK:
case PT_GET_EVENT_MASK:
case PT_GET_PROCESS_STATE:
case PT_RESUME:
case PT_SUSPEND:
case PT_STOP:
case PT_LWPSTATUS:
case PT_LWPNEXT:
case PT_SET_SIGPASS:
case PT_GET_SIGPASS:
/*
* You can't do what you want to the process if:
* (1) It's not being traced at all,
*/
if (!ISSET(t->p_slflag, PSL_TRACED))
return EPERM;
/*
* (2) it's not being traced by _you_, or
*/
if (t->p_pptr != p) {
DPRINTF(("parent %d != %d\n", t->p_pptr->p_pid,
p->p_pid));
return EBUSY;
}
/*
* (3) it's not currently stopped.
*
* As an exception allow PT_KILL and PT_STOP here.
*/
if (req != PT_KILL && req != PT_STOP &&
(t->p_stat != SSTOP || !t->p_waited /* XXXSMP */)) {
DPRINTF(("stat %d flag %d\n", t->p_stat,
!t->p_waited));
return EBUSY;
}
return 0;
default: /* It was not a legal request. */
return EINVAL;
}
}
static int
ptrace_needs_hold(int req)
{
switch (req) {
#ifdef PT_STEP
case PT_STEP:
#endif
case PT_CONTINUE:
case PT_DETACH:
case PT_KILL:
case PT_SYSCALL:
case PT_SYSCALLEMU:
case PT_ATTACH:
case PT_TRACE_ME:
case PT_GET_SIGINFO:
case PT_SET_SIGINFO:
case PT_STOP:
return 1;
default:
return 0;
}
}
static int
ptrace_get_siginfo(struct proc *t, struct ptrace_methods *ptm, void *addr,
size_t data)
{
struct ptrace_siginfo psi;
memset(&psi, 0, sizeof(psi));
psi.psi_siginfo._info = t->p_sigctx.ps_info;
psi.psi_lwpid = t->p_sigctx.ps_lwp;
DPRINTF(("%s: lwp=%d signal=%d\n", __func__, psi.psi_lwpid,
psi.psi_siginfo.si_signo));
return ptm->ptm_copyout_siginfo(&psi, addr, data);
}
static int
ptrace_set_siginfo(struct proc *t, struct lwp **lt, struct ptrace_methods *ptm,
void *addr, size_t data)
{
struct ptrace_siginfo psi;
int error = ptm->ptm_copyin_siginfo(&psi, addr, data);
if (error)
return error;
/* Check that the data is a valid signal number or zero. */
if (psi.psi_siginfo.si_signo < 0 || psi.psi_siginfo.si_signo >= NSIG)
return EINVAL;
t->p_sigctx.ps_faked = true;
t->p_sigctx.ps_info = psi.psi_siginfo._info;
t->p_sigctx.ps_lwp = psi.psi_lwpid;
DPRINTF(("%s: lwp=%d signal=%d\n", __func__, psi.psi_lwpid,
psi.psi_siginfo.si_signo));
return 0;
}
static int
ptrace_get_sigpass(struct proc *t, void *addr, size_t data)
{
sigset_t set;
if (data > sizeof(set) || data <= 0) {
DPRINTF(("%s: invalid data: %zu < %zu <= 0\n",
__func__, sizeof(set), data));
return EINVAL;
}
set = t->p_sigctx.ps_sigpass;
return copyout(&set, addr, data);
}
static int
ptrace_set_sigpass(struct proc *t, void *addr, size_t data)
{
sigset_t set;
int error;
if (data > sizeof(set) || data <= 0) {
DPRINTF(("%s: invalid data: %zu < %zu <= 0\n",
__func__, sizeof(set), data));
return EINVAL;
}
memset(&set, 0, sizeof(set));
if ((error = copyin(addr, &set, data)))
return error;
/* We catch SIGSTOP and cannot intercept SIGKILL. */
sigminusset(&sigcantmask, &set);
t->p_sigctx.ps_sigpass = set;
return 0;
}
static int
ptrace_get_event_mask(struct proc *t, void *addr, size_t data)
{
struct ptrace_event pe;
if (data != sizeof(pe)) {
DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(pe)));
return EINVAL;
}
memset(&pe, 0, sizeof(pe));
pe.pe_set_event = ISSET(t->p_slflag, PSL_TRACEFORK) ?
PTRACE_FORK : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACEVFORK) ?
PTRACE_VFORK : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACEVFORK_DONE) ?
PTRACE_VFORK_DONE : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACELWP_CREATE) ?
PTRACE_LWP_CREATE : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACELWP_EXIT) ?
PTRACE_LWP_EXIT : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACEPOSIX_SPAWN) ?
PTRACE_POSIX_SPAWN : 0;
DPRINTF(("%s: lwp=%d event=%#x\n", __func__,
t->p_sigctx.ps_lwp, pe.pe_set_event));
return copyout(&pe, addr, sizeof(pe));
}
static int
ptrace_set_event_mask(struct proc *t, void *addr, size_t data)
{
struct ptrace_event pe;
int error;
if (data != sizeof(pe)) {
DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(pe)));
return EINVAL;
}
if ((error = copyin(addr, &pe, sizeof(pe))) != 0)
return error;
DPRINTF(("%s: lwp=%d event=%#x\n", __func__,
t->p_sigctx.ps_lwp, pe.pe_set_event));
if (pe.pe_set_event & PTRACE_FORK)
SET(t->p_slflag, PSL_TRACEFORK);
else
CLR(t->p_slflag, PSL_TRACEFORK);
if (pe.pe_set_event & PTRACE_VFORK)
SET(t->p_slflag, PSL_TRACEVFORK);
else
CLR(t->p_slflag, PSL_TRACEVFORK);
if (pe.pe_set_event & PTRACE_VFORK_DONE)
SET(t->p_slflag, PSL_TRACEVFORK_DONE);
else
CLR(t->p_slflag, PSL_TRACEVFORK_DONE);
if (pe.pe_set_event & PTRACE_LWP_CREATE)
SET(t->p_slflag, PSL_TRACELWP_CREATE);
else
CLR(t->p_slflag, PSL_TRACELWP_CREATE);
if (pe.pe_set_event & PTRACE_LWP_EXIT)
SET(t->p_slflag, PSL_TRACELWP_EXIT);
else
CLR(t->p_slflag, PSL_TRACELWP_EXIT);
if (pe.pe_set_event & PTRACE_POSIX_SPAWN)
SET(t->p_slflag, PSL_TRACEPOSIX_SPAWN);
else
CLR(t->p_slflag, PSL_TRACEPOSIX_SPAWN);
return 0;
}
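/*
 * Illustrative sketch (added, not original code): from userland a debugger
 * selects which events it wants reported with PT_SET_EVENT_MASK (needs
 * <sys/ptrace.h>; "pid" is the traced process), which ends up in
 * ptrace_set_event_mask() above:
 *
 *	struct ptrace_event pe;
 *
 *	memset(&pe, 0, sizeof(pe));
 *	pe.pe_set_event = PTRACE_FORK | PTRACE_LWP_CREATE;
 *	if (ptrace(PT_SET_EVENT_MASK, pid, &pe, sizeof(pe)) == -1)
 *		err(EXIT_FAILURE, "PT_SET_EVENT_MASK");
 */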
static int
ptrace_get_process_state(struct proc *t, void *addr, size_t data)
{
struct _ksiginfo *si;
struct ptrace_state ps;
if (data != sizeof(ps)) {
DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(ps)));
return EINVAL;
}
if (t->p_sigctx.ps_info._signo != SIGTRAP ||
(t->p_sigctx.ps_info._code != TRAP_CHLD &&
t->p_sigctx.ps_info._code != TRAP_LWP)) {
memset(&ps, 0, sizeof(ps));
} else {
si = &t->p_sigctx.ps_info;
KASSERT(si->_reason._ptrace_state._pe_report_event > 0);
KASSERT(si->_reason._ptrace_state._option._pe_other_pid > 0);
ps.pe_report_event = si->_reason._ptrace_state._pe_report_event;
CTASSERT(sizeof(ps.pe_other_pid) == sizeof(ps.pe_lwp));
ps.pe_other_pid =
si->_reason._ptrace_state._option._pe_other_pid;
}
DPRINTF(("%s: lwp=%d event=%#x pid=%d lwp=%d\n", __func__,
t->p_sigctx.ps_lwp, ps.pe_report_event,
ps.pe_other_pid, ps.pe_lwp));
return copyout(&ps, addr, sizeof(ps));
}
static int
ptrace_lwpinfo(struct proc *t, struct lwp **lt, void *addr, size_t data)
{
struct ptrace_lwpinfo pl;
if (data != sizeof(pl)) {
DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(pl)));
return EINVAL;
}
int error = copyin(addr, &pl, sizeof(pl));
if (error)
return error;
lwpid_t tmp = pl.pl_lwpid;
lwp_delref(*lt);
mutex_enter(t->p_lock);
if (tmp == 0)
*lt = lwp_find_first(t);
else {
*lt = lwp_find(t, tmp);
if (*lt == NULL) {
mutex_exit(t->p_lock);
return ESRCH;
}
*lt = LIST_NEXT(*lt, l_sibling);
}
while (*lt != NULL && (!lwp_alive(*lt) ||
((*lt)->l_flag & LW_SYSTEM) != 0))
*lt = LIST_NEXT(*lt, l_sibling);
pl.pl_lwpid = 0;
pl.pl_event = 0;
if (*lt) {
lwp_addref(*lt);
pl.pl_lwpid = (*lt)->l_lid;
if ((*lt)->l_flag & LW_WSUSPEND)
pl.pl_event = PL_EVENT_SUSPENDED;
/*
* If we match the lwp, or it was sent to every lwp,
* we set PL_EVENT_SIGNAL.
* XXX: ps_lwp == 0 means everyone and no one, so
* check ps_signo too.
*/
else if ((*lt)->l_lid == t->p_sigctx.ps_lwp ||
(t->p_sigctx.ps_lwp == 0 &&
t->p_sigctx.ps_info._signo)) {
DPRINTF(("%s: lwp=%d siglwp=%d signo %d\n", __func__,
pl.pl_lwpid, t->p_sigctx.ps_lwp,
t->p_sigctx.ps_info._signo));
pl.pl_event = PL_EVENT_SIGNAL;
}
}
mutex_exit(t->p_lock);
DPRINTF(("%s: lwp=%d event=%#x\n", __func__,
pl.pl_lwpid, pl.pl_event));
return copyout(&pl, addr, sizeof(pl));
}
static int
ptrace_lwpstatus(struct proc *t, struct ptrace_methods *ptm, struct lwp **lt,
void *addr, size_t data, bool next)
{
struct ptrace_lwpstatus pls;
struct lwp *l;
int error;
if (data > sizeof(pls) || data < sizeof(lwpid_t)) {
DPRINTF(("%s: invalid data: %zu < %zu < %zu\n",
__func__, sizeof(lwpid_t), data, sizeof(pls)));
return EINVAL;
}
error = copyin(addr, &pls.pl_lwpid, sizeof(lwpid_t));
if (error)
return error;
if (next) {
lwp_delref(*lt);
lwpid_t tmp = pls.pl_lwpid;
mutex_enter(t->p_lock);
if (tmp == 0)
*lt = lwp_find_first(t);
else {
*lt = lwp_find(t, tmp);
if (*lt == NULL) {
mutex_exit(t->p_lock);
return ESRCH;
}
*lt = LIST_NEXT(*lt, l_sibling);
}
while (*lt != NULL && (!lwp_alive(*lt) ||
((*lt)->l_flag & LW_SYSTEM) != 0))
*lt = LIST_NEXT(*lt, l_sibling);
if (*lt == NULL) {
memset(&pls, 0, sizeof(pls));
mutex_exit(t->p_lock);
goto out;
}
lwp_addref(*lt);
mutex_exit(t->p_lock);
pls.pl_lwpid = (*lt)->l_lid;
} else {
if ((error = ptrace_update_lwp(t, lt, pls.pl_lwpid)) != 0)
return error;
}
l = *lt;
ptrace_read_lwpstatus(l, &pls);
out:
DPRINTF(("%s: lwp=%d sigpend=%02x%02x%02x%02x sigmask=%02x%02x%02x%02x "
"name='%s' private=%p\n", __func__, pls.pl_lwpid,
pls.pl_sigpend.__bits[0], pls.pl_sigpend.__bits[1],
pls.pl_sigpend.__bits[2], pls.pl_sigpend.__bits[3],
pls.pl_sigmask.__bits[0], pls.pl_sigmask.__bits[1],
pls.pl_sigmask.__bits[2], pls.pl_sigmask.__bits[3],
pls.pl_name, pls.pl_private));
return ptm->ptm_copyout_lwpstatus(&pls, addr, data);
}
static int
ptrace_startstop(struct proc *t, struct lwp **lt, int rq, void *addr,
size_t data)
{
int error;
if ((error = ptrace_update_lwp(t, lt, data)) != 0)
return error;
DPRINTF(("%s: lwp=%d request=%d\n", __func__, (*lt)->l_lid, rq));
lwp_lock(*lt);
if (rq == PT_SUSPEND)
(*lt)->l_flag |= LW_DBGSUSPEND;
else {
(*lt)->l_flag &= ~LW_DBGSUSPEND;
if ((*lt)->l_stat == LSSUSPENDED)
(*lt)->l_stat = LSSTOP;
}
lwp_unlock(*lt);
return 0;
}
#ifdef PT_REGISTERS
static int
ptrace_uio_dir(int req)
{
switch (req) {
case_PT_GETREGS
case_PT_GETFPREGS
case_PT_GETDBREGS
return UIO_READ;
case_PT_SETREGS
case_PT_SETFPREGS
case_PT_SETDBREGS
return UIO_WRITE;
default:
return -1;
}
}
static int
ptrace_regs(struct lwp *l, struct lwp **lt, int rq, struct ptrace_methods *ptm,
void *addr, size_t data)
{
int error;
struct proc *p, *t;
struct vmspace *vm;
p = l->l_proc; /* tracer */
t = (*lt)->l_proc; /* traced */
if ((error = ptrace_update_lwp(t, lt, data)) != 0)
return error;
int dir = ptrace_uio_dir(rq);
size_t size;
int (*func)(struct lwp *, struct lwp *, struct uio *);
DPRINTF(("%s: lwp=%d request=%d\n", __func__, l->l_lid, rq));
switch (rq) {
#if defined(PT_SETREGS) || defined(PT_GETREGS)
case_PT_GETREGS
case_PT_SETREGS
if (!process_validregs(*lt))
return EINVAL;
size = PROC_REGSZ(p);
func = ptm->ptm_doregs;
break;
#endif
#if defined(PT_SETFPREGS) || defined(PT_GETFPREGS)
case_PT_GETFPREGS
case_PT_SETFPREGS
if (!process_validfpregs(*lt))
return EINVAL;
size = PROC_FPREGSZ(p);
func = ptm->ptm_dofpregs;
break;
#endif
#if defined(PT_SETDBREGS) || defined(PT_GETDBREGS)
case_PT_GETDBREGS
case_PT_SETDBREGS
if (!process_validdbregs(*lt))
return EINVAL;
size = PROC_DBREGSZ(p);
func = ptm->ptm_dodbregs;
break;
#endif
default:
return EINVAL;
}
error = proc_vmspace_getref(l->l_proc, &vm);
if (error)
return error;
struct uio uio;
struct iovec iov;
iov.iov_base = addr;
iov.iov_len = size;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = 0;
uio.uio_resid = iov.iov_len;
uio.uio_rw = dir;
uio.uio_vmspace = vm;
error = (*func)(l, *lt, &uio);
uvmspace_free(vm);
return error;
}
#endif
static int
ptrace_sendsig(struct lwp *l, int req, struct proc *t, struct lwp *lt,
int signo, int resume_all)
{
ksiginfo_t ksi;
/* Finally, deliver the requested signal (or none). */
if (t->p_stat == SSTOP) {
/*
* Unstop the process. If it needs to take a
* signal, make all efforts to ensure that at
* an LWP runs to see it.
*/
t->p_xsig = signo;
/*
* The signo > 0 check prevents a potential panic, as
* sigismember(&..., 0) is an invalid check and signo
* can be 0 as the special "no signal" case.
*/
if (signo > 0 && sigismember(&stopsigmask, signo)) {
t->p_waited = 0;
child_psignal(t, 0);
} else if (resume_all)
proc_unstop(t);
else
lwp_unstop(lt);
return 0;
}
KASSERT(req == PT_KILL || req == PT_STOP || req == PT_ATTACH);
KSI_INIT(&ksi);
ksi.ksi_signo = signo;
ksi.ksi_code = SI_USER;
ksi.ksi_pid = l->l_proc->p_pid;
ksi.ksi_uid = kauth_cred_geteuid(l->l_cred);
t->p_sigctx.ps_faked = false;
DPRINTF(("%s: pid=%d.%d signal=%d resume_all=%d\n", __func__, t->p_pid,
lt->l_lid, signo, resume_all));
return kpsignal2(t, &ksi);
}
static int
ptrace_dumpcore(struct lwp *lt, char *path, size_t len)
{
int error;
if (path != NULL) {
if (len >= MAXPATHLEN)
return EINVAL;
char *src = path;
path = kmem_alloc(len + 1, KM_SLEEP);
error = copyin(src, path, len);
if (error)
goto out;
path[len] = '\0';
}
DPRINTF(("%s: lwp=%d\n", __func__, lt->l_lid));
MODULE_HOOK_CALL(coredump_hook, (lt, path), 0, error);
out:
if (path)
kmem_free(path, len + 1);
return error;
}
static int
ptrace_doio(struct lwp *l, struct proc *t, struct lwp *lt,
struct ptrace_io_desc *piod, void *addr, bool sysspace)
{
struct uio uio;
struct iovec iov;
int error, tmp;
error = 0;
iov.iov_base = piod->piod_addr;
iov.iov_len = piod->piod_len;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)(unsigned long)piod->piod_offs;
uio.uio_resid = piod->piod_len;
DPRINTF(("%s: lwp=%d request=%d\n", __func__, l->l_lid, piod->piod_op));
switch (piod->piod_op) {
case PIOD_READ_D:
case PIOD_READ_I:
uio.uio_rw = UIO_READ;
break;
case PIOD_WRITE_D:
case PIOD_WRITE_I:
/*
* Can't write to a RAS
*/
if (ras_lookup(t, addr) != (void *)-1) {
return EACCES;
}
uio.uio_rw = UIO_WRITE;
break;
case PIOD_READ_AUXV:
uio.uio_rw = UIO_READ;
tmp = t->p_execsw->es_arglen;
if (uio.uio_offset > tmp)
return EIO;
if (uio.uio_resid > tmp - uio.uio_offset)
uio.uio_resid = tmp - uio.uio_offset;
piod->piod_len = iov.iov_len = uio.uio_resid;
error = process_auxv_offset(t, &uio);
break;
default:
error = EINVAL;
break;
}
if (error)
return error;
if (sysspace) {
uio.uio_vmspace = vmspace_kernel();
} else {
error = proc_vmspace_getref(l->l_proc, &uio.uio_vmspace);
if (error)
return error;
}
error = process_domem(l, lt, &uio);
if (!sysspace)
uvmspace_free(uio.uio_vmspace);
if (error)
return error;
piod->piod_len -= uio.uio_resid;
return 0;
}
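/*
* Common ptrace(2) back-end: look up the target process, perform the
* kauth permission checks, take whatever locks the request requires and
* dispatch on the request code.
*/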
int
do_ptrace(struct ptrace_methods *ptm, struct lwp *l, int req, pid_t pid,
void *addr, int data, register_t *retval)
{
struct proc *p = l->l_proc;
struct lwp *lt = NULL;
struct lwp *lt2;
struct proc *t; /* target process */
struct ptrace_io_desc piod;
int error, write, tmp, pheld;
int signo = 0;
int resume_all;
bool locked;
error = 0;
/*
* If attaching or detaching, we need to get a write hold on the
* proclist lock so that we can re-parent the target process.
*/
mutex_enter(&proc_lock);
t = ptrace_find(l, req, pid);
if (t == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
pheld = 1;
if ((error = ptrace_allowed(l, req, t, p, &locked)) != 0)
goto out;
if ((error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_PTRACE, t, KAUTH_ARG(req), NULL, NULL)) != 0)
goto out;
if ((lt = lwp_find_first(t)) == NULL) {
error = ESRCH;
goto out;
}
/* Do single-step fixup if needed. */
FIX_SSTEP(t);
KASSERT(lt != NULL);
lwp_addref(lt);
/*
* Which locks do we need held? XXX Ugly.
*/
if ((pheld = ptrace_needs_hold(req)) == 0) {
mutex_exit(t->p_lock);
mutex_exit(&proc_lock);
}
/* Now do the operation. */
write = 0;
*retval = 0;
tmp = 0;
resume_all = 1;
switch (req) {
case PT_TRACE_ME:
/* Just set the trace flag. */
SET(t->p_slflag, PSL_TRACED);
t->p_opptr = t->p_pptr;
break;
/*
* The separate I and D address spaces were inherited from the PDP-11.
* 16-bit UNIX started with a single address space per program, but was
* later extended to two 16-bit (2 x 64kB) address spaces.
*
* None of the currently supported architectures maintains this feature,
* but we keep the API for backward compatibility. The I and D
* operations are now identical and debuggers do not distinguish them.
*/
case PT_WRITE_I:
case PT_WRITE_D:
write = 1;
tmp = data;
/* FALLTHROUGH */
case PT_READ_I:
case PT_READ_D:
piod.piod_addr = &tmp;
piod.piod_len = sizeof(tmp);
piod.piod_offs = addr;
piod.piod_op = write ? PIOD_WRITE_D : PIOD_READ_D;
if ((error = ptrace_doio(l, t, lt, &piod, addr, true)) != 0)
break;
/*
* For legacy reasons we treat two results here as success:
* - incomplete transfer: piod.piod_len < sizeof(tmp)
* - no transfer: piod.piod_len == 0
*
* This means that there is no way to determine whether the
* transfer was actually performed in PT_WRITE and PT_READ
* calls.
*/
if (!write)
*retval = tmp;
break;
case PT_IO:
if ((error = ptm->ptm_copyin_piod(&piod, addr, data)) != 0)
break;
if (piod.piod_len < 1) {
error = EINVAL;
break;
}
if ((error = ptrace_doio(l, t, lt, &piod, addr, false)) != 0)
break;
/*
* For legacy reasons we treat two results here as success:
* - incomplete transfer: piod.piod_len < requested length
* - no transfer: piod.piod_len == 0
*/
error = ptm->ptm_copyout_piod(&piod, addr, data);
break;
case PT_DUMPCORE:
error = ptrace_dumpcore(lt, addr, data);
break;
#ifdef PT_STEP
case PT_STEP:
/*
* From the 4.4BSD PRM:
* "Execution continues as in request PT_CONTINUE; however
* as soon as possible after execution of at least one
* instruction, execution stops again. [ ... ]"
*/
#endif
case PT_CONTINUE:
case PT_SYSCALL:
case PT_DETACH:
if (req == PT_SYSCALL) {
if (!ISSET(t->p_slflag, PSL_SYSCALL)) {
SET(t->p_slflag, PSL_SYSCALL);
#ifdef __HAVE_SYSCALL_INTERN
(*t->p_emul->e_syscall_intern)(t);
#endif
}
} else {
if (ISSET(t->p_slflag, PSL_SYSCALL)) {
CLR(t->p_slflag, PSL_SYSCALL);
#ifdef __HAVE_SYSCALL_INTERN
(*t->p_emul->e_syscall_intern)(t);
#endif
}
}
t->p_trace_enabled = trace_is_enabled(t);
/*
* Pick up the LWPID, if supplied. There are two cases:
* data < 0 : step or continue single thread, lwp = -data
* data > 0 in PT_STEP : step this thread, continue others
* For operations other than PT_STEP, data > 0 means
* data is the signo to deliver to the process.
*/
tmp = data;
if (tmp >= 0) {
#ifdef PT_STEP
if (req == PT_STEP)
signo = 0;
else
#endif
{
signo = tmp;
tmp = 0; /* don't search for LWP */
}
} else if (tmp == INT_MIN) {
error = ESRCH;
break;
} else {
tmp = -tmp;
}
if (tmp > 0) {
if (req == PT_DETACH) {
error = EINVAL;
break;
}
lwp_delref2(lt);
lt = lwp_find(t, tmp);
if (lt == NULL) {
error = ESRCH;
break;
}
lwp_addref(lt);
resume_all = 0;
signo = 0;
}
/*
* From the 4.4BSD PRM:
* "The data argument is taken as a signal number and the
* child's execution continues at location addr as if it
* incurred that signal. Normally the signal number will
* be either 0 to indicate that the signal that caused the
* stop should be ignored, or that value fetched out of
* the process's image indicating which signal caused
* the stop. If addr is (int *)1 then execution continues
* from where it stopped."
*/
/* Check that the data is a valid signal number or zero. */
if (signo < 0 || signo >= NSIG) {
error = EINVAL;
break;
}
/* Prevent a deadlock: refuse to resume if every LWP to be resumed is suspended. */
if (resume_all) {
#ifdef PT_STEP
if (req == PT_STEP) {
if (lt->l_flag &
(LW_WSUSPEND | LW_DBGSUSPEND)) {
error = EDEADLK;
break;
}
} else
#endif
{
error = EDEADLK;
LIST_FOREACH(lt2, &t->p_lwps, l_sibling) {
if ((lt2->l_flag &
(LW_WSUSPEND | LW_DBGSUSPEND)) == 0) {
error = 0;
break;
}
}
if (error != 0)
break;
}
} else {
if (lt->l_flag & (LW_WSUSPEND | LW_DBGSUSPEND)) {
error = EDEADLK;
break;
}
}
/*
* Reject setting the program counter to 0x0 if VA0 is disabled.
*
* Not all kernels allow the program counter to be set in the
* same PT_CONTINUE (or similar) operation. This causes
* portability issues, as passing address 0x0 is a no-op on
* those kernels but will usually fail on NetBSD.
*/
if (user_va0_disable && addr == 0) {
error = EINVAL;
break;
}
/* If the address parameter is not (int *)1, set the pc. */
if ((int *)addr != (int *)1) {
error = process_set_pc(lt, addr);
if (error != 0)
break;
}
#ifdef PT_STEP
/*
* Arrange for a single-step, if that's requested and possible.
* More precisely, set the single step status as requested for
* the requested thread, and clear it for other threads.
*/
LIST_FOREACH(lt2, &t->p_lwps, l_sibling) {
error = process_sstep(lt2,
ISSET(lt2->l_pflag, LP_SINGLESTEP));
if (error)
break;
}
if (error)
break;
error = process_sstep(lt,
ISSET(lt->l_pflag, LP_SINGLESTEP) || req == PT_STEP);
if (error)
break;
#endif
if (req == PT_DETACH) {
CLR(t->p_slflag,
PSL_TRACED|PSL_TRACEDCHILD|PSL_SYSCALL);
/* clear sigpass mask */
sigemptyset(&t->p_sigctx.ps_sigpass);
/* give process back to original parent or init */
if (t->p_opptr != t->p_pptr) {
struct proc *pp = t->p_opptr;
proc_reparent(t, pp ? pp : initproc);
}
/* not being traced any more */
t->p_opptr = NULL;
/* clear single step */
LIST_FOREACH(lt2, &t->p_lwps, l_sibling) {
CLR(lt2->l_pflag, LP_SINGLESTEP);
}
CLR(lt->l_pflag, LP_SINGLESTEP);
}
sendsig:
error = ptrace_sendsig(l, req, t, lt, signo, resume_all);
break;
case PT_SYSCALLEMU:
if (!ISSET(t->p_slflag, PSL_SYSCALL) || t->p_stat != SSTOP) {
error = EINVAL;
break;
}
SET(t->p_slflag, PSL_SYSCALLEMU);
break;
#ifdef PT_STEP
case PT_SETSTEP:
write = 1;
/* FALLTHROUGH */
case PT_CLEARSTEP:
/* write = 0 done above. */
if ((error = ptrace_update_lwp(t, &lt, data)) != 0)
break;
if (write)
SET(lt->l_pflag, LP_SINGLESTEP);
else
CLR(lt->l_pflag, LP_SINGLESTEP);
break;
#endif
case PT_KILL:
/* just send the process a KILL signal. */
signo = SIGKILL;
goto sendsig; /* in PT_CONTINUE, above. */
case PT_STOP:
/* just send the process a STOP signal. */
signo = SIGSTOP;
goto sendsig; /* in PT_CONTINUE, above. */
case PT_ATTACH:
/*
* Go ahead and set the trace flag.
* Save the old parent (it's reset in
* _DETACH, and also in kern_exit.c:wait4()).
* Reparent the process so that the tracing
* proc gets to see all the action.
* Stop the target.
*/
proc_changeparent(t, p);
signo = SIGSTOP;
goto sendsig;
case PT_GET_EVENT_MASK:
error = ptrace_get_event_mask(t, addr, data);
break;
case PT_SET_EVENT_MASK:
error = ptrace_set_event_mask(t, addr, data);
break;
case PT_GET_PROCESS_STATE:
error = ptrace_get_process_state(t, addr, data);
break;
case PT_LWPINFO:
error = ptrace_lwpinfo(t, &lt, addr, data);
break;
case PT_SET_SIGINFO:
error = ptrace_set_siginfo(t, &lt, ptm, addr, data);
break;
case PT_GET_SIGINFO:
error = ptrace_get_siginfo(t, ptm, addr, data);
break;
case PT_RESUME:
case PT_SUSPEND:
error = ptrace_startstop(t, &lt, req, addr, data);
break;
case PT_LWPSTATUS:
error = ptrace_lwpstatus(t, ptm, &lt, addr, data, false);
break;
case PT_LWPNEXT:
error = ptrace_lwpstatus(t, ptm, &lt, addr, data, true);
break;
case PT_SET_SIGPASS:
error = ptrace_set_sigpass(t, addr, data);
break;
case PT_GET_SIGPASS:
error = ptrace_get_sigpass(t, addr, data);
break;
#ifdef PT_REGISTERS
case_PT_SETREGS
case_PT_GETREGS
case_PT_SETFPREGS
case_PT_GETFPREGS
case_PT_SETDBREGS
case_PT_GETDBREGS
error = ptrace_regs(l, &lt, req, ptm, addr, data);
break;
#endif
#ifdef __HAVE_PTRACE_MACHDEP
PTRACE_MACHDEP_REQUEST_CASES
error = ptrace_machdep_dorequest(l, &lt, req, addr, data);
break;
#endif
}
out:
if (pheld) {
mutex_exit(t->p_lock);
mutex_exit(&proc_lock);
}
if (lt != NULL)
lwp_delref(lt);
if (locked)
rw_exit(&t->p_reflock);
return error;
}
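/*
* Adjust the uio offset to the start of the target's auxiliary vector,
* which immediately follows the NULL-terminated environment pointer
* array; used by PIOD_READ_AUXV transfers.
*/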
static int
process_auxv_offset(struct proc *p, struct uio *uio)
{
struct ps_strings pss;
int error;
off_t off = (off_t)p->p_psstrp;
if ((error = copyin_psstrings(p, &pss)) != 0)
return error;
if (pss.ps_envstr == NULL)
return EIO;
#ifdef COMPAT_NETBSD32
if (p->p_flag & PK_32)
uio->uio_offset += (off_t)((vaddr_t)pss.ps_envstr +
sizeof(uint32_t) * (pss.ps_nenvstr + 1));
else
#endif
uio->uio_offset += (off_t)(vaddr_t)(pss.ps_envstr +
pss.ps_nenvstr + 1);
#ifdef __MACHINE_STACK_GROWS_UP
if (uio->uio_offset < off)
return EIO;
#else
if (uio->uio_offset > off)
return EIO;
if ((uio->uio_offset + uio->uio_resid) > off)
uio->uio_resid = off - uio->uio_offset;
#endif
return 0;
}
MODULE(MODULE_CLASS_EXEC, ptrace_common, NULL);
static int
ptrace_common_init(void)
{
#if 0
mutex_init(&ptrace_mtx, MUTEX_DEFAULT, IPL_NONE);
cv_init(&ptrace_cv, "ptracecb");
ptrace_cbref = 0;
#endif
ptrace_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
ptrace_listener_cb, NULL);
return 0;
}
static int
ptrace_common_fini(void)
{
kauth_unlisten_scope(ptrace_listener);
#if 0
/* Make sure no-one is executing our kauth listener */
mutex_enter(&ptrace_mtx);
while (ptrace_cbref != 0)
cv_wait(&ptrace_cv, &ptrace_mtx);
mutex_exit(&ptrace_mtx);
mutex_destroy(&ptrace_mtx);
cv_destroy(&ptrace_cv);
#endif
return 0;
}
static int
ptrace_common_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = ptrace_common_init();
break;
case MODULE_CMD_FINI:
error = ptrace_common_fini();
break;
default:
ptrace_hooks();
error = ENOTTY;
break;
}
return error;
}
/* $NetBSD: genfs_vfsops.c,v 1.11 2022/07/08 07:42:06 hannken Exp $ */
/*-
* Copyright (c) 2008, 2009, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_vfsops.c,v 1.11 2022/07/08 07:42:06 hannken Exp $");
#include <sys/types.h>
#include <sys/mount.h>
#include <sys/fstrans.h>
#include <sys/statvfs.h>
#include <sys/vnode.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
int
genfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
sbp->f_bsize = DEV_BSIZE;
sbp->f_frsize = DEV_BSIZE;
sbp->f_iosize = DEV_BSIZE;
sbp->f_blocks = 2; /* 1k to keep df happy */
sbp->f_bfree = 0;
sbp->f_bavail = 0;
sbp->f_bresvd = 0;
sbp->f_files = 0;
sbp->f_ffree = 0;
sbp->f_favail = 0;
sbp->f_fresvd = 0;
copy_statvfs_info(sbp, mp);
return 0;
}
int
genfs_renamelock_enter(struct mount *mp)
{
mutex_enter(mp->mnt_renamelock);
/* Preserve possible error return in case we become interruptible. */
return 0;
}
void
genfs_renamelock_exit(struct mount *mp)
{
mutex_exit(mp->mnt_renamelock);
}
int
genfs_suspendctl(struct mount *mp, int cmd)
{
int error;
switch (cmd) {
case SUSPEND_SUSPEND:
error = fstrans_setstate(mp, FSTRANS_SUSPENDING);
if (error)
return error;
error = fstrans_setstate(mp, FSTRANS_SUSPENDED);
return error;
case SUSPEND_RESUME:
error = fstrans_setstate(mp, FSTRANS_NORMAL);
KASSERT(error == 0);
return 0;
default:
panic("%s: bogus command %d", __func__, cmd);
}
}
/* $NetBSD: ip6_output.c,v 1.235 2024/04/19 00:55:35 riastradh Exp $ */
/* $KAME: ip6_output.c,v 1.172 2001/03/25 09:55:56 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip6_output.c,v 1.235 2024/04/19 00:55:35 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#endif
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/route.h>
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet/ip_var.h>
#include <netinet/icmp6.h>
#include <netinet/in_offload.h>
#include <netinet/portalgo.h>
#include <netinet6/in6_offload.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/scope6_var.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/key.h>
#endif
extern pfil_head_t *inet6_pfil_hook; /* XXX */
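/*
* Per-packet extension headers built from the passed options; each
* member is a separate mbuf that is later spliced into the outgoing
* packet's mbuf chain.
*/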
struct ip6_exthdrs {
struct mbuf *ip6e_ip6;
struct mbuf *ip6e_hbh;
struct mbuf *ip6e_dest1;
struct mbuf *ip6e_rthdr;
struct mbuf *ip6e_dest2;
};
static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
kauth_cred_t, int);
static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *);
static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, kauth_cred_t,
int, int, int);
static int ip6_setmoptions(const struct sockopt *, struct inpcb *);
static int ip6_getmoptions(struct sockopt *, struct inpcb *);
static int ip6_copyexthdr(struct mbuf **, void *, int);
static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
struct ip6_frag **);
static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
static int ip6_getpmtu(struct rtentry *, struct ifnet *, u_long *, int *);
static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
static int ip6_ifaddrvalid(const struct in6_addr *, const struct in6_addr *);
static int ip6_handle_rthdr(struct ip6_rthdr *, struct ip6_hdr *);
#ifdef RFC2292
static int ip6_pcbopts(struct ip6_pktopts **, struct socket *, struct sockopt *);
#endif
static int
ip6_handle_rthdr(struct ip6_rthdr *rh, struct ip6_hdr *ip6)
{
int error = 0;
switch (rh->ip6r_type) {
case IPV6_RTHDR_TYPE_0:
/* Dropped, RFC5095. */
default: /* is it possible? */
error = EINVAL;
}
return error;
}
/*
* Send an IP packet to a host.
*/
int
ip6_if_output(struct ifnet * const ifp, struct ifnet * const origifp,
struct mbuf * const m, const struct sockaddr_in6 * const dst,
const struct rtentry *rt)
{
int error = 0;
if (rt != NULL) {
error = rt_check_reject_route(rt, ifp);
if (error != 0) {
IP6_STATINC(IP6_STAT_RTREJECT);
m_freem(m);
return error;
}
}
/* discard the packet if IPv6 operation is disabled on the interface */
if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) {
m_freem(m);
return ENETDOWN; /* better error? */
}
if ((ifp->if_flags & IFF_LOOPBACK) != 0)
error = if_output_lock(ifp, origifp, m, sin6tocsa(dst), rt);
else
error = if_output_lock(ifp, ifp, m, sin6tocsa(dst), rt);
return error;
}
/*
* IP6 output. The packet in mbuf chain m contains a skeletal IP6
* header (with pri, len, nxt, hlim, src, dst).
*
* This function may modify ver and hlim only. The mbuf chain containing the
* packet will be freed. The mbuf opt, if present, will not be freed.
*
* Type of "mtu": rt_rmx.rmx_mtu is u_long, ifnet.ifr_mtu is int, and
* nd_ifinfo.linkmtu is u_int32_t. So we use u_long to hold the largest
* of them, which is rt_rmx.rmx_mtu.
*/
int
ip6_output(
struct mbuf *m0,
struct ip6_pktopts *opt,
struct route *ro,
int flags,
struct ip6_moptions *im6o,
struct inpcb *inp,
struct ifnet **ifpp /* XXX: just for statistics */
)
{
struct ip6_hdr *ip6, *mhip6;
struct ifnet *ifp = NULL, *origifp = NULL;
struct mbuf *m = m0;
int tlen, len, off;
bool tso;
struct route ip6route;
struct rtentry *rt = NULL, *rt_pmtu;
const struct sockaddr_in6 *dst;
struct sockaddr_in6 src_sa, dst_sa;
int error = 0;
struct in6_ifaddr *ia = NULL;
u_long mtu;
int alwaysfrag, dontfrag;
u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
struct ip6_exthdrs exthdrs;
struct in6_addr finaldst, src0, dst0;
u_int32_t zone;
struct route *ro_pmtu = NULL;
int hdrsplit = 0;
int needipsec = 0;
#ifdef IPSEC
struct secpolicy *sp = NULL;
#endif
struct psref psref, psref_ia;
int bound = curlwp_bind();
bool release_psref_ia = false;
#ifdef DIAGNOSTIC
if ((m->m_flags & M_PKTHDR) == 0)
panic("ip6_output: no HDR");
if ((m->m_pkthdr.csum_flags &
(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_TSOv4)) != 0) {
panic("ip6_output: IPv4 checksum offload flags: %d",
m->m_pkthdr.csum_flags);
}
if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) ==
(M_CSUM_TCPv6|M_CSUM_UDPv6)) {
panic("ip6_output: conflicting checksum offload flags: %d",
m->m_pkthdr.csum_flags);
}
#endif
M_CSUM_DATA_IPv6_SET(m->m_pkthdr.csum_data, sizeof(struct ip6_hdr));
#define MAKE_EXTHDR(hp, mp) \
do { \
if (hp) { \
struct ip6_ext *eh = (struct ip6_ext *)(hp); \
error = ip6_copyexthdr((mp), (void *)(hp), \
((eh)->ip6e_len + 1) << 3); \
if (error) \
goto freehdrs; \
} \
} while (/*CONSTCOND*/ 0)
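/*
* MAKE_EXTHDR copies a caller-supplied extension header (hp) into a
* freshly allocated mbuf (*mp); the length is derived from the header's
* ip6e_len field, which counts 8-byte units beyond the first 8 bytes.
*/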
memset(&exthdrs, 0, sizeof(exthdrs));
if (opt) {
/* Hop-by-Hop options header */
MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
/* Destination options header (1st part) */
MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
/* Routing header */
MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
/* Destination options header (2nd part) */
MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
}
/*
* Calculate the total length of the extension header chain.
* Keep the length of the unfragmentable part for fragmentation.
*/
optlen = 0;
if (exthdrs.ip6e_hbh)
optlen += exthdrs.ip6e_hbh->m_len;
if (exthdrs.ip6e_dest1)
optlen += exthdrs.ip6e_dest1->m_len;
if (exthdrs.ip6e_rthdr)
optlen += exthdrs.ip6e_rthdr->m_len;
unfragpartlen = optlen + sizeof(struct ip6_hdr);
/* NOTE: we don't add AH/ESP length here. do that later. */
if (exthdrs.ip6e_dest2)
optlen += exthdrs.ip6e_dest2->m_len;
#ifdef IPSEC
if (ipsec_used) {
/* Check the security policy (SP) for the packet */
sp = ipsec6_check_policy(m, inp, flags, &needipsec, &error);
if (error != 0) {
/*
* Hack: -EINVAL is used to signal that a packet
* should be silently discarded. This is typically
* because we asked key management for an SA and
* it was delayed (e.g. kicked up to IKE).
*/
if (error == -EINVAL)
error = 0;
IP6_STATINC(IP6_STAT_IPSECDROP_OUT);
goto freehdrs;
}
}
#endif
if (needipsec &&
(m->m_pkthdr.csum_flags & (M_CSUM_UDPv6|M_CSUM_TCPv6)) != 0) {
in6_undefer_cksum_tcpudp(m);
m->m_pkthdr.csum_flags &= ~(M_CSUM_UDPv6|M_CSUM_TCPv6);
}
/*
* If we need IPsec, or there is at least one extension header,
* separate IP6 header from the payload.
*/
if ((needipsec || optlen) && !hdrsplit) {
if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
IP6_STATINC(IP6_STAT_ODROPPED);
m = NULL;
goto freehdrs;
}
m = exthdrs.ip6e_ip6;
hdrsplit++;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
/* adjust mbuf packet header length */
m->m_pkthdr.len += optlen;
plen = m->m_pkthdr.len - sizeof(*ip6);
/* If this is a jumbo payload, insert a jumbo payload option. */
if (plen > IPV6_MAXPACKET) {
if (!hdrsplit) {
if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
IP6_STATINC(IP6_STAT_ODROPPED);
m = NULL;
goto freehdrs;
}
m = exthdrs.ip6e_ip6;
hdrsplit++;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto freehdrs;
}
optlen += 8; /* XXX JUMBOOPTLEN */
ip6->ip6_plen = 0;
} else
ip6->ip6_plen = htons(plen);
/*
* Concatenate headers and fill in next header fields.
* Here we have, on "m"
* IPv6 payload
* and we insert headers accordingly. Finally, we should be getting:
* IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
*
* During the header composition process, "m" points to the IPv6
* header and "mprev" points to the extension header prior to esp.
*/
{
u_char *nexthdrp = &ip6->ip6_nxt;
struct mbuf *mprev = m;
/*
* we treat dest2 specially. this makes IPsec processing
* much easier. the goal here is to make mprev point to the
* mbuf prior to dest2.
*
* result: IPv6 dest2 payload
* m and mprev will point to IPv6 header.
*/
if (exthdrs.ip6e_dest2) {
if (!hdrsplit)
panic("assumption failed: hdr not split");
exthdrs.ip6e_dest2->m_next = m->m_next;
m->m_next = exthdrs.ip6e_dest2;
*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
ip6->ip6_nxt = IPPROTO_DSTOPTS;
}
#define MAKE_CHAIN(m, mp, p, i)\
do {\
if (m) {\
if (!hdrsplit) \
panic("assumption failed: hdr not split"); \
*mtod((m), u_char *) = *(p);\
*(p) = (i);\
p = mtod((m), u_char *);\
(m)->m_next = (mp)->m_next;\
(mp)->m_next = (m);\
(mp) = (m);\
}\
} while (/*CONSTCOND*/ 0)
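/*
* MAKE_CHAIN links extension header mbuf "m" after mbuf "mp", copies the
* previous next-header value into the new header, stores protocol "i" in
* the preceding header through "p", and advances "p" to point at the new
* header's next-header field.
*/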
/*
* result: IPv6 hbh dest1 rthdr dest2 payload
* m will point to IPv6 header. mprev will point to the
* extension header prior to dest2 (rthdr in the above case).
*/
MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
IPPROTO_DSTOPTS);
MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
IPPROTO_ROUTING);
M_CSUM_DATA_IPv6_SET(m->m_pkthdr.csum_data,
sizeof(struct ip6_hdr) + optlen);
}
/* Need to save for pmtu */
finaldst = ip6->ip6_dst;
/*
* If there is a routing header, replace destination address field
* with the first hop of the routing header.
*/
if (exthdrs.ip6e_rthdr) {
struct ip6_rthdr *rh;
rh = mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *);
error = ip6_handle_rthdr(rh, ip6);
if (error != 0) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto bad;
}
}
/* Source address validation */
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
(flags & IPV6_UNSPECSRC) == 0) {
error = EOPNOTSUPP;
IP6_STATINC(IP6_STAT_BADSCOPE);
goto bad;
}
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
error = EOPNOTSUPP;
IP6_STATINC(IP6_STAT_BADSCOPE);
goto bad;
}
IP6_STATINC(IP6_STAT_LOCALOUT);
/*
* Route packet.
*/
/* initialize cached route */
if (ro == NULL) {
memset(&ip6route, 0, sizeof(ip6route));
ro = &ip6route;
}
ro_pmtu = ro;
if (opt && opt->ip6po_rthdr)
ro = &opt->ip6po_route;
/*
* if specified, try to fill in the traffic class field.
* do not override if a non-zero value is already set.
* we check the diffserv field and the ecn field separately.
*/
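/*
* In the IPv6 header the 8-bit traffic class sits between the 4-bit
* version and the 20-bit flow label; 0xfc masks its DSCP bits and 0x03
* its ECN bits.
*/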
if (opt && opt->ip6po_tclass >= 0) {
int mask = 0;
if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
mask |= 0xfc;
if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
mask |= 0x03;
if (mask != 0)
ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
}
/* fill in or override the hop limit field, if necessary. */
if (opt && opt->ip6po_hlim != -1)
ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
if (im6o != NULL)
ip6->ip6_hlim = im6o->im6o_multicast_hlim;
else
ip6->ip6_hlim = ip6_defmcasthlim;
}
#ifdef IPSEC
if (needipsec) {
error = ipsec6_process_packet(m, sp->req, flags);
/*
* Preserve KAME behaviour: ENOENT can be returned
* when an SA acquire is in progress. Don't propagate
* this to user-level; it confuses applications.
* XXX this will go away when the SADB is redone.
*/
if (error == ENOENT)
error = 0;
goto done;
}
#endif
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
sockaddr_in6_init(&dst_sa, &ip6->ip6_dst, 0, 0, 0);
/* We do not need a route for multicast */
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
struct in6_pktinfo *pi = NULL;
/*
* If the outgoing interface for the address is specified by
* the caller, use it.
*/
if (opt && (pi = opt->ip6po_pktinfo) != NULL) {
/* XXX boundary check is assumed to be already done. */
ifp = if_get_byindex(pi->ipi6_ifindex, &psref);
} else if (im6o != NULL) {
ifp = if_get_byindex(im6o->im6o_multicast_if_index,
&psref);
}
}
if (ifp == NULL) {
error = in6_selectroute(&dst_sa, opt, &ro, &rt, true);
if (error != 0)
goto bad;
ifp = if_get_byindex(rt->rt_ifp->if_index, &psref);
}
if (rt == NULL) {
/*
* If in6_selectroute() does not return a route entry,
* dst may not have been updated.
*/
error = rtcache_setdst(ro, sin6tosa(&dst_sa));
if (error) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto bad;
}
}
/*
* At this point rt (for unicast) and ifp must be valid (non-NULL).
*/
if ((flags & IPV6_FORWARDING) == 0) {
/* XXX: the FORWARDING flag can be set for mrouting. */
in6_ifstat_inc(ifp, ifs6_out_request);
}
if (rt != NULL) {
ia = (struct in6_ifaddr *)(rt->rt_ifa);
rt->rt_use++;
}
/*
* The outgoing interface must be in the zone of source and
* destination addresses. We should use ia_ifp to support the
* case of sending packets to an address of our own.
*/
if (ia != NULL) {
origifp = ia->ia_ifp;
if (if_is_deactivated(origifp)) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto bad;
}
if_acquire(origifp, &psref_ia);
release_psref_ia = true;
} else
origifp = ifp;
src0 = ip6->ip6_src;
if (in6_setscope(&src0, origifp, &zone))
goto badscope;
sockaddr_in6_init(&src_sa, &ip6->ip6_src, 0, 0, 0);
if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id)
goto badscope;
dst0 = ip6->ip6_dst;
if (in6_setscope(&dst0, origifp, &zone))
goto badscope;
/* re-initialize to be sure */
sockaddr_in6_init(&dst_sa, &ip6->ip6_dst, 0, 0, 0);
if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id)
goto badscope;
/* scope check is done. */
/* Ensure we only send from a valid address. */
if ((ifp->if_flags & IFF_LOOPBACK) == 0 &&
(flags & IPV6_FORWARDING) == 0 &&
(error = ip6_ifaddrvalid(&src0, &dst0)) != 0)
{
char ip6buf[INET6_ADDRSTRLEN];
nd6log(LOG_ERR,
"refusing to send from invalid address %s (pid %d)\n",
IN6_PRINT(ip6buf, &src0), curproc->p_pid);
IP6_STATINC(IP6_STAT_ODROPPED);
in6_ifstat_inc(origifp, ifs6_out_discard);
if (error == 1)
/*
* Address exists, but is tentative or detached.
* We can't send from it because it's invalid,
* so we drop the packet.
*/
error = 0;
else
error = EADDRNOTAVAIL;
goto bad;
}
if (rt != NULL && (rt->rt_flags & RTF_GATEWAY) &&
!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
dst = satocsin6(rt->rt_gateway);
else
dst = satocsin6(rtcache_getdst(ro));
/*
* XXXXXX: original code follows:
*/
if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
else {
bool ingroup;
m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
in6_ifstat_inc(ifp, ifs6_out_mcast);
/*
* Confirm that the outgoing interface supports multicast.
*/
if (!(ifp->if_flags & IFF_MULTICAST)) {
IP6_STATINC(IP6_STAT_NOROUTE);
in6_ifstat_inc(ifp, ifs6_out_discard);
error = ENETUNREACH;
goto bad;
}
ingroup = in6_multi_group(&ip6->ip6_dst, ifp);
if (ingroup && (im6o == NULL || im6o->im6o_multicast_loop)) {
/*
* If we belong to the destination multicast group
* on the outgoing interface, and the caller did not
* forbid loopback, loop back a copy.
*/
KASSERT(dst != NULL);
ip6_mloopback(ifp, m, dst);
} else {
/*
* If we are acting as a multicast router, perform
* multicast forwarding as if the packet had just
* arrived on the interface to which we are about
* to send. The multicast forwarding function
* recursively calls this function, using the
* IPV6_FORWARDING flag to prevent infinite recursion.
*
* Multicasts that are looped back by ip6_mloopback(),
* above, will be forwarded by the ip6_input() routine,
* if necessary.
*/
if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
if (ip6_mforward(ip6, ifp, m) != 0) {
m_freem(m);
goto done;
}
}
}
/*
* Multicasts with a hoplimit of zero may be looped back,
* above, but must not be transmitted on a network.
* Also, multicasts addressed to the loopback interface
* are not sent -- the above call to ip6_mloopback() will
* loop back a copy if this host actually belongs to the
* destination group on the loopback interface.
*/
if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
m_freem(m);
goto done;
}
}
/*
* Fill the outgoing interface to tell the upper layer
* to increment per-interface statistics.
*/
if (ifpp)
*ifpp = ifp;
/* Determine path MTU. */
/*
* ro_pmtu represents the final destination while
* ro might represent an intermediate destination.
* Use the ro_pmtu destination since the MTU might differ.
*/
if (ro_pmtu != ro) {
union {
struct sockaddr dst;
struct sockaddr_in6 dst6;
} u;
/* ro_pmtu may not have a cache */
sockaddr_in6_init(&u.dst6, &finaldst, 0, 0, 0);
rt_pmtu = rtcache_lookup(ro_pmtu, &u.dst);
} else
rt_pmtu = rt;
error = ip6_getpmtu(rt_pmtu, ifp, &mtu, &alwaysfrag);
if (rt_pmtu != NULL && rt_pmtu != rt)
rtcache_unref(rt_pmtu, ro_pmtu);
KASSERT(error == 0); /* ip6_getpmtu never fails if ifp is passed */
/*
* The caller of this function may specify to use the minimum MTU
* in some cases.
* An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
* setting. The logic is a bit complicated; by default, unicast
* packets will follow path MTU while multicast packets will be sent at
* the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets
* including unicast ones will be sent at the minimum MTU. Multicast
* packets will always be sent at the minimum MTU unless
* IP6PO_MINMTU_DISABLE is explicitly specified.
* See RFC 3542 for more details.
*/
if (mtu > IPV6_MMTU) {
if ((flags & IPV6_MINMTU))
mtu = IPV6_MMTU;
else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
mtu = IPV6_MMTU;
else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
(opt == NULL ||
opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
mtu = IPV6_MMTU;
}
}
/*
* clear embedded scope identifiers if necessary.
* in6_clearscope will touch the addresses only when necessary.
*/
in6_clearscope(&ip6->ip6_src);
in6_clearscope(&ip6->ip6_dst);
/*
* If the outgoing packet contains a hop-by-hop options header,
* it must be examined and processed even by the source node.
* (RFC 2460, section 4.)
*
* XXX Is this really necessary?
*/
if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
u_int32_t dummy1 = 0; /* XXX unused */
u_int32_t dummy2; /* XXX unused */
int hoff = sizeof(struct ip6_hdr);
if (ip6_hopopts_input(&dummy1, &dummy2, &m, &hoff)) {
/* m was already freed at this point */
error = EINVAL;
goto done;
}
ip6 = mtod(m, struct ip6_hdr *);
}
/*
* Run through list of hooks for output packets.
*/
error = pfil_run_hooks(inet6_pfil_hook, &m, ifp, PFIL_OUT);
if (error != 0 || m == NULL) {
IP6_STATINC(IP6_STAT_PFILDROP_OUT);
goto done;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Send the packet to the outgoing interface.
* If necessary, do IPv6 fragmentation before sending.
*
* the logic here is rather complex:
* 1: normal case (dontfrag == 0, alwaysfrag == 0)
* 1-a: send as is if tlen <= path mtu
* 1-b: fragment if tlen > path mtu
*
* 2: if user asks us not to fragment (dontfrag == 1)
* 2-a: send as is if tlen <= interface mtu
* 2-b: error if tlen > interface mtu
*
* 3: if we always need to attach fragment header (alwaysfrag == 1)
* always fragment
*
* 4: if dontfrag == 1 && alwaysfrag == 1
* error, as we cannot handle this conflicting request
*/
tlen = m->m_pkthdr.len;
tso = (m->m_pkthdr.csum_flags & M_CSUM_TSOv6) != 0;
if (opt && (opt->ip6po_flags & IP6PO_DONTFRAG))
dontfrag = 1;
else
dontfrag = 0;
if (dontfrag && alwaysfrag) { /* case 4 */
/* conflicting request - can't transmit */
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
goto bad;
}
if (dontfrag && (!tso && tlen > ifp->if_mtu)) { /* case 2-b */
/*
* Even if the DONTFRAG option is specified, we cannot send the
* packet when the data length is larger than the MTU of the
* outgoing interface.
* Notify the error by sending IPV6_PATHMTU ancillary data as
* well as returning an error code (the latter is not described
* in the API spec.)
*/
u_int32_t mtu32;
struct ip6ctlparam ip6cp;
mtu32 = (u_int32_t)mtu;
memset(&ip6cp, 0, sizeof(ip6cp));
ip6cp.ip6c_cmdarg = (void *)&mtu32;
pfctlinput2(PRC_MSGSIZE,
rtcache_getdst(ro_pmtu), &ip6cp);
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
goto bad;
}
/*
* transmit packet without fragmentation
*/
if (dontfrag || (!alwaysfrag && (tlen <= mtu || tso))) {
/* case 1-a and 2-a */
struct in6_ifaddr *ia6;
int sw_csum;
int s;
ip6 = mtod(m, struct ip6_hdr *);
s = pserialize_read_enter();
ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
if (ia6) {
/* Record statistics for this interface address. */
ia6->ia_ifa.ifa_data.ifad_outbytes += m->m_pkthdr.len;
}
pserialize_read_exit(s);
sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx;
if ((sw_csum & (M_CSUM_UDPv6|M_CSUM_TCPv6)) != 0) {
if (IN6_NEED_CHECKSUM(ifp,
sw_csum & (M_CSUM_UDPv6|M_CSUM_TCPv6))) {
in6_undefer_cksum_tcpudp(m);
}
m->m_pkthdr.csum_flags &= ~(M_CSUM_UDPv6|M_CSUM_TCPv6);
}
KASSERT(dst != NULL);
if (__predict_false(sw_csum & M_CSUM_TSOv6)) {
/*
* TSO6 is required by a packet, but disabled for
* the interface.
*/
error = ip6_tso_output(ifp, origifp, m, dst, rt);
} else
error = ip6_if_output(ifp, origifp, m, dst, rt);
goto done;
}
if (tso) {
IP6_STATINC(IP6_STAT_CANTFRAG); /* XXX */
error = EINVAL; /* XXX */
goto bad;
}
/*
* try to fragment the packet. case 1-b and 3
*/
if (mtu < IPV6_MMTU) {
/* path MTU cannot be less than IPV6_MMTU */
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
in6_ifstat_inc(ifp, ifs6_out_fragfail);
goto bad;
} else if (ip6->ip6_plen == 0) {
/* jumbo payload cannot be fragmented */
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
in6_ifstat_inc(ifp, ifs6_out_fragfail);
goto bad;
} else {
const uint32_t id = ip6_randomid();
struct mbuf **mnext, *m_frgpart;
const int hlen = unfragpartlen;
struct ip6_frag *ip6f;
u_char nextproto;
if (mtu > IPV6_MAXPACKET)
mtu = IPV6_MAXPACKET;
/*
* Must be able to put at least 8 bytes per fragment.
*/
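/*
* Round the per-fragment payload down to a multiple of 8, since
* fragment offsets are expressed in 8-byte units.
*/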
len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
if (len < 8) {
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
in6_ifstat_inc(ifp, ifs6_out_fragfail);
goto bad;
}
mnext = &m->m_nextpkt;
/*
* Change the next header field of the last header in the
* unfragmentable part.
*/
if (exthdrs.ip6e_rthdr) {
nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
} else if (exthdrs.ip6e_dest1) {
nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
} else if (exthdrs.ip6e_hbh) {
nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
} else {
nextproto = ip6->ip6_nxt;
ip6->ip6_nxt = IPPROTO_FRAGMENT;
}
if ((m->m_pkthdr.csum_flags & (M_CSUM_UDPv6|M_CSUM_TCPv6))
!= 0) {
if (IN6_NEED_CHECKSUM(ifp,
m->m_pkthdr.csum_flags &
(M_CSUM_UDPv6|M_CSUM_TCPv6))) {
in6_undefer_cksum_tcpudp(m);
}
m->m_pkthdr.csum_flags &= ~(M_CSUM_UDPv6|M_CSUM_TCPv6);
}
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto
* chain.
*/
m0 = m;
for (off = hlen; off < tlen; off += len) {
struct mbuf *mlast;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (!m) {
error = ENOBUFS;
IP6_STATINC(IP6_STAT_ODROPPED);
goto sendorfree;
}
m_reset_rcvif(m);
m->m_flags = m0->m_flags & M_COPYFLAGS;
*mnext = m;
mnext = &m->m_nextpkt;
m->m_data += max_linkhdr;
mhip6 = mtod(m, struct ip6_hdr *);
*mhip6 = *ip6;
m->m_len = sizeof(*mhip6);
ip6f = NULL;
error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
if (error) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto sendorfree;
}
/* Fill in the Frag6 Header */
ip6f->ip6f_offlg = htons((u_int16_t)((off - hlen) & ~7));
if (off + len >= tlen)
len = tlen - off;
else
ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
ip6f->ip6f_reserved = 0;
ip6f->ip6f_ident = id;
ip6f->ip6f_nxt = nextproto;
mhip6->ip6_plen = htons((u_int16_t)(len + hlen +
sizeof(*ip6f) - sizeof(struct ip6_hdr)));
if ((m_frgpart = m_copym(m0, off, len, M_DONTWAIT)) == NULL) {
error = ENOBUFS;
IP6_STATINC(IP6_STAT_ODROPPED);
goto sendorfree;
}
for (mlast = m; mlast->m_next; mlast = mlast->m_next)
;
mlast->m_next = m_frgpart;
m->m_pkthdr.len = len + hlen + sizeof(*ip6f);
m_reset_rcvif(m);
IP6_STATINC(IP6_STAT_OFRAGMENTS);
in6_ifstat_inc(ifp, ifs6_out_fragcreat);
}
in6_ifstat_inc(ifp, ifs6_out_fragok);
}
sendorfree:
m = m0->m_nextpkt;
m0->m_nextpkt = 0;
m_freem(m0);
for (m0 = m; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = 0;
if (error == 0) {
struct in6_ifaddr *ia6;
int s;
ip6 = mtod(m, struct ip6_hdr *);
s = pserialize_read_enter();
ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
if (ia6) {
/*
* Record statistics for this interface
* address.
*/
ia6->ia_ifa.ifa_data.ifad_outbytes +=
m->m_pkthdr.len;
}
pserialize_read_exit(s);
KASSERT(dst != NULL);
error = ip6_if_output(ifp, origifp, m, dst, rt);
} else
m_freem(m);
}
if (error == 0)
IP6_STATINC(IP6_STAT_FRAGMENTED);
done:
rtcache_unref(rt, ro);
if (ro == &ip6route)
rtcache_free(&ip6route);
#ifdef IPSEC
if (sp != NULL)
KEY_SP_UNREF(&sp);
#endif
if_put(ifp, &psref);
if (release_psref_ia)
if_put(origifp, &psref_ia);
curlwp_bindx(bound);
return error;
freehdrs:
m_freem(exthdrs.ip6e_hbh);
m_freem(exthdrs.ip6e_dest1);
m_freem(exthdrs.ip6e_rthdr);
m_freem(exthdrs.ip6e_dest2);
/* FALLTHROUGH */
bad:
m_freem(m);
goto done;
badscope:
IP6_STATINC(IP6_STAT_BADSCOPE);
in6_ifstat_inc(origifp, ifs6_out_discard);
if (error == 0)
error = EHOSTUNREACH; /* XXX */
goto bad;
}
static int
ip6_copyexthdr(struct mbuf **mp, void *hdr, int hlen)
{
struct mbuf *m;
if (hlen > MCLBYTES)
return ENOBUFS; /* XXX */
MGET(m, M_DONTWAIT, MT_DATA);
if (!m)
return ENOBUFS;
if (hlen > MLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return ENOBUFS;
}
}
m->m_len = hlen;
if (hdr)
memcpy(mtod(m, void *), hdr, hlen);
*mp = m;
return 0;
}
/*
* Insert jumbo payload option.
*/
static int
ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
{
struct mbuf *mopt;
u_int8_t *optbuf;
u_int32_t v;
#define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */
/*
* If there is no hop-by-hop options header, allocate new one.
* If there is one but it doesn't have enough space to store the
* jumbo payload option, allocate a cluster to store the whole options.
* Otherwise, use it to store the options.
*/
if (exthdrs->ip6e_hbh == NULL) {
MGET(mopt, M_DONTWAIT, MT_DATA);
if (mopt == 0)
return (ENOBUFS);
mopt->m_len = JUMBOOPTLEN;
optbuf = mtod(mopt, u_int8_t *);
optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */
exthdrs->ip6e_hbh = mopt;
} else {
struct ip6_hbh *hbh;
mopt = exthdrs->ip6e_hbh;
if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
const int oldoptlen = mopt->m_len;
struct mbuf *n;
/*
* Assumptions:
* - exthdrs->ip6e_hbh is not referenced from places
* other than exthdrs.
* - exthdrs->ip6e_hbh is not an mbuf chain.
*/
KASSERT(mopt->m_next == NULL);
/*
* Give up if the whole (new) hbh header does not fit
* even in an mbuf cluster.
*/
if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
return ENOBUFS;
/*
* At this point, we must always prepare a cluster.
*/
MGET(n, M_DONTWAIT, MT_DATA);
if (n) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_freem(n);
n = NULL;
}
}
if (!n)
return ENOBUFS;
n->m_len = oldoptlen + JUMBOOPTLEN;
bcopy(mtod(mopt, void *), mtod(n, void *),
oldoptlen);
optbuf = mtod(n, u_int8_t *) + oldoptlen;
m_freem(mopt);
mopt = exthdrs->ip6e_hbh = n;
} else {
optbuf = mtod(mopt, u_int8_t *) + mopt->m_len;
mopt->m_len += JUMBOOPTLEN;
}
optbuf[0] = IP6OPT_PADN;
optbuf[1] = 0;
/*
* Adjust the header length according to the pad and
* the jumbo payload option.
*/
hbh = mtod(mopt, struct ip6_hbh *);
hbh->ip6h_len += (JUMBOOPTLEN >> 3);
}
/* fill in the option. */
optbuf[2] = IP6OPT_JUMBO;
optbuf[3] = 4;
v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
memcpy(&optbuf[4], &v, sizeof(u_int32_t));
/* finally, adjust the packet header length */
exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
return 0;
#undef JUMBOOPTLEN
}
/*
* Insert fragment header and copy unfragmentable header portions.
*
* *frghdrp will not be read, and it is guaranteed that either an
* error is returned or that *frghdrp will point to space allocated
* for the fragment header.
*
* On entry, m contains:
* IPv6 Header
* On exit, it contains:
* IPv6 Header -> Unfragmentable Part -> Frag6 Header
*/
static int
ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
struct ip6_frag **frghdrp)
{
struct mbuf *n, *mlast;
if (hlen > sizeof(struct ip6_hdr)) {
n = m_copym(m0, sizeof(struct ip6_hdr),
hlen - sizeof(struct ip6_hdr), M_DONTWAIT);
if (n == NULL)
return ENOBUFS;
m->m_next = n;
} else
n = m;
/* Search for the last mbuf of unfragmentable part. */
for (mlast = n; mlast->m_next; mlast = mlast->m_next)
;
if ((mlast->m_flags & M_EXT) == 0 &&
M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
/* use the trailing space of the last mbuf for the fragment hdr */
*frghdrp = (struct ip6_frag *)(mtod(mlast, char *) +
mlast->m_len);
mlast->m_len += sizeof(struct ip6_frag);
} else {
/* allocate a new mbuf for the fragment header */
struct mbuf *mfrg;
MGET(mfrg, M_DONTWAIT, MT_DATA);
if (mfrg == NULL)
return ENOBUFS;
mfrg->m_len = sizeof(struct ip6_frag);
*frghdrp = mtod(mfrg, struct ip6_frag *);
mlast->m_next = mfrg;
}
return 0;
}
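/*
* Determine the path MTU to use: prefer the MTU recorded on the route,
* fall back to the interface MTU, and set *alwaysfragp when the recorded
* path MTU is below IPV6_MMTU so that a fragment header is always
* attached (RFC 2460 section 5).
*/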
static int
ip6_getpmtu(struct rtentry *rt, struct ifnet *ifp, u_long *mtup,
int *alwaysfragp)
{
u_int32_t mtu = 0;
int alwaysfrag = 0;
int error = 0;
if (rt != NULL) {
if (ifp == NULL)
ifp = rt->rt_ifp;
mtu = rt->rt_rmx.rmx_mtu;
if (mtu == 0)
mtu = ifp->if_mtu;
else if (mtu < IPV6_MMTU) {
/*
* RFC2460 section 5, last paragraph:
* if we record ICMPv6 too big message with
* mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
* or smaller, with fragment header attached.
* (a fragment header is needed regardless of the
* packet size, for translators to identify packets)
*/
alwaysfrag = 1;
mtu = IPV6_MMTU;
} else if (mtu > ifp->if_mtu) {
/*
* The MTU on the route is larger than the MTU on
* the interface! This shouldn't happen, unless the
* MTU of the interface has been changed after the
* interface was brought up. Change the MTU in the
* route to match the interface MTU (as long as the
* field isn't locked).
*/
mtu = ifp->if_mtu;
if (!(rt->rt_rmx.rmx_locks & RTV_MTU))
rt->rt_rmx.rmx_mtu = mtu;
}
} else if (ifp) {
mtu = ifp->if_mtu;
} else
error = EHOSTUNREACH; /* XXX */
*mtup = mtu;
if (alwaysfragp)
*alwaysfragp = alwaysfrag;
return (error);
}
/*
* IP6 socket option processing.
*/
int
ip6_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int optdatalen, uproto;
void *optdata;
struct inpcb *inp = sotoinpcb(so);
struct ip_moptions **mopts;
int error, optval;
int level, optname;
KASSERT(solocked(so));
KASSERT(sopt != NULL);
level = sopt->sopt_level;
optname = sopt->sopt_name;
error = optval = 0;
uproto = (int)so->so_proto->pr_protocol;
switch (level) {
case IPPROTO_IP:
switch (optname) {
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
case IP_MULTICAST_IF:
case IP_MULTICAST_LOOP:
case IP_MULTICAST_TTL:
mopts = &inp->inp_moptions;
switch (op) {
case PRCO_GETOPT:
return ip_getmoptions(*mopts, sopt);
case PRCO_SETOPT:
return ip_setmoptions(mopts, sopt);
default:
return EINVAL;
}
default:
return ENOPROTOOPT;
}
case IPPROTO_IPV6:
break;
default:
return ENOPROTOOPT;
}
switch (op) {
case PRCO_SETOPT:
switch (optname) {
#ifdef RFC2292
case IPV6_2292PKTOPTIONS:
error = ip6_pcbopts(&in6p_outputopts(inp), so, sopt);
break;
#endif
/*
* Use of some Hop-by-Hop options or some
* Destination options, might require special
* privilege. That is, normal applications
* (without special privilege) might be forbidden
* from setting certain options in outgoing packets,
* and might never see certain options in received
* packets. [RFC 2292 Section 6]
* KAME specific note:
* KAME prevents non-privileged users from sending or
* receiving ANY hbh/dst options in order to avoid
* overhead of parsing options in the kernel.
*/
case IPV6_RECVHOPOPTS:
case IPV6_RECVDSTOPTS:
case IPV6_RECVRTHDRDSTOPTS:
error = kauth_authorize_network(
kauth_cred_get(),
KAUTH_NETWORK_IPV6, KAUTH_REQ_NETWORK_IPV6_HOPBYHOP,
NULL, NULL, NULL);
if (error)
break;
/* FALLTHROUGH */
case IPV6_UNICAST_HOPS:
case IPV6_HOPLIMIT:
case IPV6_FAITH:
case IPV6_RECVPKTINFO:
case IPV6_RECVHOPLIMIT:
case IPV6_RECVRTHDR:
case IPV6_RECVPATHMTU:
case IPV6_RECVTCLASS:
case IPV6_V6ONLY:
case IPV6_BINDANY:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (optname) {
case IPV6_UNICAST_HOPS:
if (optval < -1 || optval >= 256)
error = EINVAL;
else {
/* -1 = kernel default */
in6p_hops6(inp) = optval;
}
break;
#define OPTSET(bit) \
do { \
if (optval) \
inp->inp_flags |= (bit); \
else \
inp->inp_flags &= ~(bit); \
} while (/*CONSTCOND*/ 0)
#ifdef RFC2292
#define OPTSET2292(bit) \
do { \
inp->inp_flags |= IN6P_RFC2292; \
if (optval) \
inp->inp_flags |= (bit); \
else \
inp->inp_flags &= ~(bit); \
} while (/*CONSTCOND*/ 0)
#endif
#define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0)
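/*
* OPTSET sets or clears the given IN6P_* flag according to optval;
* OPTSET2292 does the same but also marks the pcb as using the RFC 2292
* API; OPTBIT tests whether a flag is set.
*/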
case IPV6_RECVPKTINFO:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_PKTINFO);
break;
case IPV6_HOPLIMIT:
{
struct ip6_pktopts **optp;
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
optp = &in6p_outputopts(inp);
error = ip6_pcbopt(IPV6_HOPLIMIT,
(u_char *)&optval,
sizeof(optval),
optp,
kauth_cred_get(), uproto);
break;
}
case IPV6_RECVHOPLIMIT:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_HOPLIMIT);
break;
case IPV6_RECVHOPOPTS:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_HOPOPTS);
break;
case IPV6_RECVDSTOPTS:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_DSTOPTS);
break;
case IPV6_RECVRTHDRDSTOPTS:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_RTHDRDSTOPTS);
break;
case IPV6_RECVRTHDR:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_RTHDR);
break;
case IPV6_FAITH:
OPTSET(IN6P_FAITH);
break;
case IPV6_RECVPATHMTU:
/*
* We ignore this option for TCP
* sockets.
* (RFC3542 leaves this case
* unspecified.)
*/
if (uproto != IPPROTO_TCP)
OPTSET(IN6P_MTU);
break;
case IPV6_V6ONLY:
/*
* make setsockopt(IPV6_V6ONLY)
* available only prior to bind(2).
* see ipng mailing list, Jun 22 2001.
*/
if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) {
error = EINVAL;
break;
}
#ifdef INET6_BINDV6ONLY
if (!optval)
error = EINVAL;
#else
OPTSET(IN6P_IPV6_V6ONLY);
#endif
break;
case IPV6_RECVTCLASS:
#ifdef RFC2292
/* cannot mix with RFC2292 XXX */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_TCLASS);
break;
case IPV6_BINDANY:
error = kauth_authorize_network(
kauth_cred_get(), KAUTH_NETWORK_BIND,
KAUTH_REQ_NETWORK_BIND_ANYADDR, so, NULL,
NULL);
if (error)
break;
OPTSET(IN6P_BINDANY);
break;
}
break;
case IPV6_OTCLASS:
{
struct ip6_pktopts **optp;
u_int8_t tclass;
error = sockopt_get(sopt, &tclass, sizeof(tclass));
if (error)
break;
optp = &in6p_outputopts(inp);
error = ip6_pcbopt(optname,
(u_char *)&tclass,
sizeof(tclass),
optp,
kauth_cred_get(), uproto);
break;
}
case IPV6_TCLASS:
case IPV6_DONTFRAG:
case IPV6_USE_MIN_MTU:
case IPV6_PREFER_TEMPADDR:
error = sockopt_getint(sopt, &optval);
if (error)
break;
{
struct ip6_pktopts **optp;
optp = &in6p_outputopts(inp);
error = ip6_pcbopt(optname,
(u_char *)&optval,
sizeof(optval),
optp,
kauth_cred_get(), uproto);
break;
}
#ifdef RFC2292
case IPV6_2292PKTINFO:
case IPV6_2292HOPLIMIT:
case IPV6_2292HOPOPTS:
case IPV6_2292DSTOPTS:
case IPV6_2292RTHDR:
/* RFC 2292 */
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (optname) {
case IPV6_2292PKTINFO:
OPTSET2292(IN6P_PKTINFO);
break;
case IPV6_2292HOPLIMIT:
OPTSET2292(IN6P_HOPLIMIT);
break;
case IPV6_2292HOPOPTS:
/*
* Check super-user privilege.
* See comments for IPV6_RECVHOPOPTS.
*/
error = kauth_authorize_network(
kauth_cred_get(),
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL,
NULL, NULL);
if (error)
return (error);
OPTSET2292(IN6P_HOPOPTS);
break;
case IPV6_2292DSTOPTS:
error = kauth_authorize_network(
kauth_cred_get(),
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL,
NULL, NULL);
if (error)
return (error);
OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
break;
case IPV6_2292RTHDR:
OPTSET2292(IN6P_RTHDR);
break;
}
break;
#endif
case IPV6_PKTINFO:
case IPV6_HOPOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_NEXTHOP: {
/* new advanced API (RFC3542) */
void *optbuf;
int optbuflen;
struct ip6_pktopts **optp;
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
optbuflen = sopt->sopt_size;
optbuf = malloc(optbuflen, M_IP6OPT, M_NOWAIT);
if (optbuf == NULL) {
error = ENOBUFS;
break;
}
error = sockopt_get(sopt, optbuf, optbuflen);
if (error) {
free(optbuf, M_IP6OPT);
break;
}
optp = &in6p_outputopts(inp);
error = ip6_pcbopt(optname, optbuf, optbuflen,
optp, kauth_cred_get(), uproto);
free(optbuf, M_IP6OPT);
break;
}
#undef OPTSET
case IPV6_MULTICAST_IF:
case IPV6_MULTICAST_HOPS:
case IPV6_MULTICAST_LOOP:
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
error = ip6_setmoptions(sopt, inp);
break;
case IPV6_PORTRANGE:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (optval) {
case IPV6_PORTRANGE_DEFAULT:
inp->inp_flags &= ~(IN6P_LOWPORT);
inp->inp_flags &= ~(IN6P_HIGHPORT);
break;
case IPV6_PORTRANGE_HIGH:
inp->inp_flags &= ~(IN6P_LOWPORT);
inp->inp_flags |= IN6P_HIGHPORT;
break;
case IPV6_PORTRANGE_LOW:
inp->inp_flags &= ~(IN6P_HIGHPORT);
inp->inp_flags |= IN6P_LOWPORT;
break;
default:
error = EINVAL;
break;
}
break;
case IPV6_PORTALGO:
error = sockopt_getint(sopt, &optval);
if (error)
break;
error = portalgo_algo_index_select(inp, optval);
break;
#if defined(IPSEC)
case IPV6_IPSEC_POLICY:
if (ipsec_enabled) {
error = ipsec_set_policy(inp,
sopt->sopt_data, sopt->sopt_size,
kauth_cred_get());
} else
error = ENOPROTOOPT;
break;
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
switch (optname) {
#ifdef RFC2292
case IPV6_2292PKTOPTIONS:
/*
* RFC3542 (effectively) deprecated the
* semantics of the 2292-style pktoptions.
* Since it was not reliable in nature (i.e.,
* applications had to expect the lack of some
* information after all), it would make sense
* to simplify this part by always returning
* empty data.
*/
break;
#endif
case IPV6_RECVHOPOPTS:
case IPV6_RECVDSTOPTS:
case IPV6_RECVRTHDRDSTOPTS:
case IPV6_UNICAST_HOPS:
case IPV6_RECVPKTINFO:
case IPV6_RECVHOPLIMIT:
case IPV6_RECVRTHDR:
case IPV6_RECVPATHMTU:
case IPV6_FAITH:
case IPV6_V6ONLY:
case IPV6_PORTRANGE:
case IPV6_RECVTCLASS:
case IPV6_BINDANY:
switch (optname) {
case IPV6_RECVHOPOPTS:
optval = OPTBIT(IN6P_HOPOPTS);
break;
case IPV6_RECVDSTOPTS:
optval = OPTBIT(IN6P_DSTOPTS);
break;
case IPV6_RECVRTHDRDSTOPTS:
optval = OPTBIT(IN6P_RTHDRDSTOPTS);
break;
case IPV6_UNICAST_HOPS:
optval = in6p_hops6(inp);
break;
case IPV6_RECVPKTINFO:
optval = OPTBIT(IN6P_PKTINFO);
break;
case IPV6_RECVHOPLIMIT:
optval = OPTBIT(IN6P_HOPLIMIT);
break;
case IPV6_RECVRTHDR:
optval = OPTBIT(IN6P_RTHDR);
break;
case IPV6_RECVPATHMTU:
optval = OPTBIT(IN6P_MTU);
break;
case IPV6_FAITH:
optval = OPTBIT(IN6P_FAITH);
break;
case IPV6_V6ONLY:
optval = OPTBIT(IN6P_IPV6_V6ONLY);
break;
case IPV6_PORTRANGE:
{
int flags;
flags = inp->inp_flags;
if (flags & IN6P_HIGHPORT)
optval = IPV6_PORTRANGE_HIGH;
else if (flags & IN6P_LOWPORT)
optval = IPV6_PORTRANGE_LOW;
else
optval = 0;
break;
}
case IPV6_RECVTCLASS:
optval = OPTBIT(IN6P_TCLASS);
break;
case IPV6_BINDANY:
optval = OPTBIT(IN6P_BINDANY);
break;
}
if (error)
break;
error = sockopt_setint(sopt, optval);
break;
case IPV6_PATHMTU:
{
u_long pmtu = 0;
struct ip6_mtuinfo mtuinfo;
struct route *ro = &inp->inp_route;
struct rtentry *rt;
union {
struct sockaddr dst;
struct sockaddr_in6 dst6;
} u;
if (!(so->so_state & SS_ISCONNECTED))
return (ENOTCONN);
/*
* XXX: we do not consider the case of source
* routing, or optional information to specify
* the outgoing interface.
*/
sockaddr_in6_init(&u.dst6, &in6p_faddr(inp), 0, 0, 0);
rt = rtcache_lookup(ro, &u.dst);
error = ip6_getpmtu(rt, NULL, &pmtu, NULL);
rtcache_unref(rt, ro);
if (error)
break;
if (pmtu > IPV6_MAXPACKET)
pmtu = IPV6_MAXPACKET;
memset(&mtuinfo, 0, sizeof(mtuinfo));
mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
optdata = (void *)&mtuinfo;
optdatalen = sizeof(mtuinfo);
if (optdatalen > MCLBYTES)
return (EMSGSIZE); /* XXX */
error = sockopt_set(sopt, optdata, optdatalen);
break;
}
#ifdef RFC2292
case IPV6_2292PKTINFO:
case IPV6_2292HOPLIMIT:
case IPV6_2292HOPOPTS:
case IPV6_2292RTHDR:
case IPV6_2292DSTOPTS:
switch (optname) {
case IPV6_2292PKTINFO:
optval = OPTBIT(IN6P_PKTINFO);
break;
case IPV6_2292HOPLIMIT:
optval = OPTBIT(IN6P_HOPLIMIT);
break;
case IPV6_2292HOPOPTS:
optval = OPTBIT(IN6P_HOPOPTS);
break;
case IPV6_2292RTHDR:
optval = OPTBIT(IN6P_RTHDR);
break;
case IPV6_2292DSTOPTS:
optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
break;
}
error = sockopt_setint(sopt, optval);
break;
#endif
case IPV6_PKTINFO:
case IPV6_HOPOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_NEXTHOP:
case IPV6_OTCLASS:
case IPV6_TCLASS:
case IPV6_DONTFRAG:
case IPV6_USE_MIN_MTU:
case IPV6_PREFER_TEMPADDR:
error = ip6_getpcbopt(in6p_outputopts(inp),
optname, sopt);
break;
case IPV6_MULTICAST_IF:
case IPV6_MULTICAST_HOPS:
case IPV6_MULTICAST_LOOP:
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
error = ip6_getmoptions(sopt, inp);
break;
case IPV6_PORTALGO:
optval = inp->inp_portalgo;
error = sockopt_setint(sopt, optval);
break;
#if defined(IPSEC)
case IPV6_IPSEC_POLICY:
if (ipsec_used) {
struct mbuf *m = NULL;
/*
* XXX: this will return EINVAL as sopt is
* empty
*/
error = ipsec_get_policy(inp, sopt->sopt_data,
sopt->sopt_size, &m);
			if (!error)
				error = sockopt_setmbuf(sopt, m);
} else
error = ENOPROTOOPT;
break;
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
}
return (error);
}
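/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * the PRCO_SETOPT and PRCO_GETOPT paths above are reached through
 * setsockopt(2) and getsockopt(2) at the IPPROTO_IPV6 level.  The
 * descriptor "s" is an assumed AF_INET6 socket; the option names match
 * the cases handled above.
 *
 *	int range = IPV6_PORTRANGE_HIGH;
 *	if (setsockopt(s, IPPROTO_IPV6, IPV6_PORTRANGE,
 *	    &range, sizeof(range)) == -1)
 *		err(1, "IPV6_PORTRANGE");
 *
 *	int v6only;
 *	socklen_t len = sizeof(v6only);
 *	if (getsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, &len) == -1)
 *		err(1, "IPV6_V6ONLY");
 */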
int
ip6_raw_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int error = 0, optval;
const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
struct inpcb *inp = sotoinpcb(so);
int level, optname;
KASSERT(sopt != NULL);
level = sopt->sopt_level;
optname = sopt->sopt_name;
if (level != IPPROTO_IPV6) {
return ENOPROTOOPT;
}
switch (optname) {
case IPV6_CHECKSUM:
/*
		 * For ICMPv6 sockets, no modification is allowed for the
		 * checksum offset; permit "no change" values to help
		 * existing apps.
*
* XXX RFC3542 says: "An attempt to set IPV6_CHECKSUM
* for an ICMPv6 socket will fail." The current
* behavior does not meet RFC3542.
*/
switch (op) {
case PRCO_SETOPT:
error = sockopt_getint(sopt, &optval);
if (error)
break;
if (optval < -1 ||
(optval > 0 && (optval % 2) != 0)) {
/*
* The API assumes non-negative even offset
* values or -1 as a special value.
*/
error = EINVAL;
} else if (so->so_proto->pr_protocol ==
IPPROTO_ICMPV6) {
if (optval != icmp6off)
error = EINVAL;
} else
in6p_cksum(inp) = optval;
break;
case PRCO_GETOPT:
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
optval = icmp6off;
else
optval = in6p_cksum(inp);
error = sockopt_setint(sopt, optval);
break;
default:
error = EINVAL;
break;
}
break;
default:
error = ENOPROTOOPT;
break;
}
return (error);
}
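/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * IPV6_CHECKSUM is typically used on raw sockets for protocols other than
 * ICMPv6, so that the kernel fills in the checksum at the given even
 * offset.  The protocol number (89, OSPFv3) and offset 12 (the OSPFv3
 * checksum field) are only assumed example values.
 *
 *	int s = socket(AF_INET6, SOCK_RAW, 89);
 *	int off = 12;
 *	if (setsockopt(s, IPPROTO_IPV6, IPV6_CHECKSUM, &off, sizeof(off)) == -1)
 *		err(1, "IPV6_CHECKSUM");
 */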
#ifdef RFC2292
/*
* Set up IP6 options in pcb for insertion in output packets or
* specifying behavior of outgoing packets.
*/
static int
ip6_pcbopts(struct ip6_pktopts **pktopt, struct socket *so,
struct sockopt *sopt)
{
struct ip6_pktopts *opt = *pktopt;
struct mbuf *m;
int error = 0;
KASSERT(solocked(so));
/* turn off any old options. */
if (opt) {
#ifdef DIAGNOSTIC
		if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
		    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
opt->ip6po_rhinfo.ip6po_rhi_rthdr)
printf("ip6_pcbopts: all specified options are cleared.\n");
#endif
ip6_clearpktopts(opt, -1);
} else {
opt = malloc(sizeof(*opt), M_IP6OPT, M_NOWAIT);
if (opt == NULL)
return (ENOBUFS);
}
*pktopt = NULL;
if (sopt == NULL || sopt->sopt_size == 0) {
/*
		 * We are only turning off any previous options, regardless
		 * of whether opt was just allocated above or passed in.
*/
free(opt, M_IP6OPT);
return (0);
}
/* set options specified by user. */
m = sockopt_getmbuf(sopt);
if (m == NULL) {
free(opt, M_IP6OPT);
return (ENOBUFS);
}
error = ip6_setpktopts(m, opt, NULL, kauth_cred_get(),
so->so_proto->pr_protocol);
m_freem(m);
if (error != 0) {
ip6_clearpktopts(opt, -1); /* XXX: discard all options */
free(opt, M_IP6OPT);
return (error);
}
*pktopt = opt;
return (0);
}
#endif
/*
* initialize ip6_pktopts. beware that there are non-zero default values in
* the struct.
*/
void
ip6_initpktopts(struct ip6_pktopts *opt)
{
memset(opt, 0, sizeof(*opt));
opt->ip6po_hlim = -1; /* -1 means default hop limit */
opt->ip6po_tclass = -1; /* -1 means default traffic class */
opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
}
#define sin6tosa(sin6) ((struct sockaddr *)(sin6)) /* XXX */
static int
ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
kauth_cred_t cred, int uproto)
{
struct ip6_pktopts *opt;
if (*pktopt == NULL) {
*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
M_NOWAIT);
if (*pktopt == NULL)
return (ENOBUFS);
ip6_initpktopts(*pktopt);
}
opt = *pktopt;
return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto));
}
static int
ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt)
{
void *optdata = NULL;
int optdatalen = 0;
struct ip6_ext *ip6e;
int error = 0;
struct in6_pktinfo null_pktinfo;
int deftclass = 0, on;
int defminmtu = IP6PO_MINMTU_MCASTONLY;
int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
switch (optname) {
case IPV6_PKTINFO:
		if (pktopt && pktopt->ip6po_pktinfo)
			optdata = (void *)pktopt->ip6po_pktinfo;
else {
/* XXX: we don't have to do this every time... */
memset(&null_pktinfo, 0, sizeof(null_pktinfo));
optdata = (void *)&null_pktinfo;
}
optdatalen = sizeof(struct in6_pktinfo);
break;
case IPV6_OTCLASS:
/* XXX */
return (EINVAL);
case IPV6_TCLASS:
if (pktopt && pktopt->ip6po_tclass >= 0)
optdata = (void *)&pktopt->ip6po_tclass;
else
optdata = (void *)&deftclass;
optdatalen = sizeof(int);
break;
case IPV6_HOPOPTS:
		if (pktopt && pktopt->ip6po_hbh) {
			optdata = (void *)pktopt->ip6po_hbh;
ip6e = (struct ip6_ext *)pktopt->ip6po_hbh;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_RTHDR:
		if (pktopt && pktopt->ip6po_rthdr) {
			optdata = (void *)pktopt->ip6po_rthdr;
ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_RTHDRDSTOPTS:
		if (pktopt && pktopt->ip6po_dest1) {
			optdata = (void *)pktopt->ip6po_dest1;
ip6e = (struct ip6_ext *)pktopt->ip6po_dest1;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_DSTOPTS:
		if (pktopt && pktopt->ip6po_dest2) {
			optdata = (void *)pktopt->ip6po_dest2;
ip6e = (struct ip6_ext *)pktopt->ip6po_dest2;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_NEXTHOP:
		if (pktopt && pktopt->ip6po_nexthop) {
			optdata = (void *)pktopt->ip6po_nexthop;
optdatalen = pktopt->ip6po_nexthop->sa_len;
}
break;
case IPV6_USE_MIN_MTU:
if (pktopt)
optdata = (void *)&pktopt->ip6po_minmtu;
else
optdata = (void *)&defminmtu;
optdatalen = sizeof(int);
break;
case IPV6_DONTFRAG:
if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
on = 1;
else
on = 0;
optdata = (void *)&on;
optdatalen = sizeof(on);
break;
case IPV6_PREFER_TEMPADDR:
if (pktopt)
optdata = (void *)&pktopt->ip6po_prefer_tempaddr;
else
optdata = (void *)&defpreftemp;
optdatalen = sizeof(int);
break;
default: /* should not happen */
#ifdef DIAGNOSTIC
panic("ip6_getpcbopt: unexpected option\n");
#endif
return (ENOPROTOOPT);
}
error = sockopt_set(sopt, optdata, optdatalen);
return (error);
}
void
ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
{
	if (optname == -1 || optname == IPV6_PKTINFO) {
		if (pktopt->ip6po_pktinfo)
			free(pktopt->ip6po_pktinfo, M_IP6OPT);
		pktopt->ip6po_pktinfo = NULL;
	}
	if (optname == -1 || optname == IPV6_HOPLIMIT)
		pktopt->ip6po_hlim = -1;
	if (optname == -1 || optname == IPV6_TCLASS)
		pktopt->ip6po_tclass = -1;
	if (optname == -1 || optname == IPV6_NEXTHOP) {
		rtcache_free(&pktopt->ip6po_nextroute);
		if (pktopt->ip6po_nexthop)
			free(pktopt->ip6po_nexthop, M_IP6OPT);
		pktopt->ip6po_nexthop = NULL;
	}
	if (optname == -1 || optname == IPV6_HOPOPTS) {
		if (pktopt->ip6po_hbh)
			free(pktopt->ip6po_hbh, M_IP6OPT);
		pktopt->ip6po_hbh = NULL;
	}
	if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
		if (pktopt->ip6po_dest1)
			free(pktopt->ip6po_dest1, M_IP6OPT);
		pktopt->ip6po_dest1 = NULL;
	}
	if (optname == -1 || optname == IPV6_RTHDR) {
		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
		rtcache_free(&pktopt->ip6po_route);
	}
	if (optname == -1 || optname == IPV6_DSTOPTS) {
		if (pktopt->ip6po_dest2)
			free(pktopt->ip6po_dest2, M_IP6OPT);
		pktopt->ip6po_dest2 = NULL;
	}
}
#define PKTOPT_EXTHDRCPY(type) \
do { \
if (src->type) { \
int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
dst->type = malloc(hlen, M_IP6OPT, canwait); \
if (dst->type == NULL) \
goto bad; \
memcpy(dst->type, src->type, hlen); \
} \
} while (/*CONSTCOND*/ 0)
static int
copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
{
dst->ip6po_hlim = src->ip6po_hlim;
dst->ip6po_tclass = src->ip6po_tclass;
dst->ip6po_flags = src->ip6po_flags;
dst->ip6po_minmtu = src->ip6po_minmtu;
dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
if (src->ip6po_pktinfo) {
dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
M_IP6OPT, canwait);
if (dst->ip6po_pktinfo == NULL)
goto bad;
*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
}
if (src->ip6po_nexthop) {
dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
M_IP6OPT, canwait);
if (dst->ip6po_nexthop == NULL)
goto bad;
memcpy(dst->ip6po_nexthop, src->ip6po_nexthop,
src->ip6po_nexthop->sa_len);
}
PKTOPT_EXTHDRCPY(ip6po_hbh);
PKTOPT_EXTHDRCPY(ip6po_dest1);
PKTOPT_EXTHDRCPY(ip6po_dest2);
	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* does not copy the cached route */
return (0);
bad:
if (dst->ip6po_pktinfo) free(dst->ip6po_pktinfo, M_IP6OPT);
if (dst->ip6po_nexthop) free(dst->ip6po_nexthop, M_IP6OPT);
if (dst->ip6po_hbh) free(dst->ip6po_hbh, M_IP6OPT);
if (dst->ip6po_dest1) free(dst->ip6po_dest1, M_IP6OPT);
if (dst->ip6po_dest2) free(dst->ip6po_dest2, M_IP6OPT);
if (dst->ip6po_rthdr) free(dst->ip6po_rthdr, M_IP6OPT);
return (ENOBUFS);
}
#undef PKTOPT_EXTHDRCPY
struct ip6_pktopts *
ip6_copypktopts(struct ip6_pktopts *src, int canwait)
{
int error;
struct ip6_pktopts *dst;
dst = malloc(sizeof(*dst), M_IP6OPT, canwait);
if (dst == NULL)
return (NULL);
ip6_initpktopts(dst);
if ((error = copypktopts(dst, src, canwait)) != 0) {
free(dst, M_IP6OPT);
return (NULL);
}
return (dst);
}
void
ip6_freepcbopts(struct ip6_pktopts *pktopt)
{
if (pktopt == NULL)
return;
ip6_clearpktopts(pktopt, -1);
free(pktopt, M_IP6OPT);
}
int
ip6_get_membership(const struct sockopt *sopt, struct ifnet **ifp,
struct psref *psref, void *v, size_t l)
{
struct ipv6_mreq mreq;
int error;
struct in6_addr *ia = &mreq.ipv6mr_multiaddr;
struct in_addr *ia4 = (void *)&ia->s6_addr32[3];
error = sockopt_get(sopt, &mreq, sizeof(mreq));
if (error != 0)
return error;
if (IN6_IS_ADDR_UNSPECIFIED(ia)) {
/*
		 * The unspecified address is used to request acceptance of
		 * all multicast addresses.  Only the superuser is allowed
		 * to do this.
*/
if (kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_JOIN_MULTICAST, NULL, NULL, NULL))
return EACCES;
} else if (IN6_IS_ADDR_V4MAPPED(ia)) {
// Don't bother if we are not going to use ifp.
		if (l == sizeof(*ia)) {
			memcpy(v, ia, l);
return 0;
}
} else if (!IN6_IS_ADDR_MULTICAST(ia)) {
return EINVAL;
}
/*
* If no interface was explicitly specified, choose an
* appropriate one according to the given multicast address.
*/
if (mreq.ipv6mr_interface == 0) {
struct rtentry *rt;
union {
struct sockaddr dst;
struct sockaddr_in dst4;
struct sockaddr_in6 dst6;
} u;
struct route ro;
/*
* Look up the routing table for the
* address, and choose the outgoing interface.
* XXX: is it a good approach?
*/
memset(&ro, 0, sizeof(ro));
		if (IN6_IS_ADDR_V4MAPPED(ia))
			sockaddr_in_init(&u.dst4, ia4, 0);
else
sockaddr_in6_init(&u.dst6, ia, 0, 0, 0);
error = rtcache_setdst(&ro, &u.dst);
if (error != 0)
return error;
rt = rtcache_init(&ro);
*ifp = rt != NULL ? if_get_byindex(rt->rt_ifp->if_index, psref) : NULL;
rtcache_unref(rt, &ro);
rtcache_free(&ro);
} else {
/*
* If the interface is specified, validate it.
*/
*ifp = if_get_byindex(mreq.ipv6mr_interface, psref);
if (*ifp == NULL)
return ENXIO; /* XXX EINVAL? */
}
if (sizeof(*ia) == l)
memcpy(v, ia, l);
else
memcpy(v, ia4, l);
return 0;
}
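/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * the struct ipv6_mreq consumed above is normally filled in as follows
 * before an IPV6_JOIN_GROUP setsockopt(2) call.  The socket "s", the
 * group address and the interface name "wm0" are assumptions for the
 * example.
 *
 *	struct ipv6_mreq mreq;
 *	memset(&mreq, 0, sizeof(mreq));
 *	inet_pton(AF_INET6, "ff02::1:3", &mreq.ipv6mr_multiaddr);
 *	mreq.ipv6mr_interface = if_nametoindex("wm0");
 *	if (setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP,
 *	    &mreq, sizeof(mreq)) == -1)
 *		err(1, "IPV6_JOIN_GROUP");
 */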
/*
* Set the IP6 multicast options in response to user setsockopt().
*/
static int
ip6_setmoptions(const struct sockopt *sopt, struct inpcb *inp)
{
int error = 0;
u_int loop, ifindex;
struct ipv6_mreq mreq;
struct in6_addr ia;
struct ifnet *ifp;
struct ip6_moptions *im6o = in6p_moptions(inp);
struct in6_multi_mship *imm;
	KASSERT(inp_locked(inp));

	if (im6o == NULL) {
/*
* No multicast option buffer attached to the pcb;
* allocate one and initialize to default values.
*/
		im6o = malloc(sizeof(*im6o), M_IPMOPTS, M_NOWAIT);
		if (im6o == NULL)
return (ENOBUFS);
in6p_moptions(inp) = im6o;
im6o->im6o_multicast_if_index = 0;
im6o->im6o_multicast_hlim = ip6_defmcasthlim;
im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
LIST_INIT(&im6o->im6o_memberships);
}
switch (sopt->sopt_name) {
case IPV6_MULTICAST_IF: {
int s;
/*
* Select the interface for outgoing multicast packets.
*/
error = sockopt_get(sopt, &ifindex, sizeof(ifindex));
if (error != 0)
break;
s = pserialize_read_enter();
if (ifindex != 0) {
if ((ifp = if_byindex(ifindex)) == NULL) {
pserialize_read_exit(s);
error = ENXIO; /* XXX EINVAL? */
break;
}
if ((ifp->if_flags & IFF_MULTICAST) == 0) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
break;
}
} else
ifp = NULL;
im6o->im6o_multicast_if_index = if_get_index(ifp);
pserialize_read_exit(s);
break;
}
case IPV6_MULTICAST_HOPS:
{
/*
* Set the IP6 hoplimit for outgoing multicast packets.
*/
int optval;
error = sockopt_getint(sopt, &optval);
if (error != 0)
break;
if (optval < -1 || optval >= 256)
error = EINVAL;
else if (optval == -1)
im6o->im6o_multicast_hlim = ip6_defmcasthlim;
else
im6o->im6o_multicast_hlim = optval;
break;
}
case IPV6_MULTICAST_LOOP:
/*
* Set the loopback flag for outgoing multicast packets.
* Must be zero or one.
*/
error = sockopt_get(sopt, &loop, sizeof(loop));
if (error != 0)
break;
if (loop > 1) {
error = EINVAL;
break;
}
im6o->im6o_multicast_loop = loop;
break;
case IPV6_JOIN_GROUP: {
int bound;
struct psref psref;
/*
* Add a multicast group membership.
* Group must be a valid IP6 multicast address.
*/
bound = curlwp_bind();
ifp = NULL;
error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia));
if (error != 0) {
			KASSERT(ifp == NULL);
			curlwp_bindx(bound);
return error;
}
		if (IN6_IS_ADDR_V4MAPPED(&ia)) {
			error = ip_setmoptions(&inp->inp_moptions, sopt);
goto put_break;
}
/*
* See if we found an interface, and confirm that it
* supports multicast
*/
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
error = EADDRNOTAVAIL;
goto put_break;
}
if (in6_setscope(&ia, ifp, NULL)) {
error = EADDRNOTAVAIL; /* XXX: should not happen */
goto put_break;
}
/*
* See if the membership already exists.
*/
		LIST_FOREACH(imm, &im6o->im6o_memberships, i6mm_chain) {
			if (imm->i6mm_maddr->in6m_ifp == ifp &&
			    IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
			    &ia))
				break;
}
if (imm != NULL) {
error = EADDRINUSE;
goto put_break;
}
/*
* Everything looks good; add a new record to the multicast
* address list for the given interface.
*/
imm = in6_joingroup(ifp, &ia, &error, 0);
if (imm == NULL)
goto put_break;
LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
put_break:
if_put(ifp, &psref);
curlwp_bindx(bound);
break;
}
case IPV6_LEAVE_GROUP: {
/*
* Drop a multicast group membership.
* Group must be a valid IP6 multicast address.
*/
error = sockopt_get(sopt, &mreq, sizeof(mreq));
if (error != 0)
break;
		if (IN6_IS_ADDR_V4MAPPED(&mreq.ipv6mr_multiaddr)) {
			error = ip_setmoptions(&inp->inp_moptions, sopt);
break;
}
/*
* If an interface address was specified, get a pointer
* to its ifnet structure.
*/
if (mreq.ipv6mr_interface != 0) {
if ((ifp = if_byindex(mreq.ipv6mr_interface)) == NULL) {
error = ENXIO; /* XXX EINVAL? */
break;
}
} else
ifp = NULL;
/* Fill in the scope zone ID */
if (ifp) {
if (in6_setscope(&mreq.ipv6mr_multiaddr, ifp, NULL)) {
/* XXX: should not happen */
error = EADDRNOTAVAIL;
break;
}
} else if (mreq.ipv6mr_interface != 0) {
/*
			 * XXX: This case would happen when the (positive)
			 * index is in the valid range, but the corresponding
			 * interface has been detached dynamically.  The above
			 * check probably prevents such a case from reaching
			 * here, but we check it explicitly for safety.
*/
error = EADDRNOTAVAIL;
break;
} else { /* ipv6mr_interface == 0 */
struct sockaddr_in6 sa6_mc;
/*
* The API spec says as follows:
* If the interface index is specified as 0, the
* system may choose a multicast group membership to
* drop by matching the multicast address only.
* On the other hand, we cannot disambiguate the scope
* zone unless an interface is provided. Thus, we
* check if there's ambiguity with the default scope
* zone as the last resort.
*/
sockaddr_in6_init(&sa6_mc, &mreq.ipv6mr_multiaddr,
0, 0, 0);
error = sa6_embedscope(&sa6_mc, ip6_use_defzone);
if (error != 0)
break;
			mreq.ipv6mr_multiaddr = sa6_mc.sin6_addr;
		}
/*
* Find the membership in the membership list.
*/
		LIST_FOREACH(imm, &im6o->im6o_memberships, i6mm_chain) {
			if ((ifp == NULL || imm->i6mm_maddr->in6m_ifp == ifp) &&
IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
&mreq.ipv6mr_multiaddr))
break;
}
if (imm == NULL) {
/* Unable to resolve interface */
error = EADDRNOTAVAIL;
break;
}
/*
* Give up the multicast address record to which the
* membership points.
*/
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
		/* in6m_ifp should not go away while we hold inp_lock */
break;
}
default:
error = EOPNOTSUPP;
break;
}
/*
	 * If all options have default values, there is no need to keep
	 * the option structure around.
*/
	if (im6o->im6o_multicast_if_index == 0 &&
	    im6o->im6o_multicast_hlim == ip6_defmcasthlim &&
	    im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP &&
LIST_EMPTY(&im6o->im6o_memberships)) {
free(in6p_moptions(inp), M_IPMOPTS);
in6p_moptions(inp) = NULL;
}
return (error);
}
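/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * the remaining multicast options handled above take an interface index
 * (unsigned int), a hop limit (int) and a loopback flag (unsigned int),
 * matching the object sizes read by ip6_setmoptions().  "s" and "wm0"
 * are assumptions for the example.
 *
 *	unsigned int ifindex = if_nametoindex("wm0");
 *	int hops = 64;
 *	unsigned int loop = 0;
 *	setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_IF, &ifindex, sizeof(ifindex));
 *	setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hops, sizeof(hops));
 *	setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, &loop, sizeof(loop));
 */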
/*
* Return the IP6 multicast options in response to user getsockopt().
*/
static int
ip6_getmoptions(struct sockopt *sopt, struct inpcb *inp)
{
u_int optval;
int error;
struct ip6_moptions *im6o = in6p_moptions(inp);
switch (sopt->sopt_name) {
case IPV6_MULTICAST_IF:
if (im6o == NULL || im6o->im6o_multicast_if_index == 0)
optval = 0;
else
optval = im6o->im6o_multicast_if_index;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
case IPV6_MULTICAST_HOPS:
if (im6o == NULL)
optval = ip6_defmcasthlim;
else
optval = im6o->im6o_multicast_hlim;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
case IPV6_MULTICAST_LOOP:
if (im6o == NULL)
optval = IPV6_DEFAULT_MULTICAST_LOOP;
else
optval = im6o->im6o_multicast_loop;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
default:
error = EOPNOTSUPP;
}
return (error);
}
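/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * reading one of the values returned above.  As in ip6_getmoptions(),
 * the result is a full-width unsigned integer; "s" is an assumed
 * AF_INET6 socket.
 *
 *	unsigned int hops;
 *	socklen_t len = sizeof(hops);
 *	if (getsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &hops, &len) == -1)
 *		err(1, "IPV6_MULTICAST_HOPS");
 */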
/*
* Discard the IP6 multicast options.
*/
void
ip6_freemoptions(struct ip6_moptions *im6o)
{
struct in6_multi_mship *imm, *nimm;
if (im6o == NULL)
return;
/* The owner of im6o (inp) should be protected by solock */
	LIST_FOREACH_SAFE(imm, &im6o->im6o_memberships, i6mm_chain, nimm) {
		LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
free(im6o, M_IPMOPTS);
}
/*
* Set IPv6 outgoing packet options based on advanced API.
*/
int
ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
struct ip6_pktopts *stickyopt, kauth_cred_t cred, int uproto)
{
struct cmsghdr *cm = 0;
if (control == NULL || opt == NULL)
return (EINVAL);
ip6_initpktopts(opt);
if (stickyopt) {
int error;
/*
* If stickyopt is provided, make a local copy of the options
* for this particular packet, then override them by ancillary
* objects.
* XXX: copypktopts() does not copy the cached route to a next
* hop (if any). This is not very good in terms of efficiency,
* but we can allow this since this option should be rarely
* used.
*/
if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
return (error);
}
/*
* XXX: Currently, we assume all the optional information is stored
* in a single mbuf.
*/
if (control->m_next)
return (EINVAL);
/* XXX if cm->cmsg_len is not aligned, control->m_len can become <0 */
for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
int error;
if (control->m_len < CMSG_LEN(0))
return (EINVAL);
cm = mtod(control, struct cmsghdr *);
if (cm->cmsg_len < CMSG_LEN(0) || cm->cmsg_len > control->m_len)
return (EINVAL);
if (cm->cmsg_level != IPPROTO_IPV6)
continue;
error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
if (error)
return (error);
}
return (0);
}
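/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * the ancillary data parsed above is built with the CMSG_*() macros on
 * the sending side, e.g. to attach an IPV6_PKTINFO object to a single
 * sendmsg(2) call.  The names "s", "dst" (a struct sockaddr_in6), "buf",
 * "buflen" and the interface name "wm0" are assumptions for the example.
 *
 *	struct msghdr msg;
 *	struct iovec iov;
 *	struct cmsghdr *cmsg;
 *	struct in6_pktinfo *pi;
 *	char cbuf[CMSG_SPACE(sizeof(struct in6_pktinfo))];
 *
 *	memset(&msg, 0, sizeof(msg));
 *	memset(cbuf, 0, sizeof(cbuf));
 *	msg.msg_name = &dst;
 *	msg.msg_namelen = sizeof(dst);
 *	iov.iov_base = buf;
 *	iov.iov_len = buflen;
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_level = IPPROTO_IPV6;
 *	cmsg->cmsg_type = IPV6_PKTINFO;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
 *	pi = (struct in6_pktinfo *)CMSG_DATA(cmsg);
 *	memset(pi, 0, sizeof(*pi));
 *	pi->ipi6_ifindex = if_nametoindex("wm0");
 *	if (sendmsg(s, &msg, 0) == -1)
 *		err(1, "sendmsg");
 */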
/*
* Set a particular packet option, as a sticky option or an ancillary data
* item. "len" can be 0 only when it's a sticky option.
* We have 4 cases of combination of "sticky" and "cmsg":
* "sticky=0, cmsg=0": impossible
* "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
* "sticky=1, cmsg=0": RFC3542 socket option
* "sticky=1, cmsg=1": RFC2292 socket option
*/
static int
ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
kauth_cred_t cred, int sticky, int cmsg, int uproto)
{
int minmtupolicy;
int error;
if (!sticky && !cmsg) {
#ifdef DIAGNOSTIC
printf("ip6_setpktopt: impossible case\n");
#endif
return (EINVAL);
}
/*
* IPV6_2292xxx is for backward compatibility to RFC2292, and should
* not be specified in the context of RFC3542. Conversely,
* RFC3542 types should not be specified in the context of RFC2292.
*/
if (!cmsg) {
switch (optname) {
case IPV6_2292PKTINFO:
case IPV6_2292HOPLIMIT:
case IPV6_2292NEXTHOP:
case IPV6_2292HOPOPTS:
case IPV6_2292DSTOPTS:
case IPV6_2292RTHDR:
case IPV6_2292PKTOPTIONS:
return (ENOPROTOOPT);
}
}
if (sticky && cmsg) {
switch (optname) {
case IPV6_PKTINFO:
case IPV6_HOPLIMIT:
case IPV6_NEXTHOP:
case IPV6_HOPOPTS:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_RTHDR:
case IPV6_USE_MIN_MTU:
case IPV6_DONTFRAG:
case IPV6_OTCLASS:
case IPV6_TCLASS:
case IPV6_PREFER_TEMPADDR: /* XXX not an RFC3542 option */
return (ENOPROTOOPT);
}
}
switch (optname) {
#ifdef RFC2292
case IPV6_2292PKTINFO:
#endif
case IPV6_PKTINFO:
{
struct in6_pktinfo *pktinfo;
if (len != sizeof(struct in6_pktinfo))
return (EINVAL);
pktinfo = (struct in6_pktinfo *)buf;
/*
* An application can clear any sticky IPV6_PKTINFO option by
* doing a "regular" setsockopt with ipi6_addr being
* in6addr_any and ipi6_ifindex being zero.
* [RFC 3542, Section 6]
*/
		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
		    pktinfo->ipi6_ifindex == 0 &&
		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
			ip6_clearpktopts(opt, optname);
break;
}
		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
return (EINVAL);
}
/* Validate the interface index if specified. */
if (pktinfo->ipi6_ifindex) {
struct ifnet *ifp;
int s = pserialize_read_enter();
ifp = if_byindex(pktinfo->ipi6_ifindex);
if (ifp == NULL) {
pserialize_read_exit(s);
return ENXIO;
}
pserialize_read_exit(s);
}
/*
* We store the address anyway, and let in6_selectsrc()
* validate the specified address. This is because ipi6_addr
* may not have enough information about its scope zone, and
* we may need additional information (such as outgoing
* interface or the scope zone of a destination address) to
* disambiguate the scope.
* XXX: the delay of the validation may confuse the
* application when it is used as a sticky option.
*/
if (opt->ip6po_pktinfo == NULL) {
opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
M_IP6OPT, M_NOWAIT);
if (opt->ip6po_pktinfo == NULL)
return (ENOBUFS);
}
memcpy(opt->ip6po_pktinfo, pktinfo, sizeof(*pktinfo));
break;
}
#ifdef RFC2292
case IPV6_2292HOPLIMIT:
#endif
case IPV6_HOPLIMIT:
{
int *hlimp;
/*
* RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
* to simplify the ordering among hoplimit options.
*/
if (optname == IPV6_HOPLIMIT && sticky)
return (ENOPROTOOPT);
if (len != sizeof(int))
return (EINVAL);
hlimp = (int *)buf;
if (*hlimp < -1 || *hlimp > 255)
return (EINVAL);
opt->ip6po_hlim = *hlimp;
break;
}
case IPV6_OTCLASS:
if (len != sizeof(u_int8_t))
return (EINVAL);
opt->ip6po_tclass = *(u_int8_t *)buf;
break;
case IPV6_TCLASS:
{
int tclass;
if (len != sizeof(int))
return (EINVAL);
tclass = *(int *)buf;
if (tclass < -1 || tclass > 255)
return (EINVAL);
opt->ip6po_tclass = tclass;
break;
}
#ifdef RFC2292
case IPV6_2292NEXTHOP:
#endif
case IPV6_NEXTHOP:
error = kauth_authorize_network(cred,
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL);
if (error)
return (error);
if (len == 0) { /* just remove the option */
ip6_clearpktopts(opt, IPV6_NEXTHOP);
break;
}
/* check if cmsg_len is large enough for sa_len */
if (len < sizeof(struct sockaddr) || len < *buf)
return (EINVAL);
switch (((struct sockaddr *)buf)->sa_family) {
case AF_INET6:
{
struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
if (sa6->sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
return (EINVAL);
}
if ((error = sa6_embedscope(sa6, ip6_use_defzone))
!= 0) {
return (error);
}
break;
}
case AF_LINK: /* eventually be supported? */
default:
return (EAFNOSUPPORT);
}
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, IPV6_NEXTHOP);
opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_nexthop == NULL)
return (ENOBUFS);
memcpy(opt->ip6po_nexthop, buf, *buf);
break;
#ifdef RFC2292
case IPV6_2292HOPOPTS:
#endif
case IPV6_HOPOPTS:
{
struct ip6_hbh *hbh;
int hbhlen;
/*
* XXX: We don't allow a non-privileged user to set ANY HbH
* options, since per-option restriction has too much
* overhead.
*/
error = kauth_authorize_network(cred,
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL);
if (error)
return (error);
		if (len == 0) {
			ip6_clearpktopts(opt, IPV6_HOPOPTS);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_hbh))
return (EINVAL);
hbh = (struct ip6_hbh *)buf;
hbhlen = (hbh->ip6h_len + 1) << 3;
if (len != hbhlen)
return (EINVAL);
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, IPV6_HOPOPTS);
opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_hbh == NULL)
return (ENOBUFS);
memcpy(opt->ip6po_hbh, hbh, hbhlen);
break;
}
#ifdef RFC2292
case IPV6_2292DSTOPTS:
#endif
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
{
struct ip6_dest *dest, **newdest = NULL;
int destlen;
/* XXX: see the comment for IPV6_HOPOPTS */
error = kauth_authorize_network(cred,
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL);
if (error)
return (error);
		if (len == 0) {
			ip6_clearpktopts(opt, optname);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_dest))
return (EINVAL);
dest = (struct ip6_dest *)buf;
destlen = (dest->ip6d_len + 1) << 3;
if (len != destlen)
return (EINVAL);
/*
* Determine the position that the destination options header
* should be inserted; before or after the routing header.
*/
switch (optname) {
case IPV6_2292DSTOPTS:
/*
* The old advanced API is ambiguous on this point.
			 * Our approach is to determine the position based on
			 * the existence of a routing header.
* Note, however, that this depends on the order of the
* extension headers in the ancillary data; the 1st
* part of the destination options header must appear
* before the routing header in the ancillary data,
* too.
* RFC3542 solved the ambiguity by introducing
* separate ancillary data or option types.
*/
if (opt->ip6po_rthdr == NULL)
newdest = &opt->ip6po_dest1;
else
newdest = &opt->ip6po_dest2;
break;
case IPV6_RTHDRDSTOPTS:
newdest = &opt->ip6po_dest1;
break;
case IPV6_DSTOPTS:
newdest = &opt->ip6po_dest2;
break;
}
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, optname);
*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
if (*newdest == NULL)
return (ENOBUFS);
memcpy(*newdest, dest, destlen);
break;
}
#ifdef RFC2292
case IPV6_2292RTHDR:
#endif
case IPV6_RTHDR:
{
struct ip6_rthdr *rth;
int rthlen;
		if (len == 0) {
			ip6_clearpktopts(opt, IPV6_RTHDR);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_rthdr))
return (EINVAL);
rth = (struct ip6_rthdr *)buf;
rthlen = (rth->ip6r_len + 1) << 3;
if (len != rthlen)
return (EINVAL);
switch (rth->ip6r_type) {
case IPV6_RTHDR_TYPE_0:
/* Dropped, RFC5095. */
default:
return (EINVAL); /* not supported */
}
/* turn off the previous option */
ip6_clearpktopts(opt, IPV6_RTHDR);
opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_rthdr == NULL)
return (ENOBUFS);
memcpy(opt->ip6po_rthdr, rth, rthlen);
break;
}
case IPV6_USE_MIN_MTU:
if (len != sizeof(int))
return (EINVAL);
minmtupolicy = *(int *)buf;
if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
minmtupolicy != IP6PO_MINMTU_DISABLE &&
minmtupolicy != IP6PO_MINMTU_ALL) {
return (EINVAL);
}
opt->ip6po_minmtu = minmtupolicy;
break;
case IPV6_DONTFRAG:
if (len != sizeof(int))
return (EINVAL);
if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
/*
* we ignore this option for TCP sockets.
* (RFC3542 leaves this case unspecified.)
*/
opt->ip6po_flags &= ~IP6PO_DONTFRAG;
} else
opt->ip6po_flags |= IP6PO_DONTFRAG;
break;
case IPV6_PREFER_TEMPADDR:
{
int preftemp;
if (len != sizeof(int))
return (EINVAL);
preftemp = *(int *)buf;
switch (preftemp) {
case IP6PO_TEMPADDR_SYSTEM:
case IP6PO_TEMPADDR_NOTPREFER:
case IP6PO_TEMPADDR_PREFER:
break;
default:
return (EINVAL);
}
opt->ip6po_prefer_tempaddr = preftemp;
break;
}
default:
return (ENOPROTOOPT);
} /* end of switch */
return (0);
}
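/*
 * Illustrative userland sketch (not part of this file, headers omitted):
 * the same option names accepted above can also be installed as sticky
 * options with setsockopt(2), e.g. a traffic class for all packets sent
 * on the socket.  The value 0xb8 (DSCP EF in the upper six bits) and
 * the descriptor "s" are assumptions for the example.
 *
 *	int tclass = 0xb8;
 *	if (setsockopt(s, IPPROTO_IPV6, IPV6_TCLASS,
 *	    &tclass, sizeof(tclass)) == -1)
 *		err(1, "IPV6_TCLASS");
 */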
/*
* Routine called from ip6_output() to loop back a copy of an IP6 multicast
* packet to the input queue of a specified interface. Note that this
* calls the output routine of the loopback "driver", but with an interface
* pointer that might NOT be lo0ifp -- easier than replicating that code here.
*/
void
ip6_mloopback(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr_in6 *dst)
{
struct mbuf *copym;
struct ip6_hdr *ip6;
copym = m_copypacket(m, M_DONTWAIT);
if (copym == NULL)
return;
/*
* Make sure to deep-copy IPv6 header portion in case the data
* is in an mbuf cluster, so that we can safely override the IPv6
* header portion later.
*/
if ((copym->m_flags & M_EXT) != 0 ||
copym->m_len < sizeof(struct ip6_hdr)) {
copym = m_pullup(copym, sizeof(struct ip6_hdr));
if (copym == NULL)
return;
}
#ifdef DIAGNOSTIC
if (copym->m_len < sizeof(*ip6)) {
m_freem(copym);
return;
}
#endif
ip6 = mtod(copym, struct ip6_hdr *);
/*
* clear embedded scope identifiers if necessary.
* in6_clearscope will touch the addresses only when necessary.
*/
in6_clearscope(&ip6->ip6_src);
in6_clearscope(&ip6->ip6_dst);
(void)looutput(ifp, copym, (const struct sockaddr *)dst, NULL);
}
/*
* Chop IPv6 header off from the payload.
*/
static int
ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
{
struct mbuf *mh;
struct ip6_hdr *ip6;
ip6 = mtod(m, struct ip6_hdr *);
if (m->m_len > sizeof(*ip6)) {
MGETHDR(mh, M_DONTWAIT, MT_HEADER);
if (mh == NULL) {
m_freem(m);
return ENOBUFS;
}
m_move_pkthdr(mh, m);
m_align(mh, sizeof(*ip6));
m->m_len -= sizeof(*ip6);
m->m_data += sizeof(*ip6);
mh->m_next = m;
mh->m_len = sizeof(*ip6);
memcpy(mtod(mh, void *), (void *)ip6, sizeof(*ip6));
m = mh;
}
exthdrs->ip6e_ip6 = m;
return 0;
}
/*
* Compute IPv6 extension header length.
*/
int
ip6_optlen(struct inpcb *inp)
{
int len;
if (!in6p_outputopts(inp))
return 0;
len = 0;
#define elen(x) \
(((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
	len += elen(in6p_outputopts(inp)->ip6po_hbh);
	len += elen(in6p_outputopts(inp)->ip6po_dest1);
	len += elen(in6p_outputopts(inp)->ip6po_rthdr);
	len += elen(in6p_outputopts(inp)->ip6po_dest2);
return len;
#undef elen
}
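/*
 * Worked example (illustrative): with sticky options carrying an 8-byte
 * Hop-by-Hop header (ip6e_len 0) and a 16-byte destination options
 * header after the routing header (ip6e_len 1), and no other headers,
 * ip6_optlen() returns 8 + 0 + 0 + 16 = 24.
 */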
/*
* Ensure sending address is valid.
* Returns 0 on success, -1 if an error should be sent back or 1
* if the packet could be dropped without error (protocol dependent).
*/
static int
ip6_ifaddrvalid(const struct in6_addr *src, const struct in6_addr *dst)
{
struct sockaddr_in6 sin6;
int s, error;
struct ifaddr *ifa;
struct in6_ifaddr *ia6;
if (IN6_IS_ADDR_UNSPECIFIED(src))
return 0;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(sin6);
sin6.sin6_addr = *src;
s = pserialize_read_enter();
ifa = ifa_ifwithaddr(sin6tosa(&sin6));
if ((ia6 = ifatoia6(ifa)) == NULL ||
ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED))
error = -1;
else if (ia6->ia6_flags & IN6_IFF_TENTATIVE)
error = 1;
else if (ia6->ia6_flags & IN6_IFF_DETACHED &&
(sin6.sin6_addr = *dst, ifa_ifwithaddr(sin6tosa(&sin6)) == NULL))
/* Allow internal traffic to DETACHED addresses */
error = 1;
else
error = 0;
pserialize_read_exit(s);
return error;
}
/* $NetBSD: exec_script.c,v 1.83 2021/05/03 10:25:14 fcambus Exp $ */
/*
* Copyright (c) 1993, 1994, 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_script.c,v 1.83 2021/05/03 10:25:14 fcambus Exp $");
#ifdef _KERNEL_OPT
#include "opt_script.h"
#endif
#if defined(SETUIDSCRIPTS) && !defined(FDSCRIPTS)
#define FDSCRIPTS /* Need this for safe set-id scripts. */
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/file.h>
#ifdef SETUIDSCRIPTS
#include <sys/stat.h>
#endif
#include <sys/filedesc.h>
#include <sys/exec.h>
#include <sys/resourcevar.h>
#include <sys/module.h>
#include <sys/exec_script.h>
#include <sys/exec_elf.h>
MODULE(MODULE_CLASS_EXEC, exec_script, NULL);
static struct execsw exec_script_execsw = {
.es_hdrsz = SCRIPT_HDR_SIZE,
.es_makecmds = exec_script_makecmds,
.u = {
.elf_probe_func = NULL,
},
.es_emul = NULL,
.es_prio = EXECSW_PRIO_ANY,
.es_arglen = 0,
.es_copyargs = NULL,
.es_setregs = NULL,
.es_coredump = NULL,
.es_setup_stack = exec_setup_stack,
};
static int
exec_script_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return exec_add(&exec_script_execsw, 1);
case MODULE_CMD_FINI:
return exec_remove(&exec_script_execsw, 1);
case MODULE_CMD_AUTOUNLOAD:
/*
* We don't want to be autounloaded because our use is
* transient: no executables with p_execsw equal to
* exec_script_execsw will exist, so FINI will never
* return EBUSY. However, the system will run scripts
* often. Return EBUSY here to prevent this module from
* ping-ponging in and out of the kernel.
*/
return EBUSY;
default:
return ENOTTY;
}
}
/*
* exec_script_makecmds(): Check if it's an executable shell script.
*
* Given a proc pointer and an exec package pointer, see if the referent
 * of the epp is a shell script.  If it is, then set things up so that
* the script can be run. This involves preparing the address space
* and arguments for the shell which will run the script.
*
* This function is ultimately responsible for creating a set of vmcmds
* which can be used to build the process's vm space and inserting them
* into the exec package.
*/
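/*
 * Worked example (illustrative): for a script whose first line is
 * "#!/bin/sh -x", the parsing below yields shellname "/bin/sh" and
 * shellarg "-x", and the fake argument list becomes, roughly,
 * { "/bin/sh", "-x", script path or "/dev/fd/N" }, which is prepended
 * to the user-supplied arguments when the interpreter is exec'd.
 */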
int
exec_script_makecmds(struct lwp *l, struct exec_package *epp)
{
int error, hdrlinelen, shellnamelen, shellarglen;
char *hdrstr = epp->ep_hdr;
char *cp, *shellname, *shellarg;
size_t shellargp_len;
struct exec_fakearg *shellargp;
struct exec_fakearg *tmpsap;
struct pathbuf *shell_pathbuf;
struct vnode *scriptvp;
#ifdef SETUIDSCRIPTS
/* Gcc needs those initialized for spurious uninitialized warning */
uid_t script_uid = (uid_t) -1;
gid_t script_gid = NOGROUP;
u_short script_sbits;
#endif
/*
* if the magic isn't that of a shell script, or we've already
* done shell script processing for this exec, punt on it.
*/
	if ((epp->ep_flags & EXEC_INDIR) != 0 ||
	    epp->ep_hdrvalid < EXEC_SCRIPT_MAGICLEN ||
strncmp(hdrstr, EXEC_SCRIPT_MAGIC, EXEC_SCRIPT_MAGICLEN))
return ENOEXEC;
/*
* Check that the shell spec is terminated by a newline, and that
* it isn't too large.
*/
hdrlinelen = uimin(epp->ep_hdrvalid, SCRIPT_HDR_SIZE);
for (cp = hdrstr + EXEC_SCRIPT_MAGICLEN; cp < hdrstr + hdrlinelen;
cp++) {
if (*cp == '\n') {
*cp = '\0';
break;
}
}
if (cp >= hdrstr + hdrlinelen)
return ENOEXEC;
/* strip spaces before the shell name */
for (cp = hdrstr + EXEC_SCRIPT_MAGICLEN; *cp == ' ' || *cp == '\t';
cp++)
;
if (*cp == '\0')
return ENOEXEC;
shellarg = NULL;
shellarglen = 0;
/* collect the shell name; remember its length for later */
shellname = cp;
shellnamelen = 0;
for ( /* cp = cp */ ; *cp != '\0' && *cp != ' ' && *cp != '\t'; cp++)
shellnamelen++;
if (*cp == '\0')
goto check_shell;
*cp++ = '\0';
/* skip spaces before any argument */
for ( /* cp = cp */ ; *cp == ' ' || *cp == '\t'; cp++)
;
if (*cp == '\0')
goto check_shell;
/*
* collect the shell argument. everything after the shell name
* is passed as ONE argument; that's the correct (historical)
* behaviour.
*/
shellarg = cp;
for ( /* cp = cp */ ; *cp != '\0'; cp++)
shellarglen++;
*cp++ = '\0';
check_shell:
#ifdef SETUIDSCRIPTS
/*
	 * MNT_NOSUID has already been taken care of by check_exec,
* so we don't need to worry about it now or later. We
* will need to check PSL_TRACED later, however.
*/
script_sbits = epp->ep_vap->va_mode & (S_ISUID | S_ISGID);
if (script_sbits != 0) {
script_uid = epp->ep_vap->va_uid;
script_gid = epp->ep_vap->va_gid;
}
#endif
#ifdef FDSCRIPTS
/*
* if the script isn't readable, or it's set-id, then we've
* gotta supply a "/dev/fd/..." for the shell to read.
* Note that stupid shells (csh) do the wrong thing, and
* close all open fd's when they start. That kills this
* method of implementing "safe" set-id and x-only scripts.
*/
vn_lock(epp->ep_vp, LK_SHARED | LK_RETRY);
error = VOP_ACCESS(epp->ep_vp, VREAD, l->l_cred);
VOP_UNLOCK(epp->ep_vp);
if (error == EACCES
#ifdef SETUIDSCRIPTS
|| script_sbits
#endif
) {
struct file *fp;
KASSERT(!(epp->ep_flags & EXEC_HASFD));
if ((error = fd_allocfile(&fp, &epp->ep_fd)) != 0) {
scriptvp = NULL;
shellargp = NULL;
goto fail;
}
epp->ep_flags |= EXEC_HASFD;
fp->f_type = DTYPE_VNODE;
fp->f_ops = &vnops;
fp->f_vnode = epp->ep_vp;
fp->f_flag = FREAD;
fd_affix(curproc, fp, epp->ep_fd);
}
#endif
/* set up the fake args list */
shellargp_len = 4 * sizeof(*shellargp);
shellargp = kmem_alloc(shellargp_len, KM_SLEEP);
tmpsap = shellargp;
tmpsap->fa_len = shellnamelen + 1;
tmpsap->fa_arg = kmem_alloc(tmpsap->fa_len, KM_SLEEP);
strlcpy(tmpsap->fa_arg, shellname, tmpsap->fa_len);
tmpsap++;
	if (shellarg != NULL) {
		tmpsap->fa_len = shellarglen + 1;
tmpsap->fa_arg = kmem_alloc(tmpsap->fa_len, KM_SLEEP);
strlcpy(tmpsap->fa_arg, shellarg, tmpsap->fa_len);
tmpsap++;
}
tmpsap->fa_len = MAXPATHLEN;
tmpsap->fa_arg = kmem_alloc(tmpsap->fa_len, KM_SLEEP);
#ifdef FDSCRIPTS
if ((epp->ep_flags & EXEC_HASFD) == 0) {
#endif
/* normally can't fail, but check for it if diagnostic */
error = copystr(epp->ep_kname, tmpsap->fa_arg, MAXPATHLEN,
NULL);
KASSERT(error == 0);
tmpsap++;
#ifdef FDSCRIPTS
} else {
snprintf(tmpsap->fa_arg, MAXPATHLEN, "/dev/fd/%d", epp->ep_fd);
tmpsap++;
}
#endif
tmpsap->fa_arg = NULL;
/* Save the old vnode so we can clean it up later. */
scriptvp = epp->ep_vp;
epp->ep_vp = NULL;
/* Note that we're trying recursively. */
epp->ep_flags |= EXEC_INDIR;
/*
* mark the header we have as invalid; check_exec will read
* the header from the new executable
*/
epp->ep_hdrvalid = 0;
/* try loading the interpreter */
if ((error = exec_makepathbuf(l, shellname, UIO_SYSSPACE,
&shell_pathbuf, NULL)) == 0) {
error = check_exec(l, epp, shell_pathbuf, NULL);
pathbuf_destroy(shell_pathbuf);
}
/* note that we've clobbered the header */
epp->ep_flags |= EXEC_DESTR;
if (error == 0) {
/*
* It succeeded. Unlock the script and
* close it if we aren't using it any more.
* Also, set things up so that the fake args
* list will be used.
*/
		if ((epp->ep_flags & EXEC_HASFD) == 0) {
			vn_lock(scriptvp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(scriptvp, FREAD, l->l_cred);
vput(scriptvp);
}
epp->ep_flags |= (EXEC_HASARGL | EXEC_SKIPARG);
epp->ep_fa = shellargp;
epp->ep_fa_len = shellargp_len;
#ifdef SETUIDSCRIPTS
/*
		 * set things up so that set-id scripts will be
* handled appropriately. PSL_TRACED will be
* checked later when the shell is actually
* exec'd.
*/
epp->ep_vap->va_mode |= script_sbits;
if (script_sbits & S_ISUID)
epp->ep_vap->va_uid = script_uid;
if (script_sbits & S_ISGID)
epp->ep_vap->va_gid = script_gid;
#endif
return (0);
}
#ifdef FDSCRIPTS
fail:
#endif
/* kill the opened file descriptor, else close the file */
if (epp->ep_flags & EXEC_HASFD) {
epp->ep_flags &= ~EXEC_HASFD;
fd_close(epp->ep_fd);
	} else if (scriptvp) {
		vn_lock(scriptvp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(scriptvp, FREAD, l->l_cred);
vput(scriptvp);
}
/* free the fake arg list, because we're not returning it */
if ((tmpsap = shellargp) != NULL) {
while (tmpsap->fa_arg != NULL) {
kmem_free(tmpsap->fa_arg, tmpsap->fa_len);
tmpsap++;
}
kmem_free(shellargp, shellargp_len);
}
/*
* free any vmspace-creation commands,
* and release their references
*/
kill_vmcmds(&epp->ep_vmcmds);
return error;
}
/* $NetBSD: kern_scdebug.c,v 1.2 2019/03/14 19:51:49 palle Exp $ */
/*
* Copyright (c) 2015 Matthew R. Green
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_xxx.c 8.3 (Berkeley) 2/14/95
* from: NetBSD: kern_xxx.c,v 1.74 2017/10/28 00:37:11 pgoyette Exp
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_scdebug.c,v 1.2 2019/03/14 19:51:49 palle Exp $");
#ifdef _KERNEL_OPT
#include "opt_syscall_debug.h"
#include "opt_kernhist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/kernhist.h>
/*
* Pull in the indirect syscall functions here.
 * They are only actually used if the port's syscall entry code
 * doesn't special-case SYS_SYSCALL and SYS___SYSCALL.
*
* In some cases the generated code for the two functions is identical,
* but there isn't a MI way of determining that - so we don't try.
*/
#define SYS_SYSCALL sys_syscall
#include "sys_syscall.c"
#undef SYS_SYSCALL
#define SYS_SYSCALL sys___syscall
#include "sys_syscall.c"
#undef SYS_SYSCALL
#ifdef SYSCALL_DEBUG
#define SCDEBUG_CALLS 0x0001 /* show calls */
#define SCDEBUG_RETURNS 0x0002 /* show returns */
#define SCDEBUG_ALL 0x0004 /* even syscalls that are not implemented */
#define SCDEBUG_SHOWARGS 0x0008 /* show arguments to calls */
#define SCDEBUG_KERNHIST 0x0010 /* use kernhist instead of printf */
#ifndef SCDEBUG_DEFAULT
#define SCDEBUG_DEFAULT (SCDEBUG_CALLS|SCDEBUG_RETURNS|SCDEBUG_SHOWARGS)
#endif
int scdebug = SCDEBUG_DEFAULT;
#ifdef KERNHIST
KERNHIST_DEFINE(scdebughist);
#define SCDEBUG_KERNHIST_FUNC(a) KERNHIST_FUNC(a)
#define SCDEBUG_KERNHIST_CALLED(a) KERNHIST_CALLED(a)
#define SCDEBUG_KERNHIST_LOG(a,b,c,d,e,f) KERNHIST_LOG(a,b,c,d,e,f)
#else
#define SCDEBUG_KERNHIST_FUNC(a) {} /* nothing */
#define SCDEBUG_KERNHIST_CALLED(a) {} /* nothing */
#define SCDEBUG_KERNHIST_LOG(a,b,c,d,e,f) {} /* nothing */
/* The non-kernhist support version can elide all this code easily. */
#undef SCDEBUG_KERNHIST
#define SCDEBUG_KERNHIST 0
#endif
#ifdef __HAVE_MINIMAL_EMUL
#define CODE_NOT_OK(code, em) ((int)(code) < 0)
#else
#define CODE_NOT_OK(code, em) (((int)(code) < 0) || \
((int)(code) >= (em)->e_nsysent))
#endif
void
scdebug_call(register_t code, const register_t args[])
{
SCDEBUG_KERNHIST_FUNC("scdebug_call");
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
const struct sysent *sy;
const struct emul *em;
int i;
if ((scdebug & SCDEBUG_CALLS) == 0)
return;
if (scdebug & SCDEBUG_KERNHIST)
SCDEBUG_KERNHIST_CALLED(scdebughist);
em = p->p_emul;
sy = &em->e_sysent[code];
if ((scdebug & SCDEBUG_ALL) == 0 &&
(CODE_NOT_OK(code, em) || sy->sy_call == sys_nosys)) {
if (scdebug & SCDEBUG_KERNHIST)
SCDEBUG_KERNHIST_LOG(scdebughist, "", 0, 0, 0, 0);
return;
}
/*
* The kernhist version of scdebug needs to restrict the usage
* compared to the normal version. histories must avoid these
* sorts of usage:
*
* - the format string *must* be literal, as it is used
* at display time in the kernel or userland
* - strings in the format will cause vmstat -u to crash
* so avoid using %s formats
*
* to avoid these, we have a fairly long block to print args
* as the format needs to change for each, and we can't just
* call printf() on each argument until we're done.
*/
if (scdebug & SCDEBUG_KERNHIST) {
if (CODE_NOT_OK(code, em)) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"pid %jd:%jd: OUT OF RANGE (%jd)",
p->p_pid, l->l_lid, code, 0);
} else {
SCDEBUG_KERNHIST_LOG(scdebughist,
"pid %jd:%jd: num %jd call %#jx",
p->p_pid, l->l_lid, code, (uintptr_t)sy->sy_call);
if ((scdebug & SCDEBUG_SHOWARGS) == 0)
return;
if (sy->sy_narg > 7) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[4-7]: (%jx, %jx, %jx, %jx, ...)",
(long)args[4], (long)args[5],
(long)args[6], (long)args[7]);
} else if (sy->sy_narg > 6) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[4-6]: (%jx, %jx, %jx)",
(long)args[4], (long)args[5],
(long)args[6], 0);
} else if (sy->sy_narg > 5) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[4-5]: (%jx, %jx)",
(long)args[4], (long)args[5], 0, 0);
} else if (sy->sy_narg == 5) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[4]: (%jx)",
(long)args[4], 0, 0, 0);
}
if (sy->sy_narg > 3) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[0-3]: (%jx, %jx, %jx, %jx, ...)",
(long)args[0], (long)args[1],
(long)args[2], (long)args[3]);
} else if (sy->sy_narg > 2) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[0-2]: (%jx, %jx, %jx)",
(long)args[0], (long)args[1],
(long)args[2], 0);
} else if (sy->sy_narg > 1) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[0-1]: (%jx, %jx)",
(long)args[0], (long)args[1], 0, 0);
} else if (sy->sy_narg == 1) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[0]: (%jx)",
(long)args[0], 0, 0, 0);
}
}
return;
}
printf("proc %d (%s): %s num ", p->p_pid, p->p_comm, em->e_name);
if (CODE_NOT_OK(code, em))
printf("OUT OF RANGE (%ld)", (long)code);
else {
printf("%ld call: %s", (long)code, em->e_syscallnames[code]);
if (scdebug & SCDEBUG_SHOWARGS) {
printf("(");
for (i = 0; i < sy->sy_argsize/sizeof(register_t); i++)
printf("%s0x%lx", i == 0 ? "" : ", ",
(long)args[i]);
printf(")");
}
}
printf("\n");
}
void
scdebug_ret(register_t code, int error, const register_t retval[])
{
SCDEBUG_KERNHIST_FUNC("scdebug_ret");
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
const struct sysent *sy;
const struct emul *em;
if ((scdebug & SCDEBUG_RETURNS) == 0)
return;
if (scdebug & SCDEBUG_KERNHIST)
SCDEBUG_KERNHIST_CALLED(scdebughist);
em = p->p_emul;
sy = &em->e_sysent[code];
if ((scdebug & SCDEBUG_ALL) == 0 &&
(CODE_NOT_OK(code, em) || sy->sy_call == sys_nosys)) {
if (scdebug & SCDEBUG_KERNHIST)
SCDEBUG_KERNHIST_LOG(scdebughist, "", 0, 0, 0, 0);
return;
}
if (scdebug & SCDEBUG_KERNHIST) {
if (CODE_NOT_OK(code, em)) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"pid %jd:%jd: OUT OF RANGE (%jd)",
p->p_pid, l->l_lid, code, 0);
} else {
SCDEBUG_KERNHIST_LOG(scdebughist,
"pid %jd:%jd: num %jd",
p->p_pid, l->l_lid, code, 0);
SCDEBUG_KERNHIST_LOG(scdebughist,
"ret: err = %jd, rv = 0x%jx,0x%jx",
error, (long)retval[0], (long)retval[1], 0);
}
return;
}
printf("proc %d (%s): %s num ", p->p_pid, p->p_comm, em->e_name);
if (CODE_NOT_OK(code, em))
printf("OUT OF RANGE (%ld)", (long)code);
else
printf("%ld ret %s: err = %d, rv = 0x%lx,0x%lx", (long)code,
em->e_syscallnames[code], error,
(long)retval[0], (long)retval[1]);
printf("\n");
}
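/*
 * Illustrative output (hypothetical pid, program name and argument
 * values), as produced by the printf() paths above with the default
 * scdebug flags for the native "netbsd" emulation, assuming syscall 3
 * is read(2):
 *
 *	proc 153 (cat): netbsd num 3 call: read(0x3, 0x7f7fff79a000, 0x2000)
 *	proc 153 (cat): netbsd num 3 ret read: err = 0, rv = 0x2000,0x0
 */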
#endif /* SYSCALL_DEBUG */
#ifndef SCDEBUG_KERNHIST_SIZE
#define SCDEBUG_KERNHIST_SIZE 500
#endif
void
scdebug_init(void)
{
#if defined(SYSCALL_DEBUG) && defined(KERNHIST)
/* Setup scdebughist kernel history */
KERNHIST_INIT(scdebughist, SCDEBUG_KERNHIST_SIZE);
#endif
}
/* $NetBSD: vfs_lockf.c,v 1.81 2023/09/23 18:21:11 ad Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Scooter Morris at Genentech Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_lockf.c 8.4 (Berkeley) 10/26/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_lockf.c,v 1.81 2023/09/23 18:21:11 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/kmem.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/atomic.h>
#include <sys/kauth.h>
#include <sys/uidinfo.h>
/*
* The lockf structure is a kernel structure which contains the information
* associated with a byte range lock. The lockf structures are linked into
* the vnode structure. Locks are sorted by the starting byte of the lock for
* efficiency.
*
* lf_next is used for two purposes, depending on whether the lock is
* being held, or is in conflict with an existing lock. If this lock
* is held, it indicates the next lock on the same vnode.
* For pending locks, if lock->lf_next is non-NULL, then lock->lf_block
* must be queued on the lf_blkhd TAILQ of lock->lf_next.
*/
TAILQ_HEAD(locklist, lockf);
struct lockf {
kcondvar_t lf_cv; /* Signalling */
short lf_flags; /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */
short lf_type; /* Lock type: F_RDLCK, F_WRLCK */
off_t lf_start; /* The byte # of the start of the lock */
off_t lf_end; /* The byte # of the end of the lock (-1=EOF)*/
void *lf_id; /* process or file description holding lock */
struct lockf **lf_head; /* Back pointer to the head of lockf list */
struct lockf *lf_next; /* Next lock on this vnode, or blocking lock */
struct locklist lf_blkhd; /* List of requests blocked on this lock */
TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */
struct uidinfo *lf_uip; /* Cached pointer to uidinfo */
};
/* Maximum length of sleep chains to traverse to try and detect deadlock. */
#define MAXDEPTH 50
static kmutex_t lockf_lock __cacheline_aligned;
static char lockstr[] = "lockf";
/*
* This variable controls the maximum number of processes that will
* be checked in doing deadlock detection.
*/
int maxlockdepth = MAXDEPTH;
#ifdef LOCKF_DEBUG
int lockf_debug = 0;
#endif
#define SELF 0x1
#define OTHERS 0x2
/*
* XXX TODO
* Misc cleanups: "void *id" should be visible in the API as a
* "struct proc *".
* (This requires rototilling all VFS's which support advisory locking).
*/
/*
* If there's a lot of lock contention on a single vnode, locking
 * schemes which allow for more parallelism would be needed.  Given how
* infrequently byte-range locks are actually used in typical BSD
* code, a more complex approach probably isn't worth it.
*/
/*
* We enforce a limit on locks by uid, so that a single user cannot
* run the kernel out of memory. For now, the limit is pretty coarse.
* There is no limit on root.
*
* Splitting a lock will always succeed, regardless of current allocations.
* If you're slightly above the limit, we still have to permit an allocation
* so that the unlock can succeed. If the unlocking causes too many splits,
 * however, you're totally cut off.
*/
#define MAXLOCKSPERUID (2 * maxfiles)
#ifdef LOCKF_DEBUG
/*
* Print out a lock.
*/
static void
lf_print(const char *tag, struct lockf *lock)
{
printf("%s: lock %p for ", tag, lock);
if (lock->lf_flags & F_POSIX)
printf("proc %d", ((struct proc *)lock->lf_id)->p_pid);
else
printf("file %p", (struct file *)lock->lf_id);
printf(" %s, start %jd, end %jd",
lock->lf_type == F_RDLCK ? "shared" :
lock->lf_type == F_WRLCK ? "exclusive" :
lock->lf_type == F_UNLCK ? "unlock" :
"unknown", (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
if (TAILQ_FIRST(&lock->lf_blkhd))
printf(" block %p\n", TAILQ_FIRST(&lock->lf_blkhd));
else
printf("\n");
}
static void
lf_printlist(const char *tag, struct lockf *lock)
{
struct lockf *lf, *blk;
printf("%s: Lock list:\n", tag);
for (lf = *lock->lf_head; lf; lf = lf->lf_next) {
printf("\tlock %p for ", lf);
if (lf->lf_flags & F_POSIX)
printf("proc %d", ((struct proc *)lf->lf_id)->p_pid);
else
printf("file %p", (struct file *)lf->lf_id);
printf(", %s, start %jd, end %jd",
lf->lf_type == F_RDLCK ? "shared" :
lf->lf_type == F_WRLCK ? "exclusive" :
lf->lf_type == F_UNLCK ? "unlock" :
"unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
if (blk->lf_flags & F_POSIX)
printf("; proc %d",
((struct proc *)blk->lf_id)->p_pid);
else
printf("; file %p", (struct file *)blk->lf_id);
printf(", %s, start %jd, end %jd",
blk->lf_type == F_RDLCK ? "shared" :
blk->lf_type == F_WRLCK ? "exclusive" :
blk->lf_type == F_UNLCK ? "unlock" :
"unknown", (intmax_t)blk->lf_start, (intmax_t)blk->lf_end);
if (TAILQ_FIRST(&blk->lf_blkhd))
panic("lf_printlist: bad list");
}
printf("\n");
}
}
#endif /* LOCKF_DEBUG */
/*
* 3 options for allowfail.
* 0 - always allocate. 1 - cutoff at limit. 2 - cutoff at double limit.
*/
static struct lockf *
lf_alloc(int allowfail)
{
struct uidinfo *uip;
struct lockf *lock;
u_long lcnt;
const uid_t uid = kauth_cred_geteuid(kauth_cred_get());
uip = uid_find(uid);
lcnt = atomic_inc_ulong_nv(&uip->ui_lockcnt);
if (uid && allowfail && lcnt >
(allowfail == 1 ? MAXLOCKSPERUID : (MAXLOCKSPERUID * 2))) {
atomic_dec_ulong(&uip->ui_lockcnt);
return NULL;
}
lock = kmem_alloc(sizeof(*lock), KM_SLEEP);
lock->lf_uip = uip;
cv_init(&lock->lf_cv, lockstr);
return lock;
}
static void
lf_free(struct lockf *lock)
{
atomic_dec_ulong(&lock->lf_uip->ui_lockcnt);
cv_destroy(&lock->lf_cv);
kmem_free(lock, sizeof(*lock));
}
/*
* Walk the list of locks for an inode to
* find an overlapping lock (if any).
*
* NOTE: this returns only the FIRST overlapping lock. There
* may be more than one.
*/
static int
lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
struct lockf ***prev, struct lockf **overlap)
{
off_t start, end;
*overlap = lf;
if (lf == NULL)
return 0;
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
lf_print("lf_findoverlap: looking for overlap in", lock);
#endif /* LOCKF_DEBUG */
start = lock->lf_start;
end = lock->lf_end;
	while (lf != NULL) {
		if (((type == SELF) && lf->lf_id != lock->lf_id) ||
((type == OTHERS) && lf->lf_id == lock->lf_id)) {
*prev = &lf->lf_next;
*overlap = lf = lf->lf_next;
continue;
}
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
lf_print("\tchecking", lf);
#endif /* LOCKF_DEBUG */
/*
* OK, check for overlap
*
* Six cases:
* 0) no overlap
* 1) overlap == lock
* 2) overlap contains lock
* 3) lock contains overlap
* 4) overlap starts before lock
* 5) overlap ends after lock
*/
		if ((lf->lf_end != -1 && start > lf->lf_end) ||
		    (end != -1 && lf->lf_start > end)) {
/* Case 0 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("no overlap\n");
#endif /* LOCKF_DEBUG */
if ((type & SELF) && end != -1 && lf->lf_start > end)
return 0;
*prev = &lf->lf_next;
*overlap = lf = lf->lf_next;
continue;
}
if ((lf->lf_start == start) && (lf->lf_end == end)) {
/* Case 1 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("overlap == lock\n");
#endif /* LOCKF_DEBUG */
return 1;
}
if ((lf->lf_start <= start) &&
(end != -1) &&
((lf->lf_end >= end) || (lf->lf_end == -1))) {
/* Case 2 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("overlap contains lock\n");
#endif /* LOCKF_DEBUG */
return 2;
}
if (start <= lf->lf_start &&
(end == -1 || (lf->lf_end != -1 && end >= lf->lf_end))) {
/* Case 3 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("lock contains overlap\n");
#endif /* LOCKF_DEBUG */
return 3;
}
if ((lf->lf_start < start) &&
((lf->lf_end >= start) || (lf->lf_end == -1))) {
/* Case 4 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("overlap starts before lock\n");
#endif /* LOCKF_DEBUG */
return 4;
}
if ((lf->lf_start > start) &&
(end != -1) &&
((lf->lf_end > end) || (lf->lf_end == -1))) {
/* Case 5 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("overlap ends after lock\n");
#endif /* LOCKF_DEBUG */
return 5;
}
panic("lf_findoverlap: default");
}
return 0;
}
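/*
 * Illustrative sketch, not part of the original source: the same six-way
 * classification lf_findoverlap() performs above, applied to a requested
 * range [start,end] against an existing range [s,e], where an end of -1
 * means "through EOF".  The function name and the #if 0 guard are ours;
 * the block is not compiled and relies on this file's own includes.
 */
#if 0
static int
overlap_case(off_t start, off_t end, off_t s, off_t e)
{
	if ((e != -1 && start > e) || (end != -1 && s > end))
		return 0;	/* case 0: no overlap */
	if (s == start && e == end)
		return 1;	/* case 1: overlap == lock */
	if (s <= start && end != -1 && (e >= end || e == -1))
		return 2;	/* case 2: overlap contains lock */
	if (start <= s && (end == -1 || (e != -1 && end >= e)))
		return 3;	/* case 3: lock contains overlap */
	if (s < start && (e >= start || e == -1))
		return 4;	/* case 4: overlap starts before lock */
	if (s > start && end != -1 && (e > end || e == -1))
		return 5;	/* case 5: overlap ends after lock */
	return -1;		/* unreachable, mirrors the panic above */
}
#endif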
/*
* Split a lock and a contained region into
* two or three locks as necessary.
*/
static void
lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **sparelock)
{
struct lockf *splitlock;
#ifdef LOCKF_DEBUG
if (lockf_debug & 2) {
lf_print("lf_split", lock1);
lf_print("splitting from", lock2);
}
#endif /* LOCKF_DEBUG */
/*
* Check to see if splitting into only two pieces.
*/
if (lock1->lf_start == lock2->lf_start) {
lock1->lf_start = lock2->lf_end + 1;
lock2->lf_next = lock1;
return;
}
if (lock1->lf_end == lock2->lf_end) {
lock1->lf_end = lock2->lf_start - 1;
lock2->lf_next = lock1->lf_next;
lock1->lf_next = lock2;
return;
}
/*
* Make a new lock consisting of the last part of
* the encompassing lock
*/
splitlock = *sparelock;
*sparelock = NULL;
cv_destroy(&splitlock->lf_cv);
memcpy(splitlock, lock1, sizeof(*splitlock));
cv_init(&splitlock->lf_cv, lockstr);
splitlock->lf_start = lock2->lf_end + 1;
TAILQ_INIT(&splitlock->lf_blkhd);
lock1->lf_end = lock2->lf_start - 1;
/*
* OK, now link it in
*/
splitlock->lf_next = lock1->lf_next;
lock2->lf_next = splitlock;
lock1->lf_next = lock2;
}
/*
* Wakeup a blocklist
*/
static void
lf_wakelock(struct lockf *listhead)
{
struct lockf *wakelock;
	while ((wakelock = TAILQ_FIRST(&listhead->lf_blkhd))) {
		KASSERT(wakelock->lf_next == listhead);
		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
wakelock->lf_next = NULL;
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
lf_print("lf_wakelock: awakening", wakelock);
#endif
cv_broadcast(&wakelock->lf_cv);
}
}
/*
* Remove a byte-range lock on an inode.
*
* Generally, find the lock (or an overlap to that lock)
* and remove it (or shrink it), then wakeup anyone we can.
*/
static int
lf_clearlock(struct lockf *unlock, struct lockf **sparelock)
{
struct lockf **head = unlock->lf_head;
struct lockf *lf = *head;
struct lockf *overlap, **prev;
int ovcase;
if (lf == NULL)
return 0;
#ifdef LOCKF_DEBUG
if (unlock->lf_type != F_UNLCK)
panic("lf_clearlock: bad type");
if (lockf_debug & 1)
lf_print("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
prev = head;
while ((ovcase = lf_findoverlap(lf, unlock, SELF,
&prev, &overlap)) != 0) {
/*
* Wakeup the list of locks to be retried.
*/
lf_wakelock(overlap);
switch (ovcase) {
case 1: /* overlap == lock */
*prev = overlap->lf_next;
lf_free(overlap);
break;
case 2: /* overlap contains lock: split it */
if (overlap->lf_start == unlock->lf_start) {
overlap->lf_start = unlock->lf_end + 1;
break;
}
lf_split(overlap, unlock, sparelock);
overlap->lf_next = unlock->lf_next;
break;
case 3: /* lock contains overlap */
*prev = overlap->lf_next;
lf = overlap->lf_next;
lf_free(overlap);
continue;
case 4: /* overlap starts before lock */
overlap->lf_end = unlock->lf_start - 1;
prev = &overlap->lf_next;
lf = overlap->lf_next;
continue;
case 5: /* overlap ends after lock */
overlap->lf_start = unlock->lf_end + 1;
break;
}
break;
}
#ifdef LOCKF_DEBUG
if (lockf_debug & 1)
lf_printlist("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
return 0;
}
/*
* Walk the list of locks for an inode and
* return the first blocking lock.
*/
static struct lockf *
lf_getblock(struct lockf *lock)
{
struct lockf **prev, *overlap, *lf = *(lock->lf_head);
prev = lock->lf_head;
while (lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != 0) {
/*
* We've found an overlap, see if it blocks us
*/
if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
return overlap;
/*
* Nope, point to the next one on the list and
* see if it blocks us
*/
lf = overlap->lf_next;
}
return NULL;
}
/*
* Set a byte-range lock.
*/
static int
lf_setlock(struct lockf *lock, struct lockf **sparelock,
kmutex_t *interlock)
{
struct lockf *block;
struct lockf **head = lock->lf_head;
struct lockf **prev, *overlap, *ltmp;
int ovcase, needtolink, error;
#ifdef LOCKF_DEBUG
if (lockf_debug & 1)
lf_print("lf_setlock", lock);
#endif /* LOCKF_DEBUG */
/*
* Scan lock list for this file looking for locks that would block us.
*/
while ((block = lf_getblock(lock)) != NULL) {
/*
* Free the structure and return if nonblocking.
*/
if ((lock->lf_flags & F_WAIT) == 0) {
lf_free(lock);
return EAGAIN;
}
/*
* We are blocked. Since flock style locks cover
* the whole file, there is no chance for deadlock.
* For byte-range locks we must check for deadlock.
*
* Deadlock detection is done by looking through the
* wait channels to see if there are any cycles that
* involve us. MAXDEPTH is set just to make sure we
* do not go off into neverneverland.
*/
if ((lock->lf_flags & F_POSIX) &&
(block->lf_flags & F_POSIX)) {
struct lwp *wlwp;
volatile const struct lockf *waitblock;
int i = 0;
struct proc *p;
p = (struct proc *)block->lf_id;
			KASSERT(p != NULL);
			while (i++ < maxlockdepth) {
mutex_enter(p->p_lock);
if (p->p_nlwps > 1) {
mutex_exit(p->p_lock);
break;
}
wlwp = LIST_FIRST(&p->p_lwps);
lwp_lock(wlwp);
if (wlwp->l_wchan == NULL ||
wlwp->l_wmesg != lockstr) {
lwp_unlock(wlwp);
mutex_exit(p->p_lock);
break;
}
waitblock = wlwp->l_wchan;
lwp_unlock(wlwp);
mutex_exit(p->p_lock);
/* Get the owner of the blocking lock */
waitblock = waitblock->lf_next;
if ((waitblock->lf_flags & F_POSIX) == 0)
break;
p = (struct proc *)waitblock->lf_id;
				if (p == curproc) {
					lf_free(lock);
return EDEADLK;
}
}
/*
* If we're still following a dependency chain
* after maxlockdepth iterations, assume we're in
* a cycle to be safe.
*/
if (i >= maxlockdepth) {
lf_free(lock);
return EDEADLK;
}
}
/*
* For flock type locks, we must first remove
* any shared locks that we hold before we sleep
* waiting for an exclusive lock.
*/
if ((lock->lf_flags & F_FLOCK) &&
lock->lf_type == F_WRLCK) {
lock->lf_type = F_UNLCK;
(void) lf_clearlock(lock, NULL);
lock->lf_type = F_WRLCK;
}
/*
* Add our lock to the blocked list and sleep until we're free.
* Remember who blocked us (for deadlock detection).
*/
lock->lf_next = block;
TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
#ifdef LOCKF_DEBUG
if (lockf_debug & 1) {
lf_print("lf_setlock: blocking on", block);
lf_printlist("lf_setlock", block);
}
#endif /* LOCKF_DEBUG */
error = cv_wait_sig(&lock->lf_cv, interlock);
/*
* We may have been awoken by a signal (in
* which case we must remove ourselves from the
* blocked list) and/or by another process
* releasing a lock (in which case we have already
* been removed from the blocked list and our
* lf_next field set to NULL).
*/
		if (lock->lf_next != NULL) {
			TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
lock->lf_next = NULL;
}
if (error) {
lf_free(lock);
return error;
}
}
/*
* No blocks!! Add the lock. Note that we will
* downgrade or upgrade any overlapping locks this
* process already owns.
*
* Skip over locks owned by other processes.
* Handle any locks that overlap and are owned by ourselves.
*/
prev = head;
block = *head;
needtolink = 1;
for (;;) {
ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
if (ovcase)
block = overlap->lf_next;
/*
* Six cases:
* 0) no overlap
* 1) overlap == lock
* 2) overlap contains lock
* 3) lock contains overlap
* 4) overlap starts before lock
* 5) overlap ends after lock
*/
switch (ovcase) {
case 0: /* no overlap */
if (needtolink) {
*prev = lock;
lock->lf_next = overlap;
}
break;
case 1: /* overlap == lock */
/*
* If downgrading lock, others may be
* able to acquire it.
*/
if (lock->lf_type == F_RDLCK &&
overlap->lf_type == F_WRLCK)
lf_wakelock(overlap);
overlap->lf_type = lock->lf_type;
lf_free(lock);
lock = overlap; /* for debug output below */
break;
case 2: /* overlap contains lock */
/*
* Check for common starting point and different types.
*/
if (overlap->lf_type == lock->lf_type) {
lf_free(lock);
lock = overlap; /* for debug output below */
break;
}
if (overlap->lf_start == lock->lf_start) {
*prev = lock;
lock->lf_next = overlap;
overlap->lf_start = lock->lf_end + 1;
} else
lf_split(overlap, lock, sparelock);
lf_wakelock(overlap);
break;
case 3: /* lock contains overlap */
/*
* If downgrading lock, others may be able to
* acquire it, otherwise take the list.
*/
if (lock->lf_type == F_RDLCK &&
overlap->lf_type == F_WRLCK) {
lf_wakelock(overlap);
} else {
			while ((ltmp = TAILQ_FIRST(&overlap->lf_blkhd))) {
				KASSERT(ltmp->lf_next == overlap);
				TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
				    lf_block);
ltmp->lf_next = lock;
TAILQ_INSERT_TAIL(&lock->lf_blkhd,
ltmp, lf_block);
}
}
/*
* Add the new lock if necessary and delete the overlap.
*/
if (needtolink) {
*prev = lock;
lock->lf_next = overlap->lf_next;
prev = &lock->lf_next;
needtolink = 0;
} else
*prev = overlap->lf_next;
lf_free(overlap);
continue;
case 4: /* overlap starts before lock */
/*
* Add lock after overlap on the list.
*/
lock->lf_next = overlap->lf_next;
overlap->lf_next = lock;
overlap->lf_end = lock->lf_start - 1;
prev = &lock->lf_next;
lf_wakelock(overlap);
needtolink = 0;
continue;
case 5: /* overlap ends after lock */
/*
* Add the new lock before overlap.
*/
			if (needtolink) {
				*prev = lock;
lock->lf_next = overlap;
}
overlap->lf_start = lock->lf_end + 1;
lf_wakelock(overlap);
break;
}
break;
}
#ifdef LOCKF_DEBUG
if (lockf_debug & 1) {
lf_print("lf_setlock: got the lock", lock);
lf_printlist("lf_setlock", lock);
}
#endif /* LOCKF_DEBUG */
return 0;
}
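/*
 * The EDEADLK path above can be exercised from userland with two
 * single-threaded processes taking POSIX byte-range locks in opposite
 * order.  A hedged sketch, not part of the kernel: build it as an
 * ordinary program; the file name and the sleep(1) used to sequence the
 * two processes are arbitrary choices of this example.
 */
#if 0
#include <sys/types.h>
#include <sys/wait.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

static int
wrlock(int fd, off_t start, off_t len)
{
	struct flock fl = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = start,
		.l_len = len,
	};

	return fcntl(fd, F_SETLKW, &fl);
}

int
main(void)
{
	int fd;
	pid_t pid;

	fd = open("lockfile", O_RDWR | O_CREAT, 0644);
	if (fd == -1)
		err(1, "open");
	if (ftruncate(fd, 200) == -1)
		err(1, "ftruncate");
	if (wrlock(fd, 0, 100) == -1)		/* parent takes A = [0,99] */
		err(1, "lock A");
	pid = fork();
	if (pid == -1)
		err(1, "fork");
	if (pid == 0) {
		/* child: take B = [100,199], then block waiting for A */
		if (wrlock(fd, 100, 100) == -1)
			err(1, "child lock B");
		(void)wrlock(fd, 0, 100);	/* sleeps until parent exits */
		_exit(0);
	}
	sleep(1);		/* crude: give the child time to block on A */
	if (wrlock(fd, 100, 100) == -1)
		warn("parent lock B");		/* expected: EDEADLK */
	close(fd);		/* releases A, unblocking the child */
	(void)wait(NULL);
	return 0;
}
#endif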
/*
* Check whether there is a blocking lock,
* and if so return its process identifier.
*/
static int
lf_getlock(struct lockf *lock, struct flock *fl)
{
struct lockf *block;
#ifdef LOCKF_DEBUG
if (lockf_debug & 1)
lf_print("lf_getlock", lock);
#endif /* LOCKF_DEBUG */
if ((block = lf_getblock(lock)) != NULL) {
fl->l_type = block->lf_type;
fl->l_whence = SEEK_SET;
fl->l_start = block->lf_start;
if (block->lf_end == -1)
fl->l_len = 0;
else
fl->l_len = block->lf_end - block->lf_start + 1;
if (block->lf_flags & F_POSIX)
fl->l_pid = ((struct proc *)block->lf_id)->p_pid;
else
fl->l_pid = -1;
} else {
fl->l_type = F_UNLCK;
}
return 0;
}
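/*
 * From userland this surfaces through fcntl(F_GETLK): the caller
 * describes the lock it would like to take, and on return l_type is
 * either F_UNLCK or describes one blocking lock, including the holder's
 * pid for POSIX locks (-1 for flock-style locks).  A hedged sketch, not
 * part of the kernel; the function name is ours.
 */
#if 0
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

static void
who_blocks(int fd)
{
	struct flock fl = {
		.l_type = F_WRLCK,	/* "could I write-lock ..." */
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,		/* "... the whole file?" */
	};

	if (fcntl(fd, F_GETLK, &fl) == -1)
		err(1, "F_GETLK");
	if (fl.l_type == F_UNLCK)
		printf("no blocking lock\n");
	else
		printf("blocked by pid %jd: %s lock, start %jd, len %jd\n",
		    (intmax_t)fl.l_pid,
		    fl.l_type == F_WRLCK ? "exclusive" : "shared",
		    (intmax_t)fl.l_start, (intmax_t)fl.l_len);
}
#endif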
/*
* Do an advisory lock operation.
*/
int
lf_advlock(struct vop_advlock_args *ap, struct lockf **head, off_t size)
{
struct flock *fl = ap->a_fl;
struct lockf *lock = NULL;
struct lockf *sparelock;
kmutex_t *interlock = &lockf_lock;
off_t start, end;
int error = 0;
KASSERTMSG(size >= 0, "size=%jd", (intmax_t)size);
/*
* Convert the flock structure into a start and end.
*/
switch (fl->l_whence) {
case SEEK_SET:
case SEEK_CUR:
/*
* Caller is responsible for adding any necessary offset
* when SEEK_CUR is used.
*/
start = fl->l_start;
break;
case SEEK_END:
if (fl->l_start > __type_max(off_t) - size)
return EINVAL;
start = size + fl->l_start;
break;
default:
return EINVAL;
}
if (fl->l_len == 0)
end = -1;
else {
if (fl->l_len >= 0) {
if (start >= 0 &&
fl->l_len - 1 > __type_max(off_t) - start)
return EINVAL;
end = start + (fl->l_len - 1);
} else {
/* lockf() allows -ve lengths */
if (start < 0)
return EINVAL;
end = start - 1;
start += fl->l_len;
}
}
if (start < 0)
return EINVAL;
/*
* Allocate locks before acquiring the interlock. We need two
* locks in the worst case.
*/
switch (ap->a_op) {
case F_SETLK:
case F_UNLCK:
/*
* XXX For F_UNLCK case, we can re-use the lock.
*/
if ((ap->a_flags & F_FLOCK) == 0) {
/*
* Byte-range lock might need one more lock.
*/
sparelock = lf_alloc(0);
if (sparelock == NULL) {
error = ENOMEM;
goto quit;
}
break;
}
/* FALLTHROUGH */
case F_GETLK:
sparelock = NULL;
break;
default:
return EINVAL;
}
switch (ap->a_op) {
case F_SETLK:
lock = lf_alloc(1);
break;
case F_UNLCK:
if (start == 0 || end == -1) {
/* never split */
lock = lf_alloc(0);
} else {
/* might split */
lock = lf_alloc(2);
}
break;
case F_GETLK:
lock = lf_alloc(0);
break;
}
if (lock == NULL) {
error = ENOMEM;
goto quit;
}
mutex_enter(interlock);
/*
* Avoid the common case of unlocking when inode has no locks.
*/
	if (*head == (struct lockf *)0) {
		if (ap->a_op != F_SETLK) {
			fl->l_type = F_UNLCK;
error = 0;
goto quit_unlock;
}
}
/*
* Create the lockf structure.
*/
lock->lf_start = start;
lock->lf_end = end;
lock->lf_head = head;
lock->lf_type = fl->l_type;
lock->lf_next = (struct lockf *)0;
TAILQ_INIT(&lock->lf_blkhd);
lock->lf_flags = ap->a_flags;
	if (lock->lf_flags & F_POSIX) {
		KASSERT(curproc == (struct proc *)ap->a_id);
}
lock->lf_id = ap->a_id;
/*
* Do the requested operation.
*/
switch (ap->a_op) {
case F_SETLK:
error = lf_setlock(lock, &sparelock, interlock);
lock = NULL; /* lf_setlock freed it */
break;
case F_UNLCK:
error = lf_clearlock(lock, &sparelock);
break;
case F_GETLK:
error = lf_getlock(lock, fl);
break;
default:
break;
/* NOTREACHED */
}
quit_unlock:
mutex_exit(interlock);
quit:
if (lock)
lf_free(lock);
	if (sparelock)
		lf_free(sparelock);
return error;
}
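/*
 * The conversion above from struct flock to an inclusive [start,end]
 * range (end == -1 meaning "through EOF") is summarized by this
 * userland-style sketch.  It mirrors the checks above but is not part of
 * the kernel; OFF_MAX stands in for __type_max(off_t) and assumes a
 * 64-bit off_t, as on NetBSD.
 */
#if 0
#include <sys/types.h>
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

#define OFF_MAX	INT64_MAX	/* assumption: off_t is 64-bit */

static int
flock_to_range(const struct flock *fl, off_t size, off_t *startp, off_t *endp)
{
	off_t start, end;

	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:		/* caller already folded in the file offset */
		start = fl->l_start;
		break;
	case SEEK_END:
		if (fl->l_start > OFF_MAX - size)
			return EINVAL;
		start = size + fl->l_start;
		break;
	default:
		return EINVAL;
	}
	if (fl->l_len == 0)
		end = -1;			/* lock through EOF */
	else if (fl->l_len > 0) {
		if (start >= 0 && fl->l_len - 1 > OFF_MAX - start)
			return EINVAL;
		end = start + (fl->l_len - 1);
	} else {
		/* negative length: the range ends just before l_start */
		if (start < 0)
			return EINVAL;
		end = start - 1;
		start += fl->l_len;
	}
	if (start < 0)
		return EINVAL;
	*startp = start;
	*endp = end;
	return 0;
}
#endif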
/*
* Initialize subsystem. XXX We use a global lock. This could be the
* vnode interlock, but the deadlock detection code may need to inspect
* locks belonging to other files.
*/
void
lf_init(void)
{
mutex_init(&lockf_lock, MUTEX_DEFAULT, IPL_NONE);
}
/* $NetBSD: spec_vnops.c,v 1.218 2023/04/22 15:32:49 riastradh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)spec_vnops.c 8.15 (Berkeley) 7/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.218 2023/04/22 15:32:49 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#endif
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode_impl.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <sys/file.h>
#include <sys/disklabel.h>
#include <sys/disk.h>
#include <sys/lockf.h>
#include <sys/tty.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/module.h>
#include <sys/atomic.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
/*
* Lock order:
*
* vnode lock
* -> device_lock
* -> struct vnode::v_interlock
*/
/* symbolic sleep message strings for devices */
const char devopn[] = "devopn";
const char devio[] = "devio";
const char devwait[] = "devwait";
const char devin[] = "devin";
const char devout[] = "devout";
const char devioc[] = "devioc";
const char devcls[] = "devcls";
#define SPECHSZ 64
#if ((SPECHSZ&(SPECHSZ-1)) == 0)
#define SPECHASH(rdev) (((rdev>>5)+(rdev))&(SPECHSZ-1))
#else
#define SPECHASH(rdev) (((unsigned)((rdev>>5)+(rdev)))%SPECHSZ)
#endif
static vnode_t *specfs_hash[SPECHSZ];
extern struct mount *dead_rootmount;
/*
* This vnode operations vector is used for special device nodes
* created from whole cloth by the kernel. For the ops vector for
* vnodes built from special devices found in a filesystem, see (e.g)
* ffs_specop_entries[] in ffs_vnops.c or the equivalent for other
* filesystems.
*/
int (**spec_vnodeop_p)(void *);
const struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
{ &vop_default_desc, vn_default_error },
{ &vop_parsepath_desc, genfs_parsepath }, /* parsepath */
{ &vop_lookup_desc, spec_lookup }, /* lookup */
{ &vop_create_desc, genfs_badop }, /* create */
{ &vop_mknod_desc, genfs_badop }, /* mknod */
{ &vop_open_desc, spec_open }, /* open */
{ &vop_close_desc, spec_close }, /* close */
{ &vop_access_desc, genfs_ebadf }, /* access */
{ &vop_accessx_desc, genfs_ebadf }, /* accessx */
{ &vop_getattr_desc, genfs_ebadf }, /* getattr */
{ &vop_setattr_desc, genfs_ebadf }, /* setattr */
{ &vop_read_desc, spec_read }, /* read */
{ &vop_write_desc, spec_write }, /* write */
{ &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */
{ &vop_fdiscard_desc, spec_fdiscard }, /* fdiscard */
{ &vop_fcntl_desc, genfs_fcntl }, /* fcntl */
{ &vop_ioctl_desc, spec_ioctl }, /* ioctl */
{ &vop_poll_desc, spec_poll }, /* poll */
{ &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */
{ &vop_revoke_desc, genfs_revoke }, /* revoke */
{ &vop_mmap_desc, spec_mmap }, /* mmap */
{ &vop_fsync_desc, spec_fsync }, /* fsync */
{ &vop_seek_desc, spec_seek }, /* seek */
{ &vop_remove_desc, genfs_badop }, /* remove */
{ &vop_link_desc, genfs_badop }, /* link */
{ &vop_rename_desc, genfs_badop }, /* rename */
{ &vop_mkdir_desc, genfs_badop }, /* mkdir */
{ &vop_rmdir_desc, genfs_badop }, /* rmdir */
{ &vop_symlink_desc, genfs_badop }, /* symlink */
{ &vop_readdir_desc, genfs_badop }, /* readdir */
{ &vop_readlink_desc, genfs_badop }, /* readlink */
{ &vop_abortop_desc, genfs_badop }, /* abortop */
{ &vop_inactive_desc, spec_inactive }, /* inactive */
{ &vop_reclaim_desc, spec_reclaim }, /* reclaim */
{ &vop_lock_desc, genfs_lock }, /* lock */
{ &vop_unlock_desc, genfs_unlock }, /* unlock */
{ &vop_bmap_desc, spec_bmap }, /* bmap */
{ &vop_strategy_desc, spec_strategy }, /* strategy */
{ &vop_print_desc, spec_print }, /* print */
{ &vop_islocked_desc, genfs_islocked }, /* islocked */
{ &vop_pathconf_desc, spec_pathconf }, /* pathconf */
{ &vop_advlock_desc, spec_advlock }, /* advlock */
{ &vop_bwrite_desc, vn_bwrite }, /* bwrite */
{ &vop_getpages_desc, genfs_getpages }, /* getpages */
{ &vop_putpages_desc, genfs_putpages }, /* putpages */
{ NULL, NULL }
};
const struct vnodeopv_desc spec_vnodeop_opv_desc =
{ &spec_vnodeop_p, spec_vnodeop_entries };
static kauth_listener_t rawio_listener;
static struct kcondvar specfs_iocv;
/*
* Returns true if vnode is /dev/mem or /dev/kmem.
*/
bool
iskmemvp(struct vnode *vp)
{
	return ((vp->v_type == VCHR) && iskmemdev(vp->v_rdev));
}
/*
* Returns true if dev is /dev/mem or /dev/kmem.
*/
int
iskmemdev(dev_t dev)
{
/* mem_no is emitted by config(8) to generated devsw.c */
extern const int mem_no;
/* minor 14 is /dev/io on i386 with COMPAT_10 */
return (major(dev) == mem_no && (minor(dev) < 2 || minor(dev) == 14));
}
static int
rawio_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
result = KAUTH_RESULT_DEFER;
if ((action != KAUTH_DEVICE_RAWIO_SPEC) &&
(action != KAUTH_DEVICE_RAWIO_PASSTHRU))
return result;
/* Access is mandated by permissions. */
result = KAUTH_RESULT_ALLOW;
return result;
}
void
spec_init(void)
{
rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
rawio_listener_cb, NULL);
cv_init(&specfs_iocv, "specio");
}
/*
* spec_io_enter(vp, &sn, &dev)
*
* Enter an operation that may not hold vp's vnode lock or an
* fstrans on vp's mount. Until spec_io_exit, the vnode will not
* be revoked.
*
* On success, set sn to the specnode pointer and dev to the dev_t
* number and return zero. Caller must later call spec_io_exit
* when done.
*
* On failure, return ENXIO -- the device has been revoked and no
* longer exists.
*/
static int
spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp)
{
dev_t dev;
struct specnode *sn;
unsigned iocnt;
int error = 0;
mutex_enter(vp->v_interlock);
/*
* Extract all the info we need from the vnode, unless the
* vnode has already been reclaimed. This can happen if the
* underlying device has been removed and all the device nodes
* for it have been revoked. The caller may not hold a vnode
* lock or fstrans to prevent this from happening before it has
* had an opportunity to notice the vnode is dead.
*/
	if (vdead_check(vp, VDEAD_NOWAIT) != 0 ||
	    (sn = vp->v_specnode) == NULL ||
(dev = vp->v_rdev) == NODEV) {
error = ENXIO;
goto out;
}
/*
* Notify spec_close that we are doing an I/O operation which
	 * may not be bracketed by fstrans(9) and thus is not
* blocked by vfs suspension.
*
* We could hold this reference with psref(9) instead, but we
* already have to take the interlock for vdead_check, so
* there's not much more cost here to another atomic operation.
*/
do {
		iocnt = atomic_load_relaxed(&sn->sn_dev->sd_iocnt);
		if (__predict_false(iocnt == UINT_MAX)) {
/*
* The I/O count is limited by the number of
* LWPs (which will never overflow this) --
* unless one driver uses another driver via
* specfs, which is rather unusual, but which
* could happen via pud(4) userspace drivers.
* We could use a 64-bit count, but can't use
* atomics for that on all platforms.
* (Probably better to switch to psref or
* localcount instead.)
*/
error = EBUSY;
goto out;
}
} while (atomic_cas_uint(&sn->sn_dev->sd_iocnt, iocnt, iocnt + 1)
!= iocnt);
/* Success! */
*snp = sn;
*devp = dev;
error = 0;
out: mutex_exit(vp->v_interlock);
return error;
}
/*
* spec_io_exit(vp, sn)
*
* Exit an operation entered with a successful spec_io_enter --
* allow concurrent spec_node_revoke to proceed. The argument sn
 * must match the struct specnode pointer returned by spec_io_enter
* for vp.
*/
static void
spec_io_exit(struct vnode *vp, struct specnode *sn)
{
struct specdev *sd = sn->sn_dev;
unsigned iocnt;
KASSERT(vp->v_specnode == sn);
/*
* We are done. Notify spec_close if appropriate. The
* transition of 1 -> 0 must happen under device_lock so
* spec_close doesn't miss a wakeup.
*/
do {
		iocnt = atomic_load_relaxed(&sd->sd_iocnt);
		KASSERT(iocnt > 0);
if (iocnt == 1) {
mutex_enter(&device_lock);
			if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0)
				cv_broadcast(&specfs_iocv);
mutex_exit(&device_lock);
break;
}
} while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt);
}
/*
* spec_io_drain(sd)
*
* Wait for all existing spec_io_enter/exit sections to complete.
* Caller must ensure spec_io_enter will fail at this point.
*/
static void
spec_io_drain(struct specdev *sd)
{
/*
* I/O at the same time as closing is unlikely -- it often
* indicates an application bug.
*/
if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0))
return;
mutex_enter(&device_lock);
	while (atomic_load_relaxed(&sd->sd_iocnt) > 0)
		cv_wait(&specfs_iocv, &device_lock);
	mutex_exit(&device_lock);
}
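/*
 * Sketch of the usage pattern, not part of the original source: any
 * devsw operation that runs without the vnode lock brackets itself with
 * spec_io_enter/spec_io_exit, exactly as spec_ioctl, spec_poll,
 * spec_kqfilter, spec_mmap and spec_strategy do below.  The function
 * name here is hypothetical and the block is not compiled.
 */
#if 0
static int
example_devsw_op(struct vnode *vp)
{
	struct specnode *sn;
	dev_t dev;
	int error;

	error = spec_io_enter(vp, &sn, &dev);
	if (error)
		return error;		/* device was revoked */
	/* ... issue cdev_*()/bdev_*() calls on dev here ... */
	error = 0;
	spec_io_exit(vp, sn);
	return error;
}
#endif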
/*
* Initialize a vnode that represents a device.
*/
void
spec_node_init(vnode_t *vp, dev_t rdev)
{
specnode_t *sn;
specdev_t *sd;
vnode_t *vp2;
vnode_t **vpp;
	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
	KASSERT(vp->v_specnode == NULL);
/*
* Search the hash table for this device. If known, add a
* reference to the device structure. If not known, create
* a new entry to represent the device. In all cases add
* the vnode to the hash table.
*/
sn = kmem_alloc(sizeof(*sn), KM_SLEEP);
sd = kmem_alloc(sizeof(*sd), KM_SLEEP);
mutex_enter(&device_lock);
vpp = &specfs_hash[SPECHASH(rdev)];
	for (vp2 = *vpp; vp2 != NULL; vp2 = vp2->v_specnext) {
		KASSERT(vp2->v_specnode != NULL);
		if (rdev == vp2->v_rdev && vp->v_type == vp2->v_type) {
break;
}
}
if (vp2 == NULL) {
/* No existing record, create a new one. */
sd->sd_mountpoint = NULL;
sd->sd_lockf = NULL;
sd->sd_refcnt = 1;
sd->sd_opencnt = 0;
sd->sd_bdevvp = NULL;
sd->sd_iocnt = 0;
sd->sd_opened = false;
sd->sd_closing = false;
sn->sn_dev = sd;
sd = NULL;
} else {
/* Use the existing record. */
sn->sn_dev = vp2->v_specnode->sn_dev;
sn->sn_dev->sd_refcnt++;
}
/* Insert vnode into the hash chain. */
sn->sn_opencnt = 0;
sn->sn_rdev = rdev;
sn->sn_gone = false;
vp->v_specnode = sn;
vp->v_specnext = *vpp;
*vpp = vp;
mutex_exit(&device_lock);
/* Free the record we allocated if unused. */
	if (sd != NULL) {
		kmem_free(sd, sizeof(*sd));
}
}
/*
* Lookup a vnode by device number and return it referenced.
*/
int
spec_node_lookup_by_dev(enum vtype type, dev_t dev, int flags, vnode_t **vpp)
{
int error;
vnode_t *vp;
top: mutex_enter(&device_lock);
	for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (type == vp->v_type && dev == vp->v_rdev) {
mutex_enter(vp->v_interlock);
/* If clean or being cleaned, then ignore it. */
if (vdead_check(vp, VDEAD_NOWAIT) == 0)
break;
if ((flags & VDEAD_NOWAIT) == 0) {
mutex_exit(&device_lock);
/*
* It may be being revoked as we speak,
* and the caller wants to wait until
* all revocation has completed. Let
* vcache_vget wait for it to finish
* dying; as a side effect, vcache_vget
* releases vp->v_interlock. Note that
* vcache_vget cannot succeed at this
* point because vdead_check already
* failed.
*/
error = vcache_vget(vp);
KASSERT(error);
goto top;
}
mutex_exit(vp->v_interlock);
}
}
KASSERT(vp == NULL || mutex_owned(vp->v_interlock));
if (vp == NULL) {
mutex_exit(&device_lock);
return ENOENT;
}
/*
* If it is an opened block device return the opened vnode.
*/
	if (type == VBLK && vp->v_specnode->sn_dev->sd_bdevvp != NULL) {
		mutex_exit(vp->v_interlock);
vp = vp->v_specnode->sn_dev->sd_bdevvp;
mutex_enter(vp->v_interlock);
}
mutex_exit(&device_lock);
error = vcache_vget(vp);
if (error)
return error;
*vpp = vp;
return 0;
}
/*
* Lookup a vnode by file system mounted on and return it referenced.
*/
int
spec_node_lookup_by_mount(struct mount *mp, vnode_t **vpp)
{
int i, error;
vnode_t *vp, *vq;
mutex_enter(&device_lock);
for (i = 0, vq = NULL; i < SPECHSZ && vq == NULL; i++) {
for (vp = specfs_hash[i]; vp; vp = vp->v_specnext) {
if (vp->v_type != VBLK)
continue;
vq = vp->v_specnode->sn_dev->sd_bdevvp;
if (vq != NULL &&
vq->v_specnode->sn_dev->sd_mountpoint == mp)
break;
vq = NULL;
}
}
if (vq == NULL) {
mutex_exit(&device_lock);
return ENOENT;
}
mutex_enter(vq->v_interlock);
mutex_exit(&device_lock);
error = vcache_vget(vq);
if (error)
return error;
*vpp = vq;
return 0;
}
/*
* Get the file system mounted on this block device.
*
* XXX Caller should hold the vnode lock -- shared or exclusive -- so
 * that this can't be changed, and the vnode can't be revoked while we
* examine it. But not all callers do, and they're scattered through a
* lot of file systems, so we can't assert this yet.
*/
struct mount *
spec_node_getmountedfs(vnode_t *devvp)
{
struct mount *mp;
KASSERT(devvp->v_type == VBLK);
mp = devvp->v_specnode->sn_dev->sd_mountpoint;
return mp;
}
/*
* Set the file system mounted on this block device.
*
* XXX Caller should hold the vnode lock exclusively so this can't be
* changed or assumed by spec_node_getmountedfs while we change it, and
* the vnode can't be revoked while we handle it. But not all callers
* do, and they're scattered through a lot of file systems, so we can't
* assert this yet. Instead, for now, we'll take an I/O reference so
* at least the ioctl doesn't race with revoke/detach.
*
* If you do change this to assert an exclusive vnode lock, you must
* also do vdead_check before trying bdev_ioctl, because the vnode may
* have been revoked by the time the caller locked it, and this is
* _not_ a vop -- calls to spec_node_setmountedfs don't go through
* v_op, so revoking the vnode doesn't prevent further calls.
*
* XXX Caller should additionally have the vnode open, at least if mp
* is nonnull, but I'm not sure all callers do that -- need to audit.
* Currently udf closes the vnode before clearing the mount.
*/
void
spec_node_setmountedfs(vnode_t *devvp, struct mount *mp)
{
struct dkwedge_info dkw;
struct specnode *sn;
dev_t dev;
int error;
KASSERT(devvp->v_type == VBLK);
error = spec_io_enter(devvp, &sn, &dev);
if (error)
return;
KASSERT(sn->sn_dev->sd_mountpoint == NULL || mp == NULL);
sn->sn_dev->sd_mountpoint = mp;
if (mp == NULL)
goto out;
error = bdev_ioctl(dev, DIOCGWEDGEINFO, &dkw, FREAD, curlwp);
if (error)
goto out;
strlcpy(mp->mnt_stat.f_mntfromlabel, dkw.dkw_wname,
sizeof(mp->mnt_stat.f_mntfromlabel));
out: spec_io_exit(devvp, sn);
}
/*
* A vnode representing a special device is going away. Close
* the device if the vnode holds it open.
*/
void
spec_node_revoke(vnode_t *vp)
{
specnode_t *sn;
specdev_t *sd;
struct vnode **vpp;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
sn = vp->v_specnode;
sd = sn->sn_dev;
	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
	KASSERT(vp->v_specnode != NULL);
	KASSERT(sn->sn_gone == false);
mutex_enter(&device_lock);
KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
sn->sn_gone = true;
if (sn->sn_opencnt != 0) {
sd->sd_opencnt -= (sn->sn_opencnt - 1);
sn->sn_opencnt = 1;
mutex_exit(&device_lock);
VOP_CLOSE(vp, FNONBLOCK, NOCRED);
mutex_enter(&device_lock);
KASSERT(sn->sn_opencnt == 0);
}
/*
* We may have revoked the vnode in this thread while another
* thread was in the middle of spec_close, in the window when
* spec_close releases the vnode lock to call .d_close for the
* last close. In that case, wait for the concurrent
* spec_close to complete.
*/
while (sd->sd_closing)
cv_wait(&specfs_iocv, &device_lock);
/*
* Remove from the hash so lookups stop returning this
* specnode. We will dissociate it from the specdev -- and
* possibly free the specdev -- in spec_node_destroy.
*/
	KASSERT(sn->sn_gone);
	KASSERT(sn->sn_opencnt == 0);
for (vpp = &specfs_hash[SPECHASH(vp->v_rdev)];;
	    vpp = &(*vpp)->v_specnext) {
		if (*vpp == vp) {
*vpp = vp->v_specnext;
vp->v_specnext = NULL;
break;
}
}
mutex_exit(&device_lock);
}
/*
* A vnode representing a special device is being recycled.
* Destroy the specfs component.
*/
void
spec_node_destroy(vnode_t *vp)
{
specnode_t *sn;
specdev_t *sd;
int refcnt;
sn = vp->v_specnode;
sd = sn->sn_dev;
	KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
	KASSERT(vp->v_specnode != NULL);
	KASSERT(sn->sn_opencnt == 0);
mutex_enter(&device_lock);
sn = vp->v_specnode;
vp->v_specnode = NULL;
refcnt = sd->sd_refcnt--;
KASSERT(refcnt > 0);
mutex_exit(&device_lock);
/* If the device is no longer in use, destroy our record. */
	if (refcnt == 1) {
		KASSERT(sd->sd_iocnt == 0);
		KASSERT(sd->sd_opencnt == 0);
		KASSERT(sd->sd_bdevvp == NULL);
kmem_free(sd, sizeof(*sd));
}
kmem_free(sn, sizeof(*sn));
}
/*
* Trivial lookup routine that always fails.
*/
int
spec_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap = v;
*ap->a_vpp = NULL;
return ENOTDIR;
}
typedef int (*spec_ioctl_t)(dev_t, u_long, void *, int, struct lwp *);
/*
* Open a special file.
*/
/* ARGSUSED */
int
spec_open(void *v)
{
struct vop_open_args /* {
struct vnode *a_vp;
int a_mode;
kauth_cred_t a_cred;
} */ *ap = v;
struct lwp *l = curlwp;
struct vnode *vp = ap->a_vp;
dev_t dev, dev1;
int error;
enum kauth_device_req req;
specnode_t *sn, *sn1;
specdev_t *sd;
spec_ioctl_t ioctl;
u_int gen = 0;
const char *name = NULL;
bool needclose = false;
struct partinfo pi;
	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
	KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d",
vp->v_type);
dev = vp->v_rdev;
sn = vp->v_specnode;
sd = sn->sn_dev;
/*
* Don't allow open if fs is mounted -nodev.
*/
if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
return ENXIO;
switch (ap->a_mode & (FREAD | FWRITE)) {
case FREAD | FWRITE:
req = KAUTH_REQ_DEVICE_RAWIO_SPEC_RW;
break;
case FWRITE:
req = KAUTH_REQ_DEVICE_RAWIO_SPEC_WRITE;
break;
default:
req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ;
break;
}
error = kauth_authorize_device_spec(ap->a_cred, req, vp);
if (error)
return error;
/*
* Acquire an open reference -- as long as we hold onto it, and
* the vnode isn't revoked, it can't be closed, and the vnode
* can't be revoked until we release the vnode lock.
*/
mutex_enter(&device_lock);
KASSERT(!sn->sn_gone);
switch (vp->v_type) {
case VCHR:
/*
* Character devices can accept opens from multiple
* vnodes. But first, wait for any close to finish.
* Wait under the vnode lock so we don't have to worry
* about the vnode being revoked while we wait.
*/
while (sd->sd_closing) {
error = cv_wait_sig(&specfs_iocv, &device_lock);
if (error)
break;
}
if (error)
break;
sd->sd_opencnt++;
sn->sn_opencnt++;
KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
break;
case VBLK:
/*
* For block devices, permit only one open. The buffer
* cache cannot remain self-consistent with multiple
* vnodes holding a block device open.
*
* Treat zero opencnt with non-NULL mountpoint as open.
* This may happen after forced detach of a mounted device.
*
* Also treat sd_closing, meaning there is a concurrent
* close in progress, as still open.
*/
if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL ||
sd->sd_closing) {
error = EBUSY;
break;
}
KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u",
sn->sn_opencnt);
sn->sn_opencnt = 1;
sd->sd_opencnt = 1;
sd->sd_bdevvp = vp;
break;
default:
panic("invalid specfs vnode type: %d", vp->v_type);
}
mutex_exit(&device_lock);
if (error)
return error;
/*
* Set VV_ISTTY if this is a tty cdev.
*
* XXX This does the wrong thing if the module has to be
* autoloaded. We should maybe set this after autoloading
* modules and calling .d_open successfully, except (a) we need
* the vnode lock to touch it, and (b) once we acquire the
* vnode lock again, the vnode may have been revoked, and
* deadfs's dead_read needs VV_ISTTY to be already set in order
* to return the right answer. So this needs some additional
* synchronization to be made to work correctly with tty driver
* module autoload. For now, let's just hope it doesn't cause
* too much trouble for a tty from an autoloaded driver module
* to fail with EIO instead of returning EOF.
*/
	if (vp->v_type == VCHR) {
		if (cdev_type(dev) == D_TTY)
			vp->v_vflag |= VV_ISTTY;
}
/*
* Because opening the device may block indefinitely, e.g. when
* opening a tty, and loading a module may cross into many
* other subsystems, we must not hold the vnode lock while
* calling .d_open, so release it now and reacquire it when
* done.
*
* Take an I/O reference so that any concurrent spec_close via
* spec_node_revoke will wait for us to finish calling .d_open.
* The vnode can't be dead at this point because we have it
* locked. Note that if revoked, the driver must interrupt
* .d_open before spec_close starts waiting for I/O to drain so
* this doesn't deadlock.
*/
VOP_UNLOCK(vp);
error = spec_io_enter(vp, &sn1, &dev1);
if (error) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return error;
}
	KASSERT(sn1 == sn);
	KASSERT(dev1 == dev);
/*
* Open the device. If .d_open returns ENXIO (device not
* configured), the driver may not be loaded, so try
* autoloading a module and then try .d_open again if anything
* got loaded.
*/
switch (vp->v_type) {
case VCHR:
do {
const struct cdevsw *cdev;
gen = module_gen;
error = cdev_open(dev, ap->a_mode, S_IFCHR, l);
if (error != ENXIO)
break;
/* Check if we already have a valid driver */
mutex_enter(&device_lock);
cdev = cdevsw_lookup(dev);
mutex_exit(&device_lock);
if (cdev != NULL)
break;
/* Get device name from devsw_conv array */
if ((name = cdevsw_getname(major(dev))) == NULL)
break;
/* Try to autoload device module */
(void)module_autoload(name, MODULE_CLASS_DRIVER);
} while (gen != module_gen);
break;
case VBLK:
do {
const struct bdevsw *bdev;
gen = module_gen;
error = bdev_open(dev, ap->a_mode, S_IFBLK, l);
if (error != ENXIO)
break;
/* Check if we already have a valid driver */
mutex_enter(&device_lock);
bdev = bdevsw_lookup(dev);
mutex_exit(&device_lock);
if (bdev != NULL)
break;
/* Get device name from devsw_conv array */
if ((name = bdevsw_getname(major(dev))) == NULL)
break;
/* Try to autoload device module */
(void)module_autoload(name, MODULE_CLASS_DRIVER);
} while (gen != module_gen);
break;
default:
__unreachable();
}
/*
* Release the I/O reference now that we have called .d_open,
* and reacquire the vnode lock. At this point, the device may
* have been revoked, so we must tread carefully. However, sn
* and sd remain valid pointers until we drop our reference.
*/
spec_io_exit(vp, sn);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(vp->v_specnode == sn);
/*
* If it has been revoked since we released the vnode lock and
* reacquired it, then spec_node_revoke has closed it, and we
* must fail with EBADF.
*
* Otherwise, if opening it failed, back out and release the
* open reference. If it was ever successfully opened and we
* got the last reference this way, it's now our job to close
* it. This might happen in the following scenario:
*
* Thread 1 Thread 2
* VOP_OPEN
* ...
* .d_open -> 0 (success)
* acquire vnode lock
* do stuff VOP_OPEN
* release vnode lock ...
* .d_open -> EBUSY
* VOP_CLOSE
* acquire vnode lock
* --sd_opencnt != 0
* => no .d_close
* release vnode lock
* acquire vnode lock
* --sd_opencnt == 0
*
* We can't resolve this by making spec_close wait for .d_open
* to complete before examining sd_opencnt, because .d_open can
* hang indefinitely, e.g. for a tty.
*/
mutex_enter(&device_lock);
if (sn->sn_gone) {
if (error == 0)
error = EBADF;
} else if (error == 0) {
/*
* Device has not been revoked, so our opencnt can't
* have gone away at this point -- transition to
* sn_gone=true happens before transition to
* sn_opencnt=0 in spec_node_revoke.
*/
		KASSERT(sd->sd_opencnt);
		KASSERT(sn->sn_opencnt);
		KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
KASSERT(!sd->sd_closing);
sd->sd_opened = true;
} else if (sd->sd_opencnt == 1 && sd->sd_opened) {
/*
* We're the last reference to a _previous_ open even
* though this one failed, so we have to close it.
* Don't decrement the reference count here --
* spec_close will do that.
*/
KASSERT(sn->sn_opencnt == 1);
needclose = true;
} else {
		KASSERT(sd->sd_opencnt);
		KASSERT(sn->sn_opencnt);
		KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
sd->sd_opencnt--;
sn->sn_opencnt--;
		if (vp->v_type == VBLK)
			sd->sd_bdevvp = NULL;
}
mutex_exit(&device_lock);
/*
* If this open failed, but the device was previously opened,
* and another thread concurrently closed the vnode while we
* were in the middle of reopening it, the other thread will
* see sd_opencnt > 0 and thus decide not to call .d_close --
* it is now our responsibility to do so.
*
* XXX The flags passed to VOP_CLOSE here are wrong, but
* drivers can't rely on FREAD|FWRITE anyway -- e.g., consider
* a device opened by thread 0 with O_READ, then opened by
* thread 1 with O_WRITE, then closed by thread 0, and finally
* closed by thread 1; the last .d_close call will have FWRITE
* but not FREAD. We should just eliminate the FREAD/FWRITE
* parameter to .d_close altogether.
*/
if (needclose) {
KASSERT(error);
VOP_CLOSE(vp, FNONBLOCK, NOCRED);
}
/* If anything went wrong, we're done. */
if (error)
return error;
/*
* For disk devices, automagically set the vnode size to the
* partition size, if we can. This applies to block devices
* and character devices alike -- every block device must have
* a corresponding character device. And if the module is
* loaded it will remain loaded until we're done here (it is
* forbidden to devsw_detach until closed). So it is safe to
* query cdev_type unconditionally here.
*/
if (cdev_type(dev) == D_DISK) {
ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl;
if ((*ioctl)(dev, DIOCGPARTINFO, &pi, FREAD, curlwp) == 0)
uvm_vnp_setsize(vp,
(voff_t)pi.pi_secsize * pi.pi_size);
}
/* Success! */
return 0;
}
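/*
 * The VBLK branch above is visible from userland: while a block device
 * node is open (or holds a mounted file system), a second open fails
 * with EBUSY.  A hedged sketch, not part of the kernel; the device path
 * is only an example and opening it usually requires root.
 */
#if 0
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd1, fd2;

	fd1 = open("/dev/wd0d", O_RDONLY);	/* example block device */
	if (fd1 == -1)
		err(1, "first open");
	fd2 = open("/dev/wd0d", O_RDONLY);
	if (fd2 == -1 && errno == EBUSY)
		printf("second open refused: block devices are exclusive\n");
	else if (fd2 != -1)
		close(fd2);
	close(fd1);
	return 0;
}
#endif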
/*
* Vnode op for read
*/
/* ARGSUSED */
int
spec_read(void *v)
{
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *l = curlwp;
struct specnode *sn;
dev_t dev;
struct buf *bp;
daddr_t bn;
int bsize, bscale;
struct partinfo pi;
int n, on;
int error = 0;
int i, nra;
daddr_t lastbn, *rablks;
int *rasizes;
int nrablks, ratogo;
	KASSERT(uio->uio_rw == UIO_READ);
	KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ||
uio->uio_vmspace == curproc->p_vmspace),
"vmspace belongs to neither kernel nor curproc");
if (uio->uio_resid == 0)
return 0;
switch (vp->v_type) {
case VCHR:
/*
* Release the lock while we sleep -- possibly
* indefinitely, if this is, e.g., a tty -- in
* cdev_read, so we don't hold up everything else that
* might want access to the vnode.
*
* But before we issue the read, take an I/O reference
* to the specnode so close will know when we're done
* reading. Note that the moment we release the lock,
* the vnode's identity may change; hence spec_io_enter
* may fail, and the caller may have a dead vnode on
* their hands, if the file system on which vp lived
* has been unmounted.
*/
VOP_UNLOCK(vp);
error = spec_io_enter(vp, &sn, &dev);
if (error)
goto out;
error = cdev_read(dev, uio, ap->a_ioflag);
spec_io_exit(vp, sn);
out: /* XXX What if the caller held an exclusive lock? */
vn_lock(vp, LK_SHARED | LK_RETRY);
return error;
case VBLK:
		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
		if (uio->uio_offset < 0)
return EINVAL;
		if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0)
			bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE);
else
bsize = BLKDEV_IOSIZE;
bscale = bsize >> DEV_BSHIFT;
nra = uimax(16 * MAXPHYS / bsize - 1, 511);
rablks = kmem_alloc(nra * sizeof(*rablks), KM_SLEEP);
rasizes = kmem_alloc(nra * sizeof(*rasizes), KM_SLEEP);
lastbn = ((uio->uio_offset + uio->uio_resid - 1) >> DEV_BSHIFT)
&~ (bscale - 1);
nrablks = ratogo = 0;
do {
bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1);
on = uio->uio_offset % bsize;
n = uimin((unsigned)(bsize - on), uio->uio_resid);
if (ratogo == 0) {
nrablks = uimin((lastbn - bn) / bscale, nra);
ratogo = nrablks;
for (i = 0; i < nrablks; ++i) {
rablks[i] = bn + (i+1) * bscale;
rasizes[i] = bsize;
}
error = breadn(vp, bn, bsize,
rablks, rasizes, nrablks,
0, &bp);
} else {
if (ratogo > 0)
--ratogo;
error = bread(vp, bn, bsize, 0, &bp);
}
if (error)
break;
n = uimin(n, bsize - bp->b_resid);
error = uiomove((char *)bp->b_data + on, n, uio);
brelse(bp, 0);
} while (error == 0 && uio->uio_resid > 0 && n != 0);
kmem_free(rablks, nra * sizeof(*rablks));
kmem_free(rasizes, nra * sizeof(*rasizes));
return error;
default:
panic("spec_read type");
}
/* NOTREACHED */
}
/*
* Vnode op for write
*/
/* ARGSUSED */
int
spec_write(void *v)
{
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *l = curlwp;
struct specnode *sn;
dev_t dev;
struct buf *bp;
daddr_t bn;
int bsize, bscale;
struct partinfo pi;
int n, on;
int error = 0;
	KASSERT(uio->uio_rw == UIO_WRITE);
	KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ||
uio->uio_vmspace == curproc->p_vmspace),
"vmspace belongs to neither kernel nor curproc");
switch (vp->v_type) {
case VCHR:
/*
* Release the lock while we sleep -- possibly
* indefinitely, if this is, e.g., a tty -- in
* cdev_write, so we don't hold up everything else that
* might want access to the vnode.
*
* But before we issue the write, take an I/O reference
* to the specnode so close will know when we're done
* writing. Note that the moment we release the lock,
* the vnode's identity may change; hence spec_io_enter
* may fail, and the caller may have a dead vnode on
* their hands, if the file system on which vp lived
* has been unmounted.
*/
VOP_UNLOCK(vp);
error = spec_io_enter(vp, &sn, &dev);
if (error)
goto out;
error = cdev_write(dev, uio, ap->a_ioflag);
spec_io_exit(vp, sn);
out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return error;
case VBLK:
		KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
		if (uio->uio_resid == 0)
return 0;
if (uio->uio_offset < 0)
return EINVAL;
		if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0)
			bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE);
else
bsize = BLKDEV_IOSIZE;
bscale = bsize >> DEV_BSHIFT;
do {
bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1);
on = uio->uio_offset % bsize;
n = uimin((unsigned)(bsize - on), uio->uio_resid);
if (n == bsize)
bp = getblk(vp, bn, bsize, 0, 0);
else
error = bread(vp, bn, bsize, B_MODIFY, &bp);
if (error) {
return error;
}
n = uimin(n, bsize - bp->b_resid);
error = uiomove((char *)bp->b_data + on, n, uio);
if (error)
brelse(bp, 0);
else {
if (n + on == bsize)
bawrite(bp);
else
bdwrite(bp);
error = bp->b_error;
}
} while (error == 0 && uio->uio_resid > 0 && n != 0);
return error;
default:
panic("spec_write type");
}
/* NOTREACHED */
}
/*
* fdiscard, which on disk devices becomes TRIM.
*/
int
spec_fdiscard(void *v)
{
struct vop_fdiscard_args /* {
struct vnode *a_vp;
off_t a_pos;
off_t a_len;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
dev_t dev;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
dev = vp->v_rdev;
switch (vp->v_type) {
case VCHR:
#if 0 /* This is not stored for character devices. */
KASSERT(vp == vp->v_specnode->sn_dev->sd_cdevvp);
#endif
return cdev_discard(dev, ap->a_pos, ap->a_len);
case VBLK:
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
return bdev_discard(dev, ap->a_pos, ap->a_len);
default:
panic("spec_fdiscard: not a device\n");
}
}
/*
* Device ioctl operation.
*/
/* ARGSUSED */
int
spec_ioctl(void *v)
{
struct vop_ioctl_args /* {
struct vnode *a_vp;
u_long a_command;
void *a_data;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int error;
error = spec_io_enter(vp, &sn, &dev);
if (error)
return error;
switch (vp->v_type) {
case VCHR:
error = cdev_ioctl(dev, ap->a_command, ap->a_data,
ap->a_fflag, curlwp);
break;
case VBLK:
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
error = bdev_ioctl(dev, ap->a_command, ap->a_data,
ap->a_fflag, curlwp);
break;
default:
panic("spec_ioctl");
/* NOTREACHED */
}
spec_io_exit(vp, sn);
return error;
}
/* ARGSUSED */
int
spec_poll(void *v)
{
struct vop_poll_args /* {
struct vnode *a_vp;
int a_events;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int revents;
if (spec_io_enter(vp, &sn, &dev) != 0)
return POLLERR;
switch (vp->v_type) {
case VCHR:
revents = cdev_poll(dev, ap->a_events, curlwp);
break;
default:
revents = genfs_poll(v);
break;
}
spec_io_exit(vp, sn);
return revents;
}
/* ARGSUSED */
int
spec_kqfilter(void *v)
{
struct vop_kqfilter_args /* {
struct vnode *a_vp;
		struct knote *a_kn;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int error;
error = spec_io_enter(vp, &sn, &dev);
if (error)
return error;
switch (vp->v_type) {
case VCHR:
error = cdev_kqfilter(dev, ap->a_kn);
break;
default:
/*
* Block devices don't support kqfilter, and refuse it
* for any other files (like those vflush()ed) too.
*/
error = EOPNOTSUPP;
break;
}
spec_io_exit(vp, sn);
return error;
}
/*
* Allow mapping of only D_DISK. This is called only for VBLK.
*/
int
spec_mmap(void *v)
{
struct vop_mmap_args /* {
struct vnode *a_vp;
vm_prot_t a_prot;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int error;
KASSERT(vp->v_type == VBLK);
error = spec_io_enter(vp, &sn, &dev);
if (error)
return error;
error = bdev_type(dev) == D_DISK ? 0 : EINVAL;
spec_io_exit(vp, sn);
	return error;
}
/*
* Synch buffers associated with a block device
*/
/* ARGSUSED */
int
spec_fsync(void *v)
{
struct vop_fsync_args /* {
struct vnode *a_vp;
kauth_cred_t a_cred;
int a_flags;
off_t offlo;
off_t offhi;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct mount *mp;
int error;
	if (vp->v_type == VBLK) {
		if ((mp = spec_node_getmountedfs(vp)) != NULL) {
error = VFS_FSYNC(mp, vp, ap->a_flags);
if (error != EOPNOTSUPP)
return error;
}
return vflushbuf(vp, ap->a_flags);
}
return 0;
}
/*
* Just call the device strategy routine
*/
int
spec_strategy(void *v)
{
struct vop_strategy_args /* {
struct vnode *a_vp;
struct buf *a_bp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct buf *bp = ap->a_bp;
struct specnode *sn = NULL;
dev_t dev;
int error;
error = spec_io_enter(vp, &sn, &dev);
if (error)
goto out;
bp->b_dev = dev;
if (!(bp->b_flags & B_READ)) {
#ifdef DIAGNOSTIC
if (bp->b_vp && bp->b_vp->v_type == VBLK) {
struct mount *mp = spec_node_getmountedfs(bp->b_vp);
if (mp && (mp->mnt_flag & MNT_RDONLY)) {
printf("%s blk %"PRId64" written while ro!\n",
mp->mnt_stat.f_mntonname, bp->b_blkno);
#ifdef DDB
db_stacktrace();
#endif
}
}
#endif /* DIAGNOSTIC */
error = fscow_run(bp, false);
if (error)
goto out;
}
bdev_strategy(bp);
error = 0;
out:	if (sn)
		spec_io_exit(vp, sn);
	if (error) {
		bp->b_error = error;
bp->b_resid = bp->b_bcount;
biodone(bp);
}
return error;
}
int
spec_inactive(void *v)
{
struct vop_inactive_v2_args /* {
struct vnode *a_vp;
struct bool *a_recycle;
} */ *ap = v;
KASSERT(ap->a_vp->v_mount == dead_rootmount);
*ap->a_recycle = true;
return 0;
}
int
spec_reclaim(void *v)
{
struct vop_reclaim_v2_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
KASSERT(vp->v_specnode->sn_opencnt == 0);
VOP_UNLOCK(vp);
KASSERT(vp->v_mount == dead_rootmount);
return 0;
}
/*
* This is a noop, simply returning what one has been given.
*/
int
spec_bmap(void *v)
{
struct vop_bmap_args /* {
struct vnode *a_vp;
daddr_t a_bn;
struct vnode **a_vpp;
daddr_t *a_bnp;
int *a_runp;
} */ *ap = v;
if (ap->a_vpp != NULL)
*ap->a_vpp = ap->a_vp;
if (ap->a_bnp != NULL)
*ap->a_bnp = ap->a_bn;
if (ap->a_runp != NULL)
*ap->a_runp = (MAXBSIZE >> DEV_BSHIFT) - 1;
return 0;
}
/*
* Device close routine
*/
/* ARGSUSED */
int
spec_close(void *v)
{
struct vop_close_args /* {
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct session *sess;
dev_t dev;
int flags = ap->a_fflag;
int mode, error, count;
specnode_t *sn;
specdev_t *sd;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
mutex_enter(vp->v_interlock);
sn = vp->v_specnode;
dev = vp->v_rdev;
sd = sn->sn_dev;
/*
* If we're going away soon, make this non-blocking.
* Also ensures that we won't wedge in vn_lock below.
*/
if (vdead_check(vp, VDEAD_NOWAIT) != 0)
flags |= FNONBLOCK;
mutex_exit(vp->v_interlock);
switch (vp->v_type) {
case VCHR:
/*
* Hack: a tty device that is a controlling terminal
* has a reference from the session structure. We
* cannot easily tell that a character device is a
* controlling terminal, unless it is the closing
* process' controlling terminal. In that case, if the
* open count is 1 release the reference from the
* session. Also, remove the link from the tty back to
* the session and pgrp.
*
* XXX V. fishy.
*/
mutex_enter(&proc_lock);
sess = curlwp->l_proc->p_session;
if (sn->sn_opencnt == 1 && vp == sess->s_ttyvp) {
mutex_spin_enter(&tty_lock);
sess->s_ttyvp = NULL;
if (sess->s_ttyp->t_session != NULL) {
sess->s_ttyp->t_pgrp = NULL;
sess->s_ttyp->t_session = NULL;
mutex_spin_exit(&tty_lock);
/* Releases proc_lock. */
proc_sessrele(sess);
} else {
mutex_spin_exit(&tty_lock);
if (sess->s_ttyp->t_pgrp != NULL)
panic("spec_close: spurious pgrp ref"); mutex_exit(&proc_lock);
}
vrele(vp);
} else
mutex_exit(&proc_lock);
/*
* If the vnode is locked, then we are in the midst
	 * of forcibly closing the device, otherwise we only
* close on last reference.
*/
mode = S_IFCHR;
break;
case VBLK:
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
/*
* On last close of a block device (that isn't mounted)
* we must invalidate any in core blocks, so that
* we can, for instance, change floppy disks.
*/
error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0);
if (error)
return error;
/*
* We do not want to really close the device if it
* is still in use unless we are trying to close it
* forcibly. Since every use (buffer, vnode, swap, cmap)
* holds a reference to the vnode, and because we mark
* any other vnodes that alias this device, when the
* sum of the reference counts on all the aliased
* vnodes descends to one, we are on last close.
*/
mode = S_IFBLK;
break;
default:
panic("spec_close: not special");
}
/*
* Decrement the open reference count of this node and the
* device. For block devices, the open reference count must be
* 1 at this point. If the device's open reference count goes
* to zero, we're the last one out so get the lights.
*
* We may find --sd->sd_opencnt gives zero, and yet
* sd->sd_opened is false. This happens if the vnode is
* revoked at the same time as it is being opened, which can
* happen when opening a tty blocks indefinitely. In that
* case, we still must call close -- it is the job of close to
* interrupt the open. Either way, the device will no
* longer be open, so we have to clear sd->sd_opened; subsequent
* opens will have responsibility for issuing close.
*
* This has the side effect that the sequence of opens might
* happen out of order -- we might end up doing open, open,
* close, close, instead of open, close, open, close. This is
* unavoidable with the current devsw API, where open is
* allowed to block and close must be able to run concurrently
* to interrupt it. It is the driver's responsibility to
* ensure that close is idempotent so that this works. Drivers
* requiring per-open state and exact 1:1 correspondence
* between open and close can use fd_clone.
*/
mutex_enter(&device_lock);
KASSERT(sn->sn_opencnt);
KASSERT(sd->sd_opencnt);
KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
sn->sn_opencnt--;
count = --sd->sd_opencnt;
if (vp->v_type == VBLK) {
KASSERTMSG(count == 0, "block device with %u opens",
count + 1);
sd->sd_bdevvp = NULL;
}
if (count == 0) {
KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u",
sn->sn_opencnt);
KASSERT(!sd->sd_closing);
sd->sd_opened = false;
sd->sd_closing = true;
}
mutex_exit(&device_lock);
if (count != 0)
return 0;
/*
* If we're able to block, release the vnode lock & reacquire. We
* might end up sleeping for someone else who wants our queues. They
* won't get them if we hold the vnode locked.
*/
if (!(flags & FNONBLOCK))
VOP_UNLOCK(vp);
/*
* If we can cancel all outstanding I/O, then wait for it to
* drain before we call .d_close. Drivers that split up
* .d_cancel and .d_close this way need not have any internal
* mechanism for waiting in .d_close for I/O to drain.
*/
if (vp->v_type == VBLK)
error = bdev_cancel(dev, flags, mode, curlwp);
else
error = cdev_cancel(dev, flags, mode, curlwp);
if (error == 0)
spec_io_drain(sd);
else
KASSERTMSG(error == ENODEV, "cancel dev=0x%lx failed with %d",
(unsigned long)dev, error);
if (vp->v_type == VBLK)
error = bdev_close(dev, flags, mode, curlwp);
else
error = cdev_close(dev, flags, mode, curlwp);
/*
* Wait for all other devsw operations to drain. After this
* point, no bdev/cdev_* can be active for this specdev.
*/
spec_io_drain(sd);
/*
* Wake any spec_open calls waiting for close to finish -- do
* this before reacquiring the vnode lock, because spec_open
* holds the vnode lock while waiting, so doing this after
* reacquiring the lock would deadlock.
*/
mutex_enter(&device_lock);
KASSERT(!sd->sd_opened);
KASSERT(sd->sd_closing);
sd->sd_closing = false;
cv_broadcast(&specfs_iocv);
mutex_exit(&device_lock);
if (!(flags & FNONBLOCK))
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return error;
}
/*
* Print out the contents of a special device vnode.
*/
int
spec_print(void *v)
{
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap = v;
printf("dev %llu, %llu\n", (unsigned long long)major(ap->a_vp->v_rdev),
(unsigned long long)minor(ap->a_vp->v_rdev));
return 0;
}
/*
* Return POSIX pathconf information applicable to special devices.
*/
int
spec_pathconf(void *v)
{
struct vop_pathconf_args /* {
struct vnode *a_vp;
int a_name;
register_t *a_retval;
} */ *ap = v;
switch (ap->a_name) {
case _PC_LINK_MAX:
*ap->a_retval = LINK_MAX;
return 0;
case _PC_MAX_CANON:
*ap->a_retval = MAX_CANON;
return 0;
case _PC_MAX_INPUT:
*ap->a_retval = MAX_INPUT;
return 0;
case _PC_PIPE_BUF:
*ap->a_retval = PIPE_BUF;
return 0;
case _PC_CHOWN_RESTRICTED:
*ap->a_retval = 1;
return 0;
case _PC_VDISABLE:
*ap->a_retval = _POSIX_VDISABLE;
return 0;
case _PC_SYNC_IO:
*ap->a_retval = 1;
return 0;
default:
return genfs_pathconf(ap);
}
/* NOTREACHED */
}
/*
* Advisory record locking support.
*/
int
spec_advlock(void *v)
{
struct vop_advlock_args /* {
struct vnode *a_vp;
void *a_id;
int a_op;
struct flock *a_fl;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
return lf_advlock(ap, &vp->v_speclockf, (off_t)0);
}
/* $NetBSD: trap.c,v 1.129 2023/10/05 19:41:03 ad Exp $ */
/*
* Copyright (c) 1998, 2000, 2017 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum, and by Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the University of Utah, and William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)trap.c 7.4 (Berkeley) 5/13/91
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.129 2023/10/05 19:41:03 ad Exp $");
#include "opt_ddb.h"
#include "opt_kgdb.h"
#include "opt_xen.h"
#include "opt_dtrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/acct.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/ras.h>
#include <sys/signal.h>
#include <sys/syscall.h>
#include <sys/cpu.h>
#include <sys/ucontext.h>
#include <sys/module_hook.h>
#include <sys/compat_stub.h>
#include <uvm/uvm_extern.h>
#include <machine/cpufunc.h>
#include <x86/fpu.h>
#include <x86/dbregs.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/trap.h>
#include <machine/userret.h>
#include <machine/db_machdep.h>
#include <x86/nmi.h>
#ifndef XENPV
#include "isa.h"
#endif
#include <sys/kgdb.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
/*
* This is a hook which is initialized by the dtrace module to handle traps
* which might occur during DTrace probe execution.
*/
dtrace_trap_func_t dtrace_trap_func = NULL;
dtrace_doubletrap_func_t dtrace_doubletrap_func = NULL;
#endif
/*
* Module hook for amd64_oosyscall
*/
struct amd64_oosyscall_hook_t amd64_oosyscall_hook;
void nmitrap(struct trapframe *);
void doubletrap(struct trapframe *);
void trap(struct trapframe *);
const char * const trap_type[] = {
"privileged instruction fault", /* 0 T_PRIVINFLT */
"breakpoint trap", /* 1 T_BPTFLT */
"arithmetic trap", /* 2 T_ARITHTRAP */
"asynchronous system trap", /* 3 T_ASTFLT */
"protection fault", /* 4 T_PROTFLT */
"trace trap", /* 5 T_TRCTRAP */
"page fault", /* 6 T_PAGEFLT */
"alignment fault", /* 7 T_ALIGNFLT */
"integer divide fault", /* 8 T_DIVIDE */
"non-maskable interrupt", /* 9 T_NMI */
"overflow trap", /* 10 T_OFLOW */
"bounds check fault", /* 11 T_BOUND */
"FPU not available fault", /* 12 T_DNA */
"double fault", /* 13 T_DOUBLEFLT */
"FPU operand fetch fault", /* 14 T_FPOPFLT */
"invalid TSS fault", /* 15 T_TSSFLT */
"segment not present fault", /* 16 T_SEGNPFLT */
"stack fault", /* 17 T_STKFLT */
"machine check fault", /* 18 T_MCA */
"SSE FP exception", /* 19 T_XMM */
"reserved trap", /* 20 T_RESERVED */
};
int trap_types = __arraycount(trap_type);
#ifdef TRAP_SIGDEBUG
static void sigdebug(const struct trapframe *, const ksiginfo_t *, int);
#define SIGDEBUG(a, b, c) sigdebug(a, b, c)
#else
#define SIGDEBUG(a, b, c)
#endif
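/*
* Redirect a faulting kernel trap frame to a registered fault handler:
* set the saved instruction pointer to 'onfault' and place 'error' in the
* saved %rax, so the handler observes it as a return value.
*/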
static void
onfault_restore(struct trapframe *frame, void *onfault, int error)
{
frame->tf_rip = (uintptr_t)onfault;
frame->tf_rax = error;
}
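/*
* Find the fault handler for the current instruction pointer: prefer an
* explicit pcb_onfault, otherwise scan the extern onfault_table of
* { start, end, handler } entries for one whose range contains the
* faulting PC.
*/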
static void *
onfault_handler(const struct pcb *pcb, const struct trapframe *tf)
{
struct onfault_table {
uintptr_t start;
uintptr_t end;
void *handler;
};
extern const struct onfault_table onfault_table[];
const struct onfault_table *p;
uintptr_t pc;
if (pcb->pcb_onfault != NULL) {
return pcb->pcb_onfault;
}
pc = tf->tf_rip;
for (p = onfault_table; p->start; p++) {
if (p->start <= pc && pc < p->end) {
return p->handler;
}
}
return NULL;
}
static void
trap_print(const struct trapframe *frame, const lwp_t *l)
{
const int type = frame->tf_trapno;
if (frame->tf_trapno < trap_types) {
printf("fatal %s", trap_type[type]);
} else {
printf("unknown trap %d", type);
}
printf(" in %s mode\n", (type & T_USER) ? "user" : "supervisor");
printf("trap type %d code %#lx rip %#lx cs %#lx rflags %#lx cr2 %#lx "
"ilevel %#x rsp %#lx\n",
type, frame->tf_err, (u_long)frame->tf_rip, frame->tf_cs,
frame->tf_rflags, rcr2(), curcpu()->ci_ilevel, frame->tf_rsp);
printf("curlwp %p pid %d.%d lowest kstack %p\n",
l, l->l_proc->p_pid, l->l_lid, KSTACK_LOWEST_ADDR(l));
}
void
nmitrap(struct trapframe *frame)
{
const int type = T_NMI;
if (nmi_dispatch(frame))
return;
/* NMI can be hooked up to a pushbutton for debugging */
if (kgdb_trap(type, frame))
return;
if (kdb_trap(type, 0, frame))
return;
/* machine/parity/power fail/"kitchen sink" faults */
x86_nmi();
}
void
doubletrap(struct trapframe *frame)
{
const int type = T_DOUBLEFLT;
struct lwp *l = curlwp;
trap_print(frame, l);
if (kdb_trap(type, 0, frame))
return;
if (kgdb_trap(type, frame))
return;
panic("double fault");
}
/*
* trap(frame): exception, fault, and trap interface to BSD kernel.
*
* This common code is called from assembly language IDT gate entry routines
* that prepare a suitable stack frame, and restore this frame after the
* exception has been processed. Note that the effect is as if the arguments
* were passed call by reference.
*
* Note that the fpu traps (07 T_DNA, 10 T_ARITHTRAP and 13 T_XMM)
* jump directly into the code in x86/fpu.c so they get processed
* without interrupts being enabled.
*/
void
trap(struct trapframe *frame)
{
struct lwp *l = curlwp;
struct proc *p;
struct pcb *pcb;
extern char kcopy_fault[];
ksiginfo_t ksi;
void *onfault;
int type, error;
uint64_t cr2;
bool pfail;
if (__predict_true(l != NULL)) {
pcb = lwp_getpcb(l);
p = l->l_proc;
} else {
/*
* This can happen, e.g., on breakpoints early in boot.
*/
pcb = NULL;
p = NULL;
}
type = frame->tf_trapno;
if (!KERNELMODE(frame->tf_cs)) {
type |= T_USER;
l->l_md.md_regs = frame;
}
#ifdef KDTRACE_HOOKS
/*
* A trap can occur while DTrace executes a probe. Before
* executing the probe, DTrace blocks re-scheduling and sets
* a flag in its per-cpu flags to indicate that it doesn't
* want to fault. On returning from the probe, the no-fault
* flag is cleared and finally re-scheduling is enabled.
*
* If the DTrace kernel module has registered a trap handler,
* call it and if it returns non-zero, assume that it has
* handled the trap and modified the trap frame so that this
* function can return normally.
*/
if ((type == T_PROTFLT || type == T_PAGEFLT) &&
dtrace_trap_func != NULL) {
if ((*dtrace_trap_func)(frame, type)) {
return;
}
}
#endif
switch (type) {
default:
we_re_toast:
trap_print(frame, l);
if (kdb_trap(type, 0, frame))
return;
if (kgdb_trap(type, frame))
return;
/*
* If this is a breakpoint, don't panic if we're not connected.
*/
if (type == T_BPTFLT && kgdb_disconnected()) {
printf("kgdb: ignored %s\n", trap_type[type]);
return;
}
panic("trap");
/*NOTREACHED*/
case T_PROTFLT:
case T_SEGNPFLT:
case T_ALIGNFLT:
case T_STKFLT:
case T_TSSFLT:
if (p == NULL)
goto we_re_toast;
/* Check for copyin/copyout fault. */
onfault = onfault_handler(pcb, frame);
if (onfault != NULL) {
onfault_restore(frame, onfault, EFAULT);
return;
}
goto we_re_toast;
case T_PROTFLT|T_USER: /* protection fault */
{
int hook_ret;
MODULE_HOOK_CALL(amd64_oosyscall_hook, (p, frame),
ENOSYS, hook_ret);
if (hook_ret == 0) {
/* Do the syscall */
p->p_md.md_syscall(frame);
goto out;
}
}
/* FALLTHROUGH */
case T_TSSFLT|T_USER:
case T_SEGNPFLT|T_USER:
case T_STKFLT|T_USER:
case T_ALIGNFLT|T_USER:
KSI_INIT_TRAP(&ksi);
ksi.ksi_trap = type & ~T_USER;
ksi.ksi_addr = (void *)frame->tf_rip;
switch (type) {
case T_SEGNPFLT|T_USER:
case T_STKFLT|T_USER:
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_ADRERR;
break;
case T_TSSFLT|T_USER:
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
break;
case T_ALIGNFLT|T_USER:
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_ADRALN;
break;
case T_PROTFLT|T_USER:
ksi.ksi_signo = SIGSEGV;
ksi.ksi_code = SEGV_ACCERR;
break;
default:
KASSERT(0);
break;
}
goto trapsignal;
case T_PRIVINFLT|T_USER: /* privileged instruction fault */
case T_FPOPFLT|T_USER: /* coprocessor operand fault */
KSI_INIT_TRAP(&ksi);
ksi.ksi_signo = SIGILL;
ksi.ksi_trap = type & ~T_USER;
ksi.ksi_addr = (void *) frame->tf_rip;
switch (type) {
case T_PRIVINFLT|T_USER:
ksi.ksi_code = ILL_PRVOPC;
break;
case T_FPOPFLT|T_USER:
ksi.ksi_code = ILL_COPROC;
break;
default:
KASSERT(0);
break;
}
goto trapsignal;
case T_ASTFLT|T_USER:
/* Allow process switch. */
//curcpu()->ci_data.cpu_nast++;
if (l->l_pflag & LP_OWEUPC) {
l->l_pflag &= ~LP_OWEUPC;
ADDUPROF(l);
}
goto out;
case T_BOUND|T_USER:
case T_OFLOW|T_USER:
case T_DIVIDE|T_USER:
KSI_INIT_TRAP(&ksi);
ksi.ksi_signo = SIGFPE;
ksi.ksi_trap = type & ~T_USER;
ksi.ksi_addr = (void *)frame->tf_rip;
switch (type) {
case T_BOUND|T_USER:
ksi.ksi_code = FPE_FLTSUB;
break;
case T_OFLOW|T_USER:
ksi.ksi_code = FPE_INTOVF;
break;
case T_DIVIDE|T_USER:
ksi.ksi_code = FPE_INTDIV;
break;
default:
KASSERT(0);
break;
}
goto trapsignal;
case T_PAGEFLT:
/* Allow page faults in kernel mode. */
if (__predict_false(l == NULL))
goto we_re_toast;
onfault = pcb->pcb_onfault;
if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
goto we_re_toast;
}
cr2 = rcr2();
if (frame->tf_err & PGEX_I) {
/* SMEP might have brought us here */
if (cr2 < VM_MAXUSER_ADDRESS) {
printf("prevented execution of %p (SMEP)\n",
(void *)cr2);
goto we_re_toast;
}
}
if ((frame->tf_err & PGEX_P) &&
cr2 < VM_MAXUSER_ADDRESS) {
/* SMAP might have brought us here */
if (onfault_handler(pcb, frame) == NULL) {
printf("prevented access to %p (SMAP)\n",
(void *)cr2);
goto we_re_toast;
}
}
goto pagefltcommon;
case T_PAGEFLT|T_USER: {
register vaddr_t va;
register struct vmspace *vm;
register struct vm_map *map;
vm_prot_t ftype;
extern struct vm_map *kernel_map;
cr2 = rcr2();
if (p->p_emul->e_usertrap != NULL &&
(*p->p_emul->e_usertrap)(l, cr2, frame) != 0)
return;
pagefltcommon:
vm = p->p_vmspace;
if (__predict_false(vm == NULL)) {
goto we_re_toast;
}
pcb->pcb_cr2 = cr2;
va = trunc_page((vaddr_t)cr2);
/*
* It is only a kernel address space fault iff:
* 1. (type & T_USER) == 0 and
* 2. pcb_onfault not set or
* 3. pcb_onfault set but supervisor space fault
* The last can occur during an exec() copyin where the
* argument space is lazy-allocated.
*/
if (type == T_PAGEFLT && va >= VM_MIN_KERNEL_ADDRESS)
map = kernel_map;
else
map = &vm->vm_map;
if (frame->tf_err & PGEX_W)
ftype = VM_PROT_WRITE;
else if (frame->tf_err & PGEX_I)
ftype = VM_PROT_EXECUTE;
else
ftype = VM_PROT_READ;
#ifdef DIAGNOSTIC
if (map == kernel_map && va == 0) {
printf("trap: bad kernel access at %lx\n", va);
goto we_re_toast;
}
#endif
/* Fault the original page in. */
onfault = pcb->pcb_onfault;
pcb->pcb_onfault = NULL;
error = uvm_fault(map, va, ftype);
pcb->pcb_onfault = onfault;
if (error == 0) {
if (map != kernel_map && (void *)va >= vm->vm_maxsaddr)
uvm_grow(p, va);
pfail = false;
while (type == T_PAGEFLT) {
/*
* we need to switch pmap now if we're in
* the middle of copyin/out.
*
* but we don't need to do so for kcopy as
* it never touches userspace.
*/
kpreempt_disable();
if (curcpu()->ci_want_pmapload) {
onfault = onfault_handler(pcb, frame);
if (onfault != kcopy_fault) {
pmap_load();
}
}
/*
* We need to keep the pmap loaded and
* so avoid being preempted until back
* into the copy functions. Disable
* interrupts at the hardware level before
* re-enabling preemption. Interrupts
* will be re-enabled by 'iret' when
* returning back out of the trap stub.
* They'll only be re-enabled when the
* program counter is once again in
* the copy functions, and so visible
* to cpu_kpreempt_exit().
*/
#ifndef XENPV
x86_disable_intr();
#endif
l->l_nopreempt--;
if (l->l_nopreempt > 0 || !l->l_dopreempt ||
pfail) {
return;
}
#ifndef XENPV
x86_enable_intr();
#endif
/*
* If preemption fails for some reason,
* don't retry it. The conditions won't
* change under our nose.
*/
pfail = kpreempt(0);
}
goto out;
}
if (type == T_PAGEFLT) {
onfault = onfault_handler(pcb, frame);
if (onfault != NULL) {
onfault_restore(frame, onfault, error);
return;
}
printf("uvm_fault(%p, 0x%lx, %d) -> %x\n",
map, va, ftype, error);
goto we_re_toast;
}
KSI_INIT_TRAP(&ksi);
ksi.ksi_trap = type & ~T_USER;
ksi.ksi_addr = (void *)cr2;
switch (error) {
case EINVAL:
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_ADRERR;
break;
case EACCES:
ksi.ksi_signo = SIGSEGV;
ksi.ksi_code = SEGV_ACCERR;
error = EFAULT;
break;
case ENOMEM:
ksi.ksi_signo = SIGKILL;
printf("UVM: pid %d.%d (%s), uid %d killed: "
"out of swap\n", p->p_pid, l->l_lid, p->p_comm,
l->l_cred ? kauth_cred_geteuid(l->l_cred) : -1);
break;
default:
ksi.ksi_signo = SIGSEGV;
ksi.ksi_code = SEGV_MAPERR;
break;
}
SIGDEBUG(frame, &ksi, error);
(*p->p_emul->e_trapsignal)(l, &ksi);
break;
}
case T_TRCTRAP:
/*
* Ignore debug register trace traps due to
* accesses in the user's address space, which
* can happen under several conditions such as
* if a user sets a watchpoint on a buffer and
* then passes that buffer to a system call.
* We still want to get TRCTRAPS for addresses
* in kernel space because that is useful when
* debugging the kernel.
*/
if (x86_dbregs_user_trap())
break;
goto we_re_toast;
case T_BPTFLT|T_USER: /* bpt instruction fault */
case T_TRCTRAP|T_USER: /* trace trap */
/*
* Don't go single-stepping into a RAS.
*/
if (p->p_raslist == NULL ||
(ras_lookup(p, (void *)frame->tf_rip) == (void *)-1)) {
KSI_INIT_TRAP(&ksi);
ksi.ksi_signo = SIGTRAP;
ksi.ksi_trap = type & ~T_USER;
if (x86_dbregs_user_trap()) {
x86_dbregs_store_dr6(l);
ksi.ksi_code = TRAP_DBREG;
} else if (type == (T_BPTFLT|T_USER))
ksi.ksi_code = TRAP_BRKPT;
else
ksi.ksi_code = TRAP_TRACE;
(*p->p_emul->e_trapsignal)(l, &ksi);
}
break;
}
if ((type & T_USER) == 0)
return;
out:
userret(l);
return;
trapsignal:
SIGDEBUG(frame, &ksi, 0);
(*p->p_emul->e_trapsignal)(l, &ksi);
userret(l);
}
/*
* startlwp: start of a new LWP.
*/
void
startlwp(void *arg)
{
ucontext_t *uc = arg;
lwp_t *l = curlwp;
int error __diagused;
error = cpu_setmcontext(l, &uc->uc_mcontext, uc->uc_flags);
KASSERT(error == 0);
kmem_free(uc, sizeof(ucontext_t));
userret(l);
}
#ifdef TRAP_SIGDEBUG
static void
frame_dump(const struct trapframe *tf, struct pcb *pcb)
{
printf("trapframe %p\n", tf);
printf("rip %#018lx rsp %#018lx rfl %#018lx\n",
tf->tf_rip, tf->tf_rsp, tf->tf_rflags);
printf("rdi %#018lx rsi %#018lx rdx %#018lx\n",
tf->tf_rdi, tf->tf_rsi, tf->tf_rdx);
printf("rcx %#018lx r8 %#018lx r9 %#018lx\n",
tf->tf_rcx, tf->tf_r8, tf->tf_r9);
printf("r10 %#018lx r11 %#018lx r12 %#018lx\n",
tf->tf_r10, tf->tf_r11, tf->tf_r12);
printf("r13 %#018lx r14 %#018lx r15 %#018lx\n",
tf->tf_r13, tf->tf_r14, tf->tf_r15);
printf("rbp %#018lx rbx %#018lx rax %#018lx\n",
tf->tf_rbp, tf->tf_rbx, tf->tf_rax);
printf("cs %#04lx ds %#04lx es %#04lx "
"fs %#04lx gs %#04lx ss %#04lx\n",
tf->tf_cs & 0xffff, tf->tf_ds & 0xffff, tf->tf_es & 0xffff,
tf->tf_fs & 0xffff, tf->tf_gs & 0xffff, tf->tf_ss & 0xffff);
printf("fsbase %#018lx gsbase %#018lx\n", pcb->pcb_fs, pcb->pcb_gs);
printf("\n");
hexdump(printf, "Stack dump", tf, 256);
}
static void
sigdebug(const struct trapframe *tf, const ksiginfo_t *ksi, int e)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
printf("pid %d.%d (%s): signal %d code=%d (trap %#lx) "
"@rip %#lx addr %#lx error=%d\n",
p->p_pid, l->l_lid, p->p_comm, ksi->ksi_signo, ksi->ksi_code,
tf->tf_trapno, tf->tf_rip, rcr2(), e);
frame_dump(tf, lwp_getpcb(l));
}
#endif
/* $NetBSD: strlcpy.c,v 1.3 2007/06/04 18:19:27 christos Exp $ */
/* $OpenBSD: strlcpy.c,v 1.7 2003/04/12 21:56:39 millert Exp $ */
/*
* Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND TODD C. MILLER DISCLAIMS ALL
* WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL TODD C. MILLER BE LIABLE
* FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#if !defined(_KERNEL) && !defined(_STANDALONE)
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
__RCSID("$NetBSD: strlcpy.c,v 1.3 2007/06/04 18:19:27 christos Exp $");
#endif /* LIBC_SCCS and not lint */
#ifdef _LIBC
#include "namespace.h"
#endif
#include <sys/types.h>
#include <assert.h>
#include <string.h>
#ifdef _LIBC
# ifdef __weak_alias
__weak_alias(strlcpy, _strlcpy)
# endif
#endif
#else
#include <lib/libkern/libkern.h>
#endif /* !_KERNEL && !_STANDALONE */
#if !HAVE_STRLCPY
/*
* Copy src to string dst of size siz. At most siz-1 characters
* will be copied. Always NUL terminates (unless siz == 0).
* Returns strlen(src); if retval >= siz, truncation occurred.
*/
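/*
* Illustrative usage (a sketch only; 'name' and 'input' are hypothetical
* and do not appear elsewhere in this file):
*
*	char name[16];
*	if (strlcpy(name, input, sizeof(name)) >= sizeof(name))
*		return ENAMETOOLONG;
*
* A return value >= sizeof(name) means 'input' was truncated.
*/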
size_t
strlcpy(char *dst, const char *src, size_t siz)
{
char *d = dst;
const char *s = src;
size_t n = siz;
_DIAGASSERT(dst != NULL);
_DIAGASSERT(src != NULL);
/* Copy as many bytes as will fit */
if (n != 0 && --n != 0) {
do {
if ((*d++ = *s++) == 0)
break;
} while (--n != 0);
}
/* Not enough room in dst, add NUL and traverse rest of src */
if (n == 0) {
if (siz != 0)
*d = '\0'; /* NUL-terminate dst */
while (*s++)
;
}
return(s - src - 1); /* count does not include NUL */
}
#endif
/* $NetBSD: rtbl.c,v 1.7 2017/06/01 02:45:14 chs Exp $ */
/*-
* Copyright (c) 1998, 2008, 2011 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Kevin M. Lahey of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1980, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)route.c 8.3 (Berkeley) 1/9/95
*/
#if defined(_KERNEL) && defined(_KERNEL_OPT)
#include "opt_route.h"
#endif /* _KERNEL && _KERNEL_OPT */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rtbl.c,v 1.7 2017/06/01 02:45:14 chs Exp $");
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/ioctl.h>
#include <sys/pool.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/raw_cb.h>
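/*
* Per-address-family routing tables, indexed by sa_family_t. They are
* attached by rtbl_init() via each domain's dom_rtattach hook, and the
* rt_*/rtbl_* functions below are thin wrappers around the radix trie
* (rn_*) interface.
*/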
static rtbl_t *rt_tables[AF_MAX+1];
int
rt_inithead(rtbl_t **tp, int off)
{
rtbl_t *t;
if (*tp != NULL)
return 1;
t = kmem_alloc(sizeof(*t), KM_SLEEP);
*tp = t;
return rn_inithead0(&t->t_rnh, off);
}
struct rtentry *
rt_matchaddr(rtbl_t *t, const struct sockaddr *dst)
{
struct radix_node_head *rnh = &t->t_rnh;
struct radix_node *rn;
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0)
return NULL;
return (struct rtentry *)rn;
}
int
rt_addaddr(rtbl_t *t, struct rtentry *rt, const struct sockaddr *netmask)
{
struct radix_node_head *rnh = &t->t_rnh;
struct radix_node *rn;
rn = rnh->rnh_addaddr(rt_getkey(rt), netmask, rnh, rt->rt_nodes);
return (rn == NULL) ? EEXIST : 0;
}
struct rtentry *
rt_lookup(rtbl_t *t, const struct sockaddr *dst, const struct sockaddr *netmask)
{
struct radix_node_head *rnh = &t->t_rnh;
struct radix_node *rn;
rn = rnh->rnh_lookup(dst, netmask, rnh);
if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0)
return NULL;
return (struct rtentry *)rn;
}
struct rtentry *
rt_deladdr(rtbl_t *t, const struct sockaddr *dst,
const struct sockaddr *netmask)
{
struct radix_node_head *rnh = &t->t_rnh;
struct radix_node *rn;
if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == NULL)
return NULL;
if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic("%s", __func__);
return (struct rtentry *)rn;
}
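/*
* Adapt the radix-node walker callback to the rtentry-based callback
* recorded in struct rtwalk by rtbl_walktree() and
* rtbl_search_matched_entry() below.
*/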
static int
rt_walktree_visitor(struct radix_node *rn, void *v)
{
struct rtwalk *rw = (struct rtwalk *)v;
return (*rw->rw_f)((struct rtentry *)rn, rw->rw_v);
}
int
rtbl_walktree(sa_family_t family, int (*f)(struct rtentry *, void *), void *v)
{
rtbl_t *t = rt_tables[family];
struct rtwalk rw;
if (t == NULL)
return 0;
rw.rw_f = f;
rw.rw_v = v;
return rn_walktree(&t->t_rnh, rt_walktree_visitor, &rw);
}
struct rtentry *
rtbl_search_matched_entry(sa_family_t family,
int (*f)(struct rtentry *, void *), void *v)
{
rtbl_t *t = rt_tables[family];
struct rtwalk rw;
if (t == NULL)
return 0;
rw.rw_f = f;
rw.rw_v = v;
return (struct rtentry *)
rn_search_matched(&t->t_rnh, rt_walktree_visitor, &rw);
}
rtbl_t *
rt_gettable(sa_family_t af)
{
if (af >= __arraycount(rt_tables))
return NULL;
return rt_tables[af];
}
void
rtbl_init(void)
{
struct domain *dom;
DOMAIN_FOREACH(dom)
if (dom->dom_rtattach)
dom->dom_rtattach(&rt_tables[dom->dom_family],
dom->dom_rtoffset);
}
void
rt_assert_inactive(const struct rtentry *rt)
{
if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic ("rtfree 2");
}
int
rt_refines(const struct sockaddr *m_sa, const struct sockaddr *n_sa)
{
return rn_refines(m_sa, n_sa);
}
/* $NetBSD: kern_event.c,v 1.150 2023/09/21 09:31:50 msaitoh Exp $ */
/*-
* Copyright (c) 2008, 2009, 2021 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
* Copyright (c) 2009 Apple, Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
*/
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#endif /* _KERNEL_OPT */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.150 2023/09/21 09:31:50 msaitoh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/wait.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/filedesc.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/conf.h>
#include <sys/atomic.h>
static int kqueue_scan(file_t *, size_t, struct kevent *,
const struct timespec *, register_t *,
const struct kevent_ops *, struct kevent *,
size_t);
static int kqueue_ioctl(file_t *, u_long, void *);
static int kqueue_fcntl(file_t *, u_int, void *);
static int kqueue_poll(file_t *, int);
static int kqueue_kqfilter(file_t *, struct knote *);
static int kqueue_stat(file_t *, struct stat *);
static int kqueue_close(file_t *);
static void kqueue_restart(file_t *);
static int kqueue_fpathconf(file_t *, int, register_t *);
static int kqueue_register(struct kqueue *, struct kevent *);
static void kqueue_doclose(struct kqueue *, struct klist *, int);
static void knote_detach(struct knote *, filedesc_t *fdp, bool);
static void knote_enqueue(struct knote *);
static void knote_activate(struct knote *);
static void knote_activate_locked(struct knote *);
static void knote_deactivate_locked(struct knote *);
static void filt_kqdetach(struct knote *);
static int filt_kqueue(struct knote *, long hint);
static int filt_procattach(struct knote *);
static void filt_procdetach(struct knote *);
static int filt_proc(struct knote *, long hint);
static int filt_fileattach(struct knote *);
static void filt_timerexpire(void *x);
static int filt_timerattach(struct knote *);
static void filt_timerdetach(struct knote *);
static int filt_timer(struct knote *, long hint);
static int filt_timertouch(struct knote *, struct kevent *, long type);
static int filt_userattach(struct knote *);
static void filt_userdetach(struct knote *);
static int filt_user(struct knote *, long hint);
static int filt_usertouch(struct knote *, struct kevent *, long type);
/*
* Private knote state that should never be exposed outside
* of kern_event.c
*
* Field locking:
*
* q kn_kq->kq_lock
*/
struct knote_impl {
struct knote ki_knote;
unsigned int ki_influx; /* q: in-flux counter */
kmutex_t ki_foplock; /* for kn_filterops */
};
#define KIMPL_TO_KNOTE(kip) (&(kip)->ki_knote)
#define KNOTE_TO_KIMPL(knp) container_of((knp), struct knote_impl, ki_knote)
static inline struct knote *
knote_alloc(bool sleepok)
{
struct knote_impl *ki;
ki = kmem_zalloc(sizeof(*ki), sleepok ? KM_SLEEP : KM_NOSLEEP);
mutex_init(&ki->ki_foplock, MUTEX_DEFAULT, IPL_NONE);
return KIMPL_TO_KNOTE(ki);
}
static inline void
knote_free(struct knote *kn)
{
struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
mutex_destroy(&ki->ki_foplock);
kmem_free(ki, sizeof(*ki));
}
static inline void
knote_foplock_enter(struct knote *kn)
{
mutex_enter(&KNOTE_TO_KIMPL(kn)->ki_foplock);
}
static inline void
knote_foplock_exit(struct knote *kn)
{
mutex_exit(&KNOTE_TO_KIMPL(kn)->ki_foplock);
}
static inline bool __diagused
knote_foplock_owned(struct knote *kn)
{
return mutex_owned(&KNOTE_TO_KIMPL(kn)->ki_foplock);
}
static const struct fileops kqueueops = {
.fo_name = "kqueue",
.fo_read = (void *)enxio,
.fo_write = (void *)enxio,
.fo_ioctl = kqueue_ioctl,
.fo_fcntl = kqueue_fcntl,
.fo_poll = kqueue_poll,
.fo_stat = kqueue_stat,
.fo_close = kqueue_close,
.fo_kqfilter = kqueue_kqfilter,
.fo_restart = kqueue_restart,
.fo_fpathconf = kqueue_fpathconf,
};
static void
filt_nopdetach(struct knote *kn __unused)
{
}
static int
filt_nopevent(struct knote *kn __unused, long hint __unused)
{
return 0;
}
static const struct filterops nop_fd_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_nopdetach,
.f_event = filt_nopevent,
};
static const struct filterops nop_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_nopdetach,
.f_event = filt_nopevent,
};
static const struct filterops kqread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_kqdetach,
.f_event = filt_kqueue,
};
static const struct filterops proc_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = filt_procattach,
.f_detach = filt_procdetach,
.f_event = filt_proc,
};
/*
* file_filtops is not marked MPSAFE because it's going to call
* fileops::fo_kqfilter(), which might not be. That function,
* however, will override the knote's filterops, and thus will
* inherit the MPSAFE-ness of the back-end at that time.
*/
static const struct filterops file_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = filt_fileattach,
.f_detach = NULL,
.f_event = NULL,
};
static const struct filterops timer_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = filt_timerattach,
.f_detach = filt_timerdetach,
.f_event = filt_timer,
.f_touch = filt_timertouch,
};
static const struct filterops user_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = filt_userattach,
.f_detach = filt_userdetach,
.f_event = filt_user,
.f_touch = filt_usertouch,
};
static u_int kq_ncallouts = 0;
static int kq_calloutmax = (4 * 1024);
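/*
* KN_HASH() folds the second byte of the identifier into the low bits
* before masking, so ids that differ only in higher bits still spread
* across the KN_HASHSIZE buckets.
*/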
#define KN_HASHSIZE 64 /* XXX should be tunable */
#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
extern const struct filterops fs_filtops; /* vfs_syscalls.c */
extern const struct filterops sig_filtops; /* kern_sig.c */
/*
* Table for all system-defined filters.
* These should be listed in the numeric order of the EVFILT_* defines.
* If filtops is NULL, the filter isn't implemented in NetBSD.
* End of list is when name is NULL.
*
* Note that 'refcnt' is meaningless for built-in filters.
*/
struct kfilter {
const char *name; /* name of filter */
uint32_t filter; /* id of filter */
unsigned refcnt; /* reference count */
const struct filterops *filtops;/* operations for filter */
size_t namelen; /* length of name string */
};
/* System defined filters */
static struct kfilter sys_kfilters[] = {
{ "EVFILT_READ", EVFILT_READ, 0, &file_filtops, 0 },
{ "EVFILT_WRITE", EVFILT_WRITE, 0, &file_filtops, 0, },
{ "EVFILT_AIO", EVFILT_AIO, 0, NULL, 0 },
{ "EVFILT_VNODE", EVFILT_VNODE, 0, &file_filtops, 0 },
{ "EVFILT_PROC", EVFILT_PROC, 0, &proc_filtops, 0 },
{ "EVFILT_SIGNAL", EVFILT_SIGNAL, 0, &sig_filtops, 0 },
{ "EVFILT_TIMER", EVFILT_TIMER, 0, &timer_filtops, 0 },
{ "EVFILT_FS", EVFILT_FS, 0, &fs_filtops, 0 },
{ "EVFILT_USER", EVFILT_USER, 0, &user_filtops, 0 },
{ "EVFILT_EMPTY", EVFILT_EMPTY, 0, &file_filtops, 0 },
{ NULL, 0, 0, NULL, 0 },
};
/* User defined kfilters */
static struct kfilter *user_kfilters; /* array */
static int user_kfilterc; /* current offset */
static int user_kfiltermaxc; /* max size so far */
static size_t user_kfiltersz; /* size of allocated memory */
/*
* Global Locks.
*
* Lock order:
*
* kqueue_filter_lock
* -> kn_kq->kq_fdp->fd_lock
* -> knote foplock (if taken)
* -> object lock (e.g., device driver lock, &c.)
* -> kn_kq->kq_lock
*
* Locking rules. ==> indicates the lock is acquired by the backing
* object, locks prior are acquired before calling filter ops:
*
* f_attach: fdp->fd_lock -> knote foplock ->
* (maybe) KERNEL_LOCK ==> backing object lock
*
* f_detach: fdp->fd_lock -> knote foplock ->
* (maybe) KERNEL_LOCK ==> backing object lock
*
* f_event via kevent: fdp->fd_lock -> knote foplock ->
* (maybe) KERNEL_LOCK ==> backing object lock
* N.B. NOTE_SUBMIT will never be set in the "hint" argument
* in this case.
*
* f_event via knote (via backing object): whatever the caller guarantees.
* Typically:
* f_event(NOTE_SUBMIT): caller has already acquired backing
* object lock.
* f_event(!NOTE_SUBMIT): caller has not acquired the backing object
* lock, or has possibly acquired KERNEL_LOCK. Backing object
* lock may or may not be acquired as-needed.
* N.B. the knote foplock will **not** be acquired in this case. The
* caller guarantees that klist_fini() will not be called concurrently
* with knote().
*
* f_touch: fdp->fd_lock -> kn_kq->kq_lock (spin lock)
* N.B. knote foplock is **not** acquired in this case and
* the caller must guarantee that klist_fini() will never
* be called. kevent_register() restricts filters that
* provide f_touch to known-safe cases.
*
* klist_fini(): Caller must guarantee that no more knotes can
* be attached to the klist, and must **not** hold the backing
* object's lock; klist_fini() itself will acquire the foplock
* of each knote on the klist.
*
* Locking rules when detaching knotes:
*
* There are some situations where knote submission may require dropping
* locks (see knote_proc_fork()). In order to support this, it's possible
* to mark a knote as being 'in-flux'. Such a knote is guaranteed not to
* be detached while it remains in-flux. Because it will not be detached,
* locks can be dropped so e.g. memory can be allocated, locks on other
* data structures can be acquired, etc. During this time, any attempt to
* detach an in-flux knote must wait until the knote is no longer in-flux.
* When this happens, the knote is marked for death (KN_WILLDETACH) and the
* LWP who gets to finish the detach operation is recorded in the knote's
* 'udata' field (which is no longer required for its original purpose once
* a knote is so marked). Code paths that lead to knote_detach() must ensure
* that their LWP is the one tasked with its final demise after waiting for
* the in-flux status of the knote to clear. Note that once a knote is
* marked KN_WILLDETACH, no code paths may put it into an in-flux state.
*
* Once the special circumstances have been handled, the locks are re-
* acquired in the proper order (object lock -> kq_lock), the knote taken
* out of flux, and any waiters are notified. Because waiters must have
* also dropped *their* locks in order to safely block, they must re-
* validate all of their assumptions; see knote_detach_quiesce(). See also
* the kqueue_register() (EV_ADD, EV_DELETE) and kqueue_scan() (EV_ONESHOT)
* cases.
*
* When kqueue_scan() encounters an in-flux knote, the situation is
* treated like another LWP's list marker.
*
* LISTEN WELL: It is important to not hold knotes in flux for an
* extended period of time! In-flux knotes effectively block any
* progress of the kqueue_scan() operation. Any code paths that place
* knotes in-flux should be careful to not block for indefinite periods
* of time, such as for memory allocation (i.e. KM_NOSLEEP is OK, but
* KM_SLEEP is not).
*/
static krwlock_t kqueue_filter_lock; /* lock on filter lists */
#define KQ_FLUX_WAIT(kq) (void)cv_wait(&kq->kq_cv, &kq->kq_lock)
#define KQ_FLUX_WAKEUP(kq) cv_broadcast(&kq->kq_cv)
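/*
* Helpers for the per-knote in-flux counter described above. All of them
* require kn_kq->kq_lock to be held; kn_enter_flux() refuses to put a
* knote that is already marked KN_WILLDETACH into flux.
*/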
static inline bool
kn_in_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
return KNOTE_TO_KIMPL(kn)->ki_influx != 0;
}
static inline bool
kn_enter_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
if (kn->kn_status & KN_WILLDETACH) {
return false;
}
struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
KASSERT(ki->ki_influx < UINT_MAX);
ki->ki_influx++;
return true;
}
static inline bool
kn_leave_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
KASSERT(ki->ki_influx > 0);
ki->ki_influx--;
return ki->ki_influx == 0;
}
static void
kn_wait_flux(struct knote *kn, bool can_loop)
{
struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
bool loop;
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
/*
* It may not be safe for us to touch the knote again after
* dropping the kq_lock. The caller has let us know in
* 'can_loop'.
*/
for (loop = true; loop && ki->ki_influx != 0; loop = can_loop) {
KQ_FLUX_WAIT(kn->kn_kq);
}
}
#define KNOTE_WILLDETACH(kn) \
do { \
(kn)->kn_status |= KN_WILLDETACH; \
(kn)->kn_kevent.udata = curlwp; \
} while (/*CONSTCOND*/0)
/*
* Wait until the specified knote is in a quiescent state and
* safe to detach. Returns true if we potentially blocked (and
* thus dropped our locks).
*/
static bool
knote_detach_quiesce(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
filedesc_t *fdp = kq->kq_fdp;
KASSERT(mutex_owned(&fdp->fd_lock));
mutex_spin_enter(&kq->kq_lock);
/*
* There are two cases where we might see KN_WILLDETACH here:
*
* 1. Someone else has already started detaching the knote but
* had to wait for it to settle first.
*
* 2. We had to wait for it to settle, and had to come back
* around after re-acquiring the locks.
*
* When KN_WILLDETACH is set, we also set the LWP that claimed
* the prize of finishing the detach in the 'udata' field of the
* knote (which will never be used again for its usual purpose
* once the note is in this state). If it doesn't point to us,
* we must drop the locks and let them in to finish the job.
*
* Otherwise, once we have claimed the knote for ourselves, we
* can finish waiting for it to settle. This is the only scenario
* where touching a detaching knote is safe after dropping the
* locks.
*/
if ((kn->kn_status & KN_WILLDETACH) != 0 &&
kn->kn_kevent.udata != curlwp) {
/*
* N.B. it is NOT safe for us to touch the knote again
* after dropping the locks here. The caller must go
* back around and re-validate everything. However, if
* the knote is in-flux, we want to block to minimize
* busy-looping.
*/
mutex_exit(&fdp->fd_lock);
if (kn_in_flux(kn)) {
kn_wait_flux(kn, false);
mutex_spin_exit(&kq->kq_lock);
return true;
}
mutex_spin_exit(&kq->kq_lock);
preempt_point();
return true;
}
/*
* If we get here, we know that we will be claiming the
* detach responsibilities, or that we already have and
* this is the second attempt after re-validation.
*/
KASSERT((kn->kn_status & KN_WILLDETACH) == 0 ||
kn->kn_kevent.udata == curlwp);
/*
* Similarly, if we get here, either we are just claiming it
* and may have to wait for it to settle, or if this is the
* second attempt after re-validation that no other code paths
* have put it in-flux.
*/
KASSERT((kn->kn_status & KN_WILLDETACH) == 0 ||
kn_in_flux(kn) == false);
KNOTE_WILLDETACH(kn);
if (kn_in_flux(kn)) {
mutex_exit(&fdp->fd_lock);
kn_wait_flux(kn, true);
/*
* It is safe for us to touch the knote again after
* dropping the locks, but the caller must still
* re-validate everything because other aspects of
* the environment may have changed while we blocked.
*/
KASSERT(kn_in_flux(kn) == false);
mutex_spin_exit(&kq->kq_lock);
return true;
}
mutex_spin_exit(&kq->kq_lock);
return false;
}
/*
* Calls into the filterops need to be resilient against things which
* destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
* chasing garbage pointers (to data, or even potentially code in a
* module about to be unloaded). To that end, we acquire the
* knote foplock before calling into the filter ops. When a driver
* (or anything else) is tearing down its klist, klist_fini() enumerates
* each knote, acquires its foplock, and replaces the filterops with a
* nop stub, allowing knote detach (when descriptors are closed) to safely
* proceed.
*/
static int
filter_attach(struct knote *kn)
{
int rv;
KASSERT(knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_attach != NULL);
/*
* N.B. that kn->kn_fop may change as the result of calling
* f_attach(). After f_attach() returns, kn->kn_fop may not
* be modified by code outside of klist_fini().
*/
if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
rv = kn->kn_fop->f_attach(kn);
} else {
KERNEL_LOCK(1, NULL);
rv = kn->kn_fop->f_attach(kn);
KERNEL_UNLOCK_ONE(NULL);
}
return rv;
}
static void
filter_detach(struct knote *kn)
{
KASSERT(knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_detach != NULL);
if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
kn->kn_fop->f_detach(kn);
} else {
KERNEL_LOCK(1, NULL);
kn->kn_fop->f_detach(kn);
KERNEL_UNLOCK_ONE(NULL);
}
}
static int
filter_event(struct knote *kn, long hint, bool submitting)
{
int rv;
/* See knote(). */
KASSERT(submitting || knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_event != NULL);
if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
rv = kn->kn_fop->f_event(kn, hint);
} else {
KERNEL_LOCK(1, NULL);
rv = kn->kn_fop->f_event(kn, hint);
KERNEL_UNLOCK_ONE(NULL);
}
return rv;
}
static int
filter_touch(struct knote *kn, struct kevent *kev, long type)
{
/*
* XXX We cannot assert that the knote foplock is held here
* XXX because we cannot safely acquire it in all cases
* XXX where "touch" will be used in kqueue_scan(). We just
* XXX have to assume that f_touch will always be safe to call,
* XXX and kqueue_register() allows only the two known-safe
* XXX users of that op.
*/
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_touch != NULL);
return kn->kn_fop->f_touch(kn, kev, type);
}
static kauth_listener_t kqueue_listener;
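/*
* kauth(9) listener for KAUTH_PROCESS_KEVENT_FILTER: allow attaching an
* EVFILT_PROC filter when the target process has the requester's uid and
* has not gained privileges through a set-id exec; otherwise defer the
* decision to other listeners.
*/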
static int
kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result;
result = KAUTH_RESULT_DEFER;
p = arg0;
if (action != KAUTH_PROCESS_KEVENT_FILTER)
return result;
if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
ISSET(p->p_flag, PK_SUGID)))
return result;
result = KAUTH_RESULT_ALLOW;
return result;
}
/*
* Initialize the kqueue subsystem.
*/
void
kqueue_init(void)
{
rw_init(&kqueue_filter_lock);
kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
kqueue_listener_cb, NULL);
}
/*
* Find kfilter entry by name, or NULL if not found.
*/
static struct kfilter *
kfilter_byname_sys(const char *name)
{
int i;
KASSERT(rw_lock_held(&kqueue_filter_lock));
for (i = 0; sys_kfilters[i].name != NULL; i++) {
if (strcmp(name, sys_kfilters[i].name) == 0)
return &sys_kfilters[i];
}
return NULL;
}
static struct kfilter *
kfilter_byname_user(const char *name)
{
int i;
KASSERT(rw_lock_held(&kqueue_filter_lock));
/* user filter slots have a NULL name if previously deregistered */
for (i = 0; i < user_kfilterc ; i++) {
if (user_kfilters[i].name != NULL &&
strcmp(name, user_kfilters[i].name) == 0)
return &user_kfilters[i];
}
return NULL;
}
static struct kfilter *
kfilter_byname(const char *name)
{
struct kfilter *kfilter;
KASSERT(rw_lock_held(&kqueue_filter_lock));
if ((kfilter = kfilter_byname_sys(name)) != NULL)
return kfilter;
return kfilter_byname_user(name);
}
/*
* Find kfilter entry by filter id, or NULL if not found.
* Assumes entries are indexed in filter id order, for speed.
*/
static struct kfilter *
kfilter_byfilter(uint32_t filter)
{
struct kfilter *kfilter;
KASSERT(rw_lock_held(&kqueue_filter_lock));
if (filter < EVFILT_SYSCOUNT) /* it's a system filter */
kfilter = &sys_kfilters[filter];
else if (user_kfilters != NULL &&
filter < EVFILT_SYSCOUNT + user_kfilterc)
/* it's a user filter */
kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
else
return (NULL); /* out of range */
KASSERT(kfilter->filter == filter); /* sanity check! */
return (kfilter);
}
/*
* Register a new kfilter. Stores the entry in user_kfilters.
* Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
* If retfilter != NULL, the new filterid is returned in it.
*/
int
kfilter_register(const char *name, const struct filterops *filtops,
int *retfilter)
{
struct kfilter *kfilter;
size_t len;
int i;
if (name == NULL || name[0] == '\0' || filtops == NULL)
return (EINVAL); /* invalid args */
rw_enter(&kqueue_filter_lock, RW_WRITER);
if (kfilter_byname(name) != NULL) {
rw_exit(&kqueue_filter_lock);
return (EEXIST); /* already exists */
}
if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
rw_exit(&kqueue_filter_lock);
return (EINVAL); /* too many */
}
for (i = 0; i < user_kfilterc; i++) {
kfilter = &user_kfilters[i];
if (kfilter->name == NULL) {
/* Previously deregistered slot. Reuse. */
goto reuse;
}
}
/* check if need to grow user_kfilters */
if (user_kfilterc + 1 > user_kfiltermaxc) {
/* Grow in KFILTER_EXTENT chunks. */
user_kfiltermaxc += KFILTER_EXTENT;
len = user_kfiltermaxc * sizeof(*kfilter);
kfilter = kmem_alloc(len, KM_SLEEP);
memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
if (user_kfilters != NULL) {
memcpy(kfilter, user_kfilters, user_kfiltersz);
kmem_free(user_kfilters, user_kfiltersz);
}
user_kfiltersz = len;
user_kfilters = kfilter;
}
/* Adding new slot */
kfilter = &user_kfilters[user_kfilterc++];
reuse:
kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);
kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
if (retfilter != NULL)
*retfilter = kfilter->filter;
rw_exit(&kqueue_filter_lock);
return (0);
}
/*
* Unregister a kfilter previously registered with kfilter_register.
* This retains the filter id, but clears the name and frees filtops (filter
* operations), so that the number isn't reused during a boot.
* Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
*/
int
kfilter_unregister(const char *name)
{
struct kfilter *kfilter;
if (name == NULL || name[0] == '\0')
return (EINVAL); /* invalid name */
rw_enter(&kqueue_filter_lock, RW_WRITER);
if (kfilter_byname_sys(name) != NULL) {
rw_exit(&kqueue_filter_lock);
return (EINVAL); /* can't detach system filters */
}
kfilter = kfilter_byname_user(name);
if (kfilter == NULL) {
rw_exit(&kqueue_filter_lock);
return (ENOENT);
}
if (kfilter->refcnt != 0) {
rw_exit(&kqueue_filter_lock);
return (EBUSY);
}
/* Cast away const (but we know it's safe). */
kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
kfilter->name = NULL; /* mark as `not implemented' */
if (kfilter->filtops != NULL) {
/* Cast away const (but we know it's safe). */
kmem_free(__UNCONST(kfilter->filtops),
sizeof(*kfilter->filtops));
kfilter->filtops = NULL; /* mark as `not implemented' */
}
rw_exit(&kqueue_filter_lock);
return (0);
}
/*
* Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
* descriptors. Calls fileops kqfilter method for given file descriptor.
*/
static int
filt_fileattach(struct knote *kn)
{
file_t *fp;
fp = kn->kn_obj;
return (*fp->f_ops->fo_kqfilter)(fp, kn);
}
/*
* Filter detach method for EVFILT_READ on kqueue descriptor.
*/
static void
filt_kqdetach(struct knote *kn)
{
struct kqueue *kq;
kq = ((file_t *)kn->kn_obj)->f_kqueue;
mutex_spin_enter(&kq->kq_lock);
selremove_knote(&kq->kq_sel, kn);
mutex_spin_exit(&kq->kq_lock);
}
/*
* Filter event method for EVFILT_READ on kqueue descriptor.
*/
/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
struct kqueue *kq;
int rv;
kq = ((file_t *)kn->kn_obj)->f_kqueue;
if (hint != NOTE_SUBMIT)
mutex_spin_enter(&kq->kq_lock);
kn->kn_data = KQ_COUNT(kq);
rv = (kn->kn_data > 0);
if (hint != NOTE_SUBMIT)
mutex_spin_exit(&kq->kq_lock);
return rv;
}
/*
* Filter attach method for EVFILT_PROC.
*/
static int
filt_procattach(struct knote *kn)
{
struct proc *p;
mutex_enter(&proc_lock);
p = proc_find(kn->kn_id);
if (p == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
/*
* Fail if it's not owned by you, or the last exec gave us
* setuid/setgid privs (unless you're root).
*/
mutex_enter(p->p_lock);
mutex_exit(&proc_lock);
if (kauth_authorize_process(curlwp->l_cred,
KAUTH_PROCESS_KEVENT_FILTER, p, NULL, NULL, NULL) != 0) {
mutex_exit(p->p_lock);
return EACCES;
}
kn->kn_obj = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
/*
* NOTE_CHILD is only ever generated internally; don't let it
* leak in from user-space. See knote_proc_fork_track().
*/
kn->kn_sfflags &= ~NOTE_CHILD;
klist_insert(&p->p_klist, kn);
mutex_exit(p->p_lock);
return 0;
}
/*
* Filter detach method for EVFILT_PROC.
*
* The knote may be attached to a different process, which may exit,
* leaving nothing for the knote to be attached to. So when the process
* exits, the knote is marked as DETACHED and also flagged as ONESHOT so
* it will be deleted when read out. However, as part of the knote deletion,
* this routine is called, so a check is needed to avoid actually performing
* a detach, because the original process might not exist any more.
*/
static void
filt_procdetach(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
struct proc *p;
/*
* We have to synchronize with knote_proc_exit(), but we
* are forced to acquire the locks in the wrong order here
* because we can't be sure kn->kn_obj is valid unless
* KN_DETACHED is not set.
*/
again:
mutex_spin_enter(&kq->kq_lock);
if ((kn->kn_status & KN_DETACHED) == 0) {
p = kn->kn_obj;
if (!mutex_tryenter(p->p_lock)) {
mutex_spin_exit(&kq->kq_lock);
preempt_point();
goto again;
}
kn->kn_status |= KN_DETACHED;
klist_remove(&p->p_klist, kn);
mutex_exit(p->p_lock);
}
mutex_spin_exit(&kq->kq_lock);
}
/*
* Filter event method for EVFILT_PROC.
*
* Due to some of the complexities of process locking, we have special
* entry points for delivering knote submissions. filt_proc() is used
* only to check for activation from kqueue_register() and kqueue_scan().
*/
static int
filt_proc(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_kq;
uint32_t fflags;
/*
* Because we share the same klist with signal knotes, just
* ensure that we're not being invoked for the proc-related
* submissions.
*/
KASSERT((hint & (NOTE_EXEC | NOTE_EXIT | NOTE_FORK)) == 0);
mutex_spin_enter(&kq->kq_lock);
fflags = kn->kn_fflags;
mutex_spin_exit(&kq->kq_lock);
return fflags != 0;
}
void
knote_proc_exec(struct proc *p)
{
struct knote *kn, *tmpkn;
struct kqueue *kq;
uint32_t fflags;
mutex_enter(p->p_lock);
SLIST_FOREACH_SAFE(kn, &p->p_klist, kn_selnext, tmpkn) {
/* N.B. EVFILT_SIGNAL knotes are on this same list. */
if (kn->kn_fop == &sig_filtops) {
continue;
}
KASSERT(kn->kn_fop == &proc_filtops);
kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
fflags = (kn->kn_fflags |= (kn->kn_sfflags & NOTE_EXEC));
if (fflags) {
knote_activate_locked(kn);
}
mutex_spin_exit(&kq->kq_lock);
}
mutex_exit(p->p_lock);
}
static int __noinline
knote_proc_fork_track(struct proc *p1, struct proc *p2, struct knote *okn)
{
struct kqueue *kq = okn->kn_kq;
KASSERT(mutex_owned(&kq->kq_lock));
KASSERT(mutex_owned(p1->p_lock));
/*
* We're going to put this knote into flux while we drop
* the locks and create and attach a new knote to track the
* child. If we are not able to enter flux, then this knote
* is about to go away, so skip the notification.
*/
if (!kn_enter_flux(okn)) {
return 0;
}
mutex_spin_exit(&kq->kq_lock);
mutex_exit(p1->p_lock);
/*
* We actually have to register *two* new knotes:
*
* ==> One for the NOTE_CHILD notification. This is a forced
* ONESHOT note.
*
* ==> One to actually track the child process as it subsequently
* forks, execs, and, ultimately, exits.
*
* If we only register a single knote, then it's possible for
* the NOTE_CHILD and NOTE_EXIT to be collapsed into a single
* notification if the child exits before the tracking process
* has received the NOTE_CHILD notification, which applications
* aren't expecting (the event's 'data' field would be clobbered,
* for example).
*
* To do this, what we have here is an **extremely** stripped-down
* version of kqueue_register() that has the following properties:
*
* ==> Does not block to allocate memory. If we are unable
* to allocate memory, we return ENOMEM.
*
* ==> Does not search for existing knotes; we know there
* are not any because this is a new process that isn't
* even visible to other processes yet.
*
* ==> Assumes that the knhash for our kq's descriptor table
* already exists (after all, we're already tracking
* processes with knotes if we got here).
*
* ==> Directly attaches the new tracking knote to the child
* process.
*
* The whole point is to do the minimum amount of work while the
* knote is held in-flux, and to avoid doing extra work in general
* (we already have the new child process; why bother looking it
* up again?).
*/
filedesc_t *fdp = kq->kq_fdp;
struct knote *knchild, *kntrack;
int error = 0;
knchild = knote_alloc(false);
kntrack = knote_alloc(false);
if (__predict_false(knchild == NULL || kntrack == NULL)) {
error = ENOMEM;
goto out;
}
kntrack->kn_obj = p2;
kntrack->kn_id = p2->p_pid;
kntrack->kn_kq = kq;
kntrack->kn_fop = okn->kn_fop;
kntrack->kn_kfilter = okn->kn_kfilter;
kntrack->kn_sfflags = okn->kn_sfflags;
kntrack->kn_sdata = p1->p_pid;
kntrack->kn_kevent.ident = p2->p_pid;
kntrack->kn_kevent.filter = okn->kn_filter;
kntrack->kn_kevent.flags =
okn->kn_flags | EV_ADD | EV_ENABLE | EV_CLEAR;
kntrack->kn_kevent.fflags = 0;
kntrack->kn_kevent.data = 0;
kntrack->kn_kevent.udata = okn->kn_kevent.udata; /* preserve udata */
/*
* The child note does not need to be attached to the
* new proc's klist at all.
*/
*knchild = *kntrack;
knchild->kn_status = KN_DETACHED;
knchild->kn_sfflags = 0;
knchild->kn_kevent.flags |= EV_ONESHOT;
knchild->kn_kevent.fflags = NOTE_CHILD;
knchild->kn_kevent.data = p1->p_pid; /* parent */
mutex_enter(&fdp->fd_lock);
/*
* We need to check to see if the kq is closing, and skip
* attaching the knote if so. Normally, this isn't necessary
* when coming in the front door because the file descriptor
* layer will synchronize this.
*
* It's safe to test KQ_CLOSING without taking the kq_lock
* here because that flag is only ever set when the fd_lock
* is also held.
*/
if (__predict_false(kq->kq_count & KQ_CLOSING)) {
mutex_exit(&fdp->fd_lock);
goto out;
}
/*
* We do the "insert into FD table" and "attach to klist" steps
* in the opposite order of kqueue_register() here to avoid
* having to take p2->p_lock twice. But this is OK because we
* hold fd_lock across the entire operation.
*/
mutex_enter(p2->p_lock);
error = kauth_authorize_process(curlwp->l_cred,
KAUTH_PROCESS_KEVENT_FILTER, p2, NULL, NULL, NULL);
if (__predict_false(error != 0)) {
mutex_exit(p2->p_lock);
mutex_exit(&fdp->fd_lock);
error = EACCES;
goto out;
}
klist_insert(&p2->p_klist, kntrack);
mutex_exit(p2->p_lock);
KASSERT(fdp->fd_knhashmask != 0);
KASSERT(fdp->fd_knhash != NULL);
struct klist *list = &fdp->fd_knhash[KN_HASH(kntrack->kn_id,
fdp->fd_knhashmask)];
SLIST_INSERT_HEAD(list, kntrack, kn_link);
SLIST_INSERT_HEAD(list, knchild, kn_link);
/* This adds references for knchild *and* kntrack. */
atomic_add_int(&kntrack->kn_kfilter->refcnt, 2);
knote_activate(knchild);
kntrack = NULL;
knchild = NULL;
mutex_exit(&fdp->fd_lock);
out:
if (__predict_false(knchild != NULL)) {
knote_free(knchild);
}
if (__predict_false(kntrack != NULL)) {
knote_free(kntrack);
}
mutex_enter(p1->p_lock);
mutex_spin_enter(&kq->kq_lock);
if (kn_leave_flux(okn)) {
KQ_FLUX_WAKEUP(kq);
}
return error;
}
void
knote_proc_fork(struct proc *p1, struct proc *p2)
{
struct knote *kn;
struct kqueue *kq;
uint32_t fflags;
mutex_enter(p1->p_lock);
/*
* N.B. We DO NOT use SLIST_FOREACH_SAFE() here because we
* don't want to pre-fetch the next knote; in the event we
* have to drop p_lock, we will have put the knote in-flux,
* meaning that no one will be able to detach it until we
* have taken the knote out of flux. However, that does
* NOT stop someone else from detaching the next note in the
* list while we have it unlocked. Thus, we want to fetch
* the next note in the list only after we have re-acquired
* the lock, and using SLIST_FOREACH() will satisfy that.
*/
SLIST_FOREACH(kn, &p1->p_klist, kn_selnext) {
/* N.B. EVFILT_SIGNAL knotes are on this same list. */
if (kn->kn_fop == &sig_filtops) {
continue;
}
KASSERT(kn->kn_fop == &proc_filtops);
kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_fflags |= (kn->kn_sfflags & NOTE_FORK);
if (__predict_false(kn->kn_sfflags & NOTE_TRACK)) {
/*
* This will drop kq_lock and p_lock and
* re-acquire them before it returns.
*/
if (knote_proc_fork_track(p1, p2, kn)) {
kn->kn_fflags |= NOTE_TRACKERR;
}
KASSERT(mutex_owned(p1->p_lock));
KASSERT(mutex_owned(&kq->kq_lock));
}
fflags = kn->kn_fflags;
if (fflags) {
knote_activate_locked(kn);
}
mutex_spin_exit(&kq->kq_lock);
}
mutex_exit(p1->p_lock);
}
void
knote_proc_exit(struct proc *p)
{
struct knote *kn;
struct kqueue *kq;
KASSERT(mutex_owned(p->p_lock));
while (!SLIST_EMPTY(&p->p_klist)) {
kn = SLIST_FIRST(&p->p_klist);
kq = kn->kn_kq;
KASSERT(kn->kn_obj == p);
mutex_spin_enter(&kq->kq_lock);
kn->kn_data = P_WAITSTATUS(p);
/*
* Mark as ONESHOT, so that the knote is g/c'ed
* when read.
*/
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
kn->kn_fflags |= kn->kn_sfflags & NOTE_EXIT;
/*
* Detach the knote from the process and mark it as such.
* N.B. EVFILT_SIGNAL are also on p_klist, but by the
* time we get here, all open file descriptors for this
* process have been released, meaning that signal knotes
* will have already been detached.
*
* We need to synchronize this with filt_procdetach().
*/
KASSERT(kn->kn_fop == &proc_filtops);
if ((kn->kn_status & KN_DETACHED) == 0) {
kn->kn_status |= KN_DETACHED;
SLIST_REMOVE_HEAD(&p->p_klist, kn_selnext);
}
/*
* Always activate the knote for NOTE_EXIT regardless
* of whether or not the listener cares about it.
* This matches historical behavior.
*/
knote_activate_locked(kn);
mutex_spin_exit(&kq->kq_lock);
}
}
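/*
* FILT_TIMER_NOSCHED is a sentinel "ticks" value: when stored in kn_sdata
* (or returned by filt_timercompute()) it means the timer callout must not
* be scheduled or rescheduled.
*/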
#define FILT_TIMER_NOSCHED ((uintptr_t)-1)
static int
filt_timercompute(struct kevent *kev, uintptr_t *tticksp)
{
struct timespec ts;
uintptr_t tticks;
if (kev->fflags & ~(NOTE_TIMER_UNITMASK | NOTE_ABSTIME)) {
return EINVAL;
}
/*
* Convert the event 'data' to a timespec, then convert the
* timespec to callout ticks.
*/
switch (kev->fflags & NOTE_TIMER_UNITMASK) {
case NOTE_SECONDS:
ts.tv_sec = kev->data;
ts.tv_nsec = 0;
break;
case NOTE_MSECONDS: /* == historical value 0 */
ts.tv_sec = kev->data / 1000;
ts.tv_nsec = (kev->data % 1000) * 1000000;
break;
case NOTE_USECONDS:
ts.tv_sec = kev->data / 1000000;
ts.tv_nsec = (kev->data % 1000000) * 1000;
break;
case NOTE_NSECONDS:
ts.tv_sec = kev->data / 1000000000;
ts.tv_nsec = kev->data % 1000000000;
break;
default:
return EINVAL;
}
if (kev->fflags & NOTE_ABSTIME) {
struct timespec deadline = ts;
/*
* Get current time.
*
* XXX This is CLOCK_REALTIME. There is no way to
* XXX specify CLOCK_MONOTONIC.
*/
nanotime(&ts);
/* Absolute timers do not repeat. */
kev->data = FILT_TIMER_NOSCHED;
/* If we're past the deadline, then the event will fire. */
if (timespeccmp(&deadline, &ts, <=)) {
tticks = FILT_TIMER_NOSCHED;
goto out;
}
/* Calculate how much time is left. */
timespecsub(&deadline, &ts, &ts);
} else {
/* EV_CLEAR automatically set for relative timers. */
kev->flags |= EV_CLEAR;
}
tticks = tstohz(&ts);
/* if the supplied value is under our resolution, use 1 tick */
if (tticks == 0) {
if (kev->data == 0)
return EINVAL;
tticks = 1;
} else if (tticks > INT_MAX) {
return EINVAL;
}
if ((kev->flags & EV_ONESHOT) != 0) {
/* Timer does not repeat. */
kev->data = FILT_TIMER_NOSCHED;
} else {
KASSERT((uintptr_t)tticks != FILT_TIMER_NOSCHED);
kev->data = tticks;
}
out:
*tticksp = tticks;
return 0;
}
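/*
* Worked example of the conversion above (illustration only; assumes
* hz = 100, i.e. a 10ms tick): a relative EVFILT_TIMER with
* fflags = NOTE_MSECONDS and data = 1500 yields ts = { 1, 500000000 },
* which tstohz() turns into roughly 150 callout ticks. EV_CLEAR is set
* automatically, and kev->data keeps the tick count so filt_timerexpire()
* can reschedule the callout on each expiry.
*/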
static void
filt_timerexpire(void *knx)
{
struct knote *kn = knx;
struct kqueue *kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_data++;
knote_activate_locked(kn);
if (kn->kn_sdata != FILT_TIMER_NOSCHED) {
KASSERT(kn->kn_sdata > 0);
KASSERT(kn->kn_sdata <= INT_MAX);
callout_schedule((callout_t *)kn->kn_hook,
(int)kn->kn_sdata);
}
mutex_spin_exit(&kq->kq_lock);
}
static inline void
filt_timerstart(struct knote *kn, uintptr_t tticks)
{
callout_t *calloutp = kn->kn_hook;
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
KASSERT(!callout_pending(calloutp));
if (__predict_false(tticks == FILT_TIMER_NOSCHED)) {
kn->kn_data = 1;
} else {
KASSERT(tticks <= INT_MAX);
callout_reset(calloutp, (int)tticks, filt_timerexpire, kn);
}
}
static int
filt_timerattach(struct knote *kn)
{
callout_t *calloutp;
struct kqueue *kq;
uintptr_t tticks;
int error;
struct kevent kev = {
.flags = kn->kn_flags,
.fflags = kn->kn_sfflags,
.data = kn->kn_sdata,
};
error = filt_timercompute(&kev, &tticks);
if (error) {
return error;
}
if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
(calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
atomic_dec_uint(&kq_ncallouts);
return ENOMEM;
}
callout_init(calloutp, CALLOUT_MPSAFE);
kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_sdata = kev.data;
kn->kn_flags = kev.flags;
KASSERT(kn->kn_sfflags == kev.fflags);
kn->kn_hook = calloutp;
filt_timerstart(kn, tticks);
mutex_spin_exit(&kq->kq_lock);
return (0);
}
static void
filt_timerdetach(struct knote *kn)
{
callout_t *calloutp;
struct kqueue *kq = kn->kn_kq;
/* prevent rescheduling when we expire */
mutex_spin_enter(&kq->kq_lock);
kn->kn_sdata = FILT_TIMER_NOSCHED;
mutex_spin_exit(&kq->kq_lock);
calloutp = (callout_t *)kn->kn_hook;
/*
* Attempt to stop the callout. This will block if it's
* already running.
*/
callout_halt(calloutp, NULL);
callout_destroy(calloutp);
kmem_free(calloutp, sizeof(*calloutp));
atomic_dec_uint(&kq_ncallouts);
}
static int
filt_timertouch(struct knote *kn, struct kevent *kev, long type)
{
struct kqueue *kq = kn->kn_kq;
callout_t *calloutp;
uintptr_t tticks;
int error;
KASSERT(mutex_owned(&kq->kq_lock));
switch (type) {
case EVENT_REGISTER:
/* Only relevant for EV_ADD. */
if ((kev->flags & EV_ADD) == 0) {
return 0;
}
/*
* Stop the timer, under the assumption that if
* an application is re-configuring the timer,
* they no longer care about the old one. We
* can safely drop the kq_lock while we wait
* because fdp->fd_lock will be held throughout,
* ensuring that no one can sneak in with an
* EV_DELETE or close the kq.
*/
KASSERT(mutex_owned(&kq->kq_fdp->fd_lock));
calloutp = kn->kn_hook;
callout_halt(calloutp, &kq->kq_lock);
KASSERT(mutex_owned(&kq->kq_lock));
knote_deactivate_locked(kn);
kn->kn_data = 0;
error = filt_timercompute(kev, &tticks);
if (error) {
return error;
}
kn->kn_sdata = kev->data;
kn->kn_flags = kev->flags;
kn->kn_sfflags = kev->fflags;
filt_timerstart(kn, tticks);
break;
case EVENT_PROCESS:
*kev = kn->kn_kevent;
break;
default:
panic("%s: invalid type (%ld)", __func__, type);
}
return 0;
}
static int
filt_timer(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_kq;
int rv;
mutex_spin_enter(&kq->kq_lock);
rv = (kn->kn_data != 0);
mutex_spin_exit(&kq->kq_lock);
return rv;
}
static int
filt_userattach(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
/*
* EVFILT_USER knotes are not attached to anything in the kernel.
*/
mutex_spin_enter(&kq->kq_lock);
kn->kn_hook = NULL;
if (kn->kn_fflags & NOTE_TRIGGER)
kn->kn_hookid = 1;
else
kn->kn_hookid = 0;
mutex_spin_exit(&kq->kq_lock);
return (0);
}
static void
filt_userdetach(struct knote *kn)
{
/*
* EVFILT_USER knotes are not attached to anything in the kernel.
*/
}
static int
filt_user(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_kq;
int hookid;
mutex_spin_enter(&kq->kq_lock);
hookid = kn->kn_hookid;
mutex_spin_exit(&kq->kq_lock);
return hookid;
}
static int
filt_usertouch(struct knote *kn, struct kevent *kev, long type)
{
int ffctrl;
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
switch (type) {
case EVENT_REGISTER:
if (kev->fflags & NOTE_TRIGGER)
kn->kn_hookid = 1;
ffctrl = kev->fflags & NOTE_FFCTRLMASK;
kev->fflags &= NOTE_FFLAGSMASK;
switch (ffctrl) {
case NOTE_FFNOP:
break;
case NOTE_FFAND:
kn->kn_sfflags &= kev->fflags;
break;
case NOTE_FFOR:
kn->kn_sfflags |= kev->fflags;
break;
case NOTE_FFCOPY:
kn->kn_sfflags = kev->fflags;
break;
default:
/* XXX Return error? */
break;
}
kn->kn_sdata = kev->data;
if (kev->flags & EV_CLEAR) {
kn->kn_hookid = 0;
kn->kn_data = 0;
kn->kn_fflags = 0;
}
break;
case EVENT_PROCESS:
*kev = kn->kn_kevent;
kev->fflags = kn->kn_sfflags;
kev->data = kn->kn_sdata;
if (kn->kn_flags & EV_CLEAR) {
kn->kn_hookid = 0;
kn->kn_data = 0;
kn->kn_fflags = 0;
}
break;
default:
panic("filt_usertouch() - invalid type (%ld)", type);
break;
}
return 0;
}
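/*
* Illustrative userland use of EVFILT_USER (a sketch, not part of this
* file): one thread registers the event and waits on the kqueue, another
* triggers it.
*
*	struct kevent kev;
*	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
*	kevent(kq, &kev, 1, NULL, 0, NULL);		register
*	...
*	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
*	kevent(kq, &kev, 1, NULL, 0, NULL);		trigger; waiters wake
*/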
/*
* filt_seltrue:
*
* This filter "event" routine simulates seltrue().
*/
int
filt_seltrue(struct knote *kn, long hint)
{
/*
* We don't know how much data can be read/written,
* but we know that it *can* be. This is about as
* good as select/poll does as well.
*/
kn->kn_data = 0;
return (1);
}
/*
* This provides full kqfilter entry for device switch tables, which
* has same effect as filter using filt_seltrue() as filter method.
*/
static void
filt_seltruedetach(struct knote *kn)
{
/* Nothing to do */
}
const struct filterops seltrue_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_seltruedetach,
.f_event = filt_seltrue,
};
int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
switch (kn->kn_filter) {
case EVFILT_READ:
case EVFILT_WRITE:
kn->kn_fop = &seltrue_filtops;
break;
default:
return (EINVAL);
}
/* Nothing more to do */
return (0);
}
/*
* kqueue(2) system call.
*/
static int
kqueue1(struct lwp *l, int flags, register_t *retval)
{
struct kqueue *kq;
file_t *fp;
int fd, error;
if ((error = fd_allocfile(&fp, &fd)) != 0)
return error;
fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
fp->f_type = DTYPE_KQUEUE;
fp->f_ops = &kqueueops;
kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
cv_init(&kq->kq_cv, "kqueue");
selinit(&kq->kq_sel);
TAILQ_INIT(&kq->kq_head);
fp->f_kqueue = kq;
*retval = fd;
kq->kq_fdp = curlwp->l_fd;
fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
fd_affix(curproc, fp, fd);
return error;
}
/*
* kqueue(2) system call.
*/
int
sys_kqueue(struct lwp *l, const void *v, register_t *retval)
{
return kqueue1(l, 0, retval);
}
int
sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
register_t *retval)
{
/* {
syscallarg(int) flags;
} */
return kqueue1(l, SCARG(uap, flags), retval);
}
/*
* kevent(2) system call.
*/
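/*
* Native copy helpers hooked up via struct kevent_ops below: fetch a block
* of n changes from the user's changelist starting at 'index', and copy a
* block of n results out to the user's eventlist starting at 'index'.
*/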
int
kevent_fetch_changes(void *ctx, const struct kevent *changelist,
struct kevent *changes, size_t index, int n)
{
return copyin(changelist + index, changes, n * sizeof(*changes));
}
int
kevent_put_events(void *ctx, struct kevent *events,
struct kevent *eventlist, size_t index, int n)
{
return copyout(events, eventlist + index, n * sizeof(*events));
}
static const struct kevent_ops kevent_native_ops = {
.keo_private = NULL,
.keo_fetch_timeout = copyin,
.keo_fetch_changes = kevent_fetch_changes,
.keo_put_events = kevent_put_events,
};
int
sys___kevent100(struct lwp *l, const struct sys___kevent100_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const struct kevent *) changelist;
syscallarg(size_t) nchanges;
syscallarg(struct kevent *) eventlist;
syscallarg(size_t) nevents;
syscallarg(const struct timespec *) timeout;
} */
return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
SCARG(uap, timeout), &kevent_native_ops);
}
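/*
* Illustrative userland call sequence for the system calls above (a sketch
* only; "fd" stands for an arbitrary descriptor being monitored):
*
*	struct kevent ev;
*	int kq = kqueue();
*	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
*	kevent(kq, &ev, 1, NULL, 0, NULL);		register only
*	int n = kevent(kq, NULL, 0, &ev, 1, NULL);	wait for one event
*/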
int
kevent1(register_t *retval, int fd,
const struct kevent *changelist, size_t nchanges,
struct kevent *eventlist, size_t nevents,
const struct timespec *timeout,
const struct kevent_ops *keops)
{
struct kevent *kevp;
struct kqueue *kq;
struct timespec ts;
size_t i, n, ichange;
int nerrors, error;
struct kevent kevbuf[KQ_NEVENTS]; /* approx 300 bytes on 64-bit */
file_t *fp;
/* check that we're dealing with a kq */
fp = fd_getfile(fd);
if (fp == NULL)
return (EBADF);
if (fp->f_type != DTYPE_KQUEUE) {
fd_putfile(fd);
return (EBADF);
}
if (timeout != NULL) {
error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
if (error)
goto done;
timeout = &ts;
}
kq = fp->f_kqueue;
nerrors = 0;
ichange = 0;
/* traverse list of events to register */
while (nchanges > 0) {
n = MIN(nchanges, __arraycount(kevbuf));
error = (*keops->keo_fetch_changes)(keops->keo_private,
changelist, kevbuf, ichange, n);
if (error)
goto done;
for (i = 0; i < n; i++) {
kevp = &kevbuf[i];
kevp->flags &= ~EV_SYSFLAGS;
/* register each knote */
error = kqueue_register(kq, kevp);
if (!error && !(kevp->flags & EV_RECEIPT))
continue;
if (nevents == 0)
goto done;
kevp->flags = EV_ERROR;
kevp->data = error;
error = (*keops->keo_put_events)
(keops->keo_private, kevp,
eventlist, nerrors, 1);
if (error)
goto done;
nevents--;
nerrors++;
}
nchanges -= n; /* update the results */
ichange += n;
}
if (nerrors) {
*retval = nerrors;
error = 0;
goto done;
}
/* actually scan through the events */
error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
kevbuf, __arraycount(kevbuf));
done:
fd_putfile(fd);
return (error);
}
/*
* Register a given kevent kev onto the kqueue
*/
static int
kqueue_register(struct kqueue *kq, struct kevent *kev)
{
struct kfilter *kfilter;
filedesc_t *fdp;
file_t *fp;
fdfile_t *ff;
struct knote *kn, *newkn;
struct klist *list;
int error, fd, rv;
fdp = kq->kq_fdp;
fp = NULL;
kn = NULL;
error = 0;
fd = 0;
newkn = knote_alloc(true);
rw_enter(&kqueue_filter_lock, RW_READER);
kfilter = kfilter_byfilter(kev->filter);
if (kfilter == NULL || kfilter->filtops == NULL) {
/* filter not found nor implemented */
rw_exit(&kqueue_filter_lock);
knote_free(newkn);
return (EINVAL);
}
/* search if knote already exists */
if (kfilter->filtops->f_flags & FILTEROP_ISFD) {
/* monitoring a file descriptor */
/* validate descriptor */
if (kev->ident > INT_MAX
|| (fp = fd_getfile(fd = kev->ident)) == NULL) {
rw_exit(&kqueue_filter_lock);
knote_free(newkn);
return EBADF;
}
mutex_enter(&fdp->fd_lock);
ff = fdp->fd_dt->dt_ff[fd];
if (ff->ff_refcnt & FR_CLOSING) {
error = EBADF;
goto doneunlock;
}
if (fd <= fdp->fd_lastkqfile) {
SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
if (kq == kn->kn_kq &&
kev->filter == kn->kn_filter)
break;
}
}
} else {
/*
* not monitoring a file descriptor, so
* lookup knotes in internal hash table
*/
mutex_enter(&fdp->fd_lock);
if (fdp->fd_knhashmask != 0) {
list = &fdp->fd_knhash[
KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
SLIST_FOREACH(kn, list, kn_link) {
if (kev->ident == kn->kn_id &&
kq == kn->kn_kq &&
kev->filter == kn->kn_filter)
break;
}
}
}
/* It's safe to test KQ_CLOSING while holding only the fd_lock. */
KASSERT(mutex_owned(&fdp->fd_lock));
KASSERT((kq->kq_count & KQ_CLOSING) == 0);
/*
* kn now contains the matching knote, or NULL if no match
*/
if (kn == NULL) {
if (kev->flags & EV_ADD) {
/* create new knote */
kn = newkn;
newkn = NULL;
kn->kn_obj = fp;
kn->kn_id = kev->ident;
kn->kn_kq = kq;
kn->kn_fop = kfilter->filtops;
kn->kn_kfilter = kfilter;
kn->kn_sfflags = kev->fflags;
kn->kn_sdata = kev->data;
kev->fflags = 0;
kev->data = 0;
kn->kn_kevent = *kev;
KASSERT(kn->kn_fop != NULL);
/*
* XXX Allow only known-safe users of f_touch.
* XXX See filter_touch() for details.
*/
if (kn->kn_fop->f_touch != NULL &&
kn->kn_fop != &timer_filtops &&
kn->kn_fop != &user_filtops) {
error = ENOTSUP;
goto fail_ev_add;
}
/*
* apply reference count to knote structure, and
* do not release it at the end of this routine.
*/
fp = NULL;
if (!(kn->kn_fop->f_flags & FILTEROP_ISFD)) {
/*
* If knote is not on an fd, store on
* internal hash table.
*/
if (fdp->fd_knhashmask == 0) {
/* XXXAD can block with fd_lock held */
fdp->fd_knhash = hashinit(KN_HASHSIZE,
HASH_LIST, true,
&fdp->fd_knhashmask);
}
list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
fdp->fd_knhashmask)];
} else {
/* Otherwise, knote is on an fd. */
list = (struct klist *)
&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
if ((int)kn->kn_id > fdp->fd_lastkqfile)
fdp->fd_lastkqfile = kn->kn_id;
}
SLIST_INSERT_HEAD(list, kn, kn_link);
/*
* N.B. kn->kn_fop may change as the result
* of filter_attach()!
*/
knote_foplock_enter(kn);
error = filter_attach(kn);
if (error != 0) {
#ifdef DEBUG
struct proc *p = curlwp->l_proc;
const file_t *ft = kn->kn_obj;
printf("%s: %s[%d]: event type %d not "
"supported for file type %d/%s "
"(error %d)\n", __func__,
p->p_comm, p->p_pid,
kn->kn_filter, ft ? ft->f_type : -1,
ft ? ft->f_ops->fo_name : "?", error);
#endif
fail_ev_add:
/*
* N.B. no need to check for this note to
* be in-flux, since it was never visible
* to the monitored object.
*
* knote_detach() drops fdp->fd_lock
*/
knote_foplock_exit(kn);
mutex_enter(&kq->kq_lock);
KNOTE_WILLDETACH(kn);
KASSERT(kn_in_flux(kn) == false);
mutex_exit(&kq->kq_lock);
knote_detach(kn, fdp, false);
goto done;
}
atomic_inc_uint(&kfilter->refcnt);
goto done_ev_add;
} else {
/* No matching knote and the EV_ADD flag is not set. */
error = ENOENT;
goto doneunlock;
}
}
if (kev->flags & EV_DELETE) {
/*
* Let the world know that this knote is about to go
* away, and wait for it to settle if it's currently
* in-flux.
*/
mutex_spin_enter(&kq->kq_lock);
if (kn->kn_status & KN_WILLDETACH) {
/*
* This knote is already on its way out,
* so just be done.
*/
mutex_spin_exit(&kq->kq_lock);
goto doneunlock;
}
KNOTE_WILLDETACH(kn);
if (kn_in_flux(kn)) {
mutex_exit(&fdp->fd_lock);
/*
* It's safe for us to conclusively wait for
* this knote to settle because we know we'll
* be completing the detach.
*/
kn_wait_flux(kn, true);
KASSERT(kn_in_flux(kn) == false);
mutex_spin_exit(&kq->kq_lock);
mutex_enter(&fdp->fd_lock);
} else {
mutex_spin_exit(&kq->kq_lock);
}
/* knote_detach() drops fdp->fd_lock */
knote_detach(kn, fdp, true);
goto done;
}
/*
* The user may change some filter values after the
* initial EV_ADD, but doing so will not reset any
* filters which have already been triggered.
*/
knote_foplock_enter(kn);
kn->kn_kevent.udata = kev->udata;
KASSERT(kn->kn_fop != NULL);
if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
kn->kn_fop->f_touch != NULL) {
mutex_spin_enter(&kq->kq_lock);
error = filter_touch(kn, kev, EVENT_REGISTER);
mutex_spin_exit(&kq->kq_lock);
if (__predict_false(error != 0)) {
/* Never a new knote (which would consume newkn). */
KASSERT(newkn != NULL);
knote_foplock_exit(kn);
goto doneunlock;
}
} else {
kn->kn_sfflags = kev->fflags;
kn->kn_sdata = kev->data;
}
/*
* We can get here if we are trying to attach
* an event to a file descriptor that does not
* support events, and the attach routine is
* broken and does not return an error.
*/
done_ev_add:
rv = filter_event(kn, 0, false);
if (rv)
knote_activate(kn);
knote_foplock_exit(kn);
/* disable knote */
if ((kev->flags & EV_DISABLE)) {
mutex_spin_enter(&kq->kq_lock);
if ((kn->kn_status & KN_DISABLED) == 0)
kn->kn_status |= KN_DISABLED;
mutex_spin_exit(&kq->kq_lock);
}
/* enable knote */
if ((kev->flags & EV_ENABLE)) {
knote_enqueue(kn);
}
doneunlock:
mutex_exit(&fdp->fd_lock);
done:
rw_exit(&kqueue_filter_lock);
if (newkn != NULL)
knote_free(newkn);
if (fp != NULL)
fd_putfile(fd);
return (error);
}
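/*
* KN_FMT: render a knote's kn_status flag bits into 'buf' with snprintb(9)
* for the diagnostic output below.
*/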
#define KN_FMT(buf, kn) \
(snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)
#if defined(DDB)
void
kqueue_printit(struct kqueue *kq, bool full, void (*pr)(const char *, ...))
{
const struct knote *kn;
u_int count;
int nmarker;
char buf[128];
count = 0;
nmarker = 0;
(*pr)("kqueue %p (restart=%d count=%u):\n", kq,
!!(kq->kq_count & KQ_RESTART), KQ_COUNT(kq));
(*pr)(" Queued knotes:\n");
TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
if (kn->kn_status & KN_MARKER) {
nmarker++;
} else {
count++;
}
(*pr)(" knote %p: kq=%p status=%s\n",
kn, kn->kn_kq, KN_FMT(buf, kn));
(*pr)(" id=0x%lx (%lu) filter=%d\n",
(u_long)kn->kn_id, (u_long)kn->kn_id, kn->kn_filter);
if (kn->kn_kq != kq) {
(*pr)(" !!! kn->kn_kq != kq\n");
}
}
if (count != KQ_COUNT(kq)) {
(*pr)(" !!! count(%u) != KQ_COUNT(%u)\n",
count, KQ_COUNT(kq));
}
}
#endif /* DDB */
#if defined(DEBUG)
static void
kqueue_check(const char *func, size_t line, const struct kqueue *kq)
{
const struct knote *kn;
u_int count;
int nmarker;
char buf[128];
KASSERT(mutex_owned(&kq->kq_lock));
count = 0;
nmarker = 0;
TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
func, line, kq, kn, KN_FMT(buf, kn));
}
if ((kn->kn_status & KN_MARKER) == 0) {
if (kn->kn_kq != kq) {
panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
func, line, kq, kn, kn->kn_kq,
KN_FMT(buf, kn));
}
if ((kn->kn_status & KN_ACTIVE) == 0) {
panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
func, line, kq, kn, KN_FMT(buf, kn));
}
count++;
if (count > KQ_COUNT(kq)) {
panic("%s,%zu: kq=%p kq->kq_count(%u) != "
"count(%d), nmarker=%d",
func, line, kq, KQ_COUNT(kq), count,
nmarker);
}
} else {
nmarker++;
}
}
}
#define kq_check(a) kqueue_check(__func__, __LINE__, (a))
#else /* defined(DEBUG) */
#define kq_check(a) /* nothing */
#endif /* defined(DEBUG) */
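/*
* Force threads sleeping in kqueue_scan() to wake up and return ERESTART
* so that they drop their reference to the file.
*/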
static void
kqueue_restart(file_t *fp)
{
struct kqueue *kq = fp->f_kqueue;
KASSERT(kq != NULL);
mutex_spin_enter(&kq->kq_lock);
kq->kq_count |= KQ_RESTART;
cv_broadcast(&kq->kq_cv);
mutex_spin_exit(&kq->kq_lock);
}
static int
kqueue_fpathconf(struct file *fp, int name, register_t *retval)
{
return EINVAL;
}
/*
* Scan through the list of events on fp (for a maximum of maxevents),
* returning the results to ulistp. Timeout is determined by tsp: if
* NULL, wait indefinitely; if zero-valued, perform a poll; otherwise wait
* as appropriate.
*/
static int
kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
const struct timespec *tsp, register_t *retval,
const struct kevent_ops *keops, struct kevent *kevbuf,
size_t kevcnt)
{
struct kqueue *kq;
struct kevent *kevp;
struct timespec ats, sleepts;
struct knote *kn, *marker;
struct knote_impl morker;
size_t count, nkev, nevents;
int timeout, error, touch, rv, influx;
filedesc_t *fdp;
fdp = curlwp->l_fd;
kq = fp->f_kqueue;
count = maxevents;
nkev = nevents = error = 0;
if (count == 0) {
*retval = 0;
return 0;
}
if (tsp) { /* timeout supplied */
ats = *tsp;
if (inittimeleft(&ats, &sleepts) == -1) {
*retval = maxevents;
return EINVAL;
}
timeout = tstohz(&ats);
if (timeout <= 0)
timeout = -1; /* do poll */
} else {
/* no timeout, wait forever */
timeout = 0;
}
memset(&morker, 0, sizeof(morker));
marker = &morker.ki_knote;
marker->kn_kq = kq;
marker->kn_status = KN_MARKER;
mutex_spin_enter(&kq->kq_lock);
retry:
kevp = kevbuf;
if (KQ_COUNT(kq) == 0) {
if (timeout >= 0) {
error = cv_timedwait_sig(&kq->kq_cv,
&kq->kq_lock, timeout);
if (error == 0) {
if (KQ_COUNT(kq) == 0 &&
(kq->kq_count & KQ_RESTART)) {
/* return to clear file reference */
error = ERESTART;
} else if (tsp == NULL || (timeout =
gettimeleft(&ats, &sleepts)) > 0) {
goto retry;
}
} else {
/* don't restart after signals... */
if (error == ERESTART)
error = EINTR;
if (error == EWOULDBLOCK)
error = 0;
}
}
mutex_spin_exit(&kq->kq_lock);
goto done;
}
/* mark end of knote list */
TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
influx = 0;
/*
* Acquire the fdp->fd_lock interlock to avoid races with
* file creation/destruction from other threads.
*/
mutex_spin_exit(&kq->kq_lock);
relock:
mutex_enter(&fdp->fd_lock);
mutex_spin_enter(&kq->kq_lock);
while (count != 0) {
/*
* Get next knote. We are guaranteed this will never
* be NULL because of the marker we inserted above.
*/
kn = TAILQ_FIRST(&kq->kq_head);
bool kn_is_other_marker =
(kn->kn_status & KN_MARKER) != 0 && kn != marker;
bool kn_is_detaching = (kn->kn_status & KN_WILLDETACH) != 0;
bool kn_is_in_flux = kn_in_flux(kn);
/*
* If we found a marker that's not ours, or this knote
* is in a state of flux, then wait for everything to
* settle down and go around again.
*/
if (kn_is_other_marker || kn_is_detaching || kn_is_in_flux) {
if (influx) {
influx = 0;
KQ_FLUX_WAKEUP(kq);
}
mutex_exit(&fdp->fd_lock);
if (kn_is_other_marker || kn_is_in_flux) {
KQ_FLUX_WAIT(kq);
mutex_spin_exit(&kq->kq_lock);
} else {
/*
* Detaching but not in-flux? Someone is
* actively trying to finish the job; just
* go around and try again.
*/
KASSERT(kn_is_detaching);
mutex_spin_exit(&kq->kq_lock);
preempt_point();
}
goto relock;
}
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
if (kn == marker) {
/* it's our marker, stop */
KQ_FLUX_WAKEUP(kq);
if (count == maxevents) {
mutex_exit(&fdp->fd_lock);
goto retry;
}
break;
}
KASSERT((kn->kn_status & KN_BUSY) == 0);
kq_check(kq);
kn->kn_status &= ~KN_QUEUED;
kn->kn_status |= KN_BUSY;
kq_check(kq);
if (kn->kn_status & KN_DISABLED) {
kn->kn_status &= ~KN_BUSY;
kq->kq_count--;
/* don't want disabled events */
continue;
}
if ((kn->kn_flags & EV_ONESHOT) == 0) {
mutex_spin_exit(&kq->kq_lock);
KASSERT(mutex_owned(&fdp->fd_lock));
knote_foplock_enter(kn);
rv = filter_event(kn, 0, false);
knote_foplock_exit(kn);
mutex_spin_enter(&kq->kq_lock);
/* Re-poll if note was re-enqueued. */
if ((kn->kn_status & KN_QUEUED) != 0) {
kn->kn_status &= ~KN_BUSY;
/* Re-enqueue raised kq_count, lower it again */
kq->kq_count--;
influx = 1;
continue;
}
if (rv == 0) {
/*
* non-ONESHOT event that hasn't triggered
* again, so it will remain de-queued.
*/
kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
kq->kq_count--;
influx = 1;
continue;
}
} else {
/*
* Must NOT drop kq_lock until we can do
* the KNOTE_WILLDETACH() below.
*/
}
KASSERT(kn->kn_fop != NULL);
touch = (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
kn->kn_fop->f_touch != NULL);
/* XXXAD should be got from f_event if !oneshot. */
KASSERT((kn->kn_status & KN_WILLDETACH) == 0);
if (touch) {
(void)filter_touch(kn, kevp, EVENT_PROCESS);
} else {
*kevp = kn->kn_kevent;
}
kevp++;
nkev++;
influx = 1;
if (kn->kn_flags & EV_ONESHOT) {
/* delete ONESHOT events after retrieval */
KNOTE_WILLDETACH(kn);
kn->kn_status &= ~KN_BUSY;
kq->kq_count--;
KASSERT(kn_in_flux(kn) == false);
KASSERT((kn->kn_status & KN_WILLDETACH) != 0);
KASSERT(kn->kn_kevent.udata == curlwp);
mutex_spin_exit(&kq->kq_lock);
knote_detach(kn, fdp, true);
mutex_enter(&fdp->fd_lock);
mutex_spin_enter(&kq->kq_lock);
} else if (kn->kn_flags & EV_CLEAR) {
/* clear state after retrieval */
kn->kn_data = 0;
kn->kn_fflags = 0;
/*
* Manually clear knotes who weren't
* 'touch'ed.
*/
if (touch == 0) {
kn->kn_data = 0;
kn->kn_fflags = 0;
}
kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
kq->kq_count--;
} else if (kn->kn_flags & EV_DISPATCH) {
kn->kn_status |= KN_DISABLED;
kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
kq->kq_count--;
} else {
/* add event back on list */
kq_check(kq);
kn->kn_status |= KN_QUEUED;
kn->kn_status &= ~KN_BUSY;
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
kq_check(kq);
}
if (nkev == kevcnt) {
/* do copyouts in kevcnt chunks */
influx = 0;
KQ_FLUX_WAKEUP(kq);
mutex_spin_exit(&kq->kq_lock);
mutex_exit(&fdp->fd_lock);
error = (*keops->keo_put_events)
(keops->keo_private,
kevbuf, ulistp, nevents, nkev);
mutex_enter(&fdp->fd_lock);
mutex_spin_enter(&kq->kq_lock);
nevents += nkev;
nkev = 0;
kevp = kevbuf;
}
count--;
if (error != 0 || count == 0) {
/* remove marker */
TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
break;
}
}
KQ_FLUX_WAKEUP(kq);
mutex_spin_exit(&kq->kq_lock);
mutex_exit(&fdp->fd_lock);
done:
if (nkev != 0) {
/* copyout remaining events */
error = (*keops->keo_put_events)(keops->keo_private,
kevbuf, ulistp, nevents, nkev);
}
*retval = maxevents - count;
return error;
}
/*
* fileops ioctl method for a kqueue descriptor.
*
* Two ioctls are currently supported. They both use struct kfilter_mapping:
* KFILTER_BYFILTER find name for filter, and return result in
* name, which is of size len.
* KFILTER_BYNAME find filter for name. len is ignored.
*/
/*ARGSUSED*/
static int
kqueue_ioctl(file_t *fp, u_long com, void *data)
{
struct kfilter_mapping *km;
const struct kfilter *kfilter;
char *name;
int error;
km = data;
error = 0;
name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
switch (com) {
case KFILTER_BYFILTER: /* convert filter -> name */
rw_enter(&kqueue_filter_lock, RW_READER);
kfilter = kfilter_byfilter(km->filter);
if (kfilter != NULL) {
strlcpy(name, kfilter->name, KFILTER_MAXNAME);
rw_exit(&kqueue_filter_lock);
error = copyoutstr(name, km->name, km->len, NULL);
} else {
rw_exit(&kqueue_filter_lock);
error = ENOENT;
}
break;
case KFILTER_BYNAME: /* convert name -> filter */
error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
if (error) {
break;
}
rw_enter(&kqueue_filter_lock, RW_READER);
kfilter = kfilter_byname(name);
if (kfilter != NULL)
km->filter = kfilter->filter;
else
error = ENOENT;
rw_exit(&kqueue_filter_lock);
break;
default:
error = ENOTTY;
break;
}
kmem_free(name, KFILTER_MAXNAME);
return (error);
}
/*
* fileops fcntl method for a kqueue descriptor.
*/
static int
kqueue_fcntl(file_t *fp, u_int com, void *data)
{
return (ENOTTY);
}
/*
* fileops poll method for a kqueue descriptor.
* Determine if kqueue has events pending.
*/
static int
kqueue_poll(file_t *fp, int events)
{
struct kqueue *kq;
int revents;
kq = fp->f_kqueue;
revents = 0;
if (events & (POLLIN | POLLRDNORM)) {
mutex_spin_enter(&kq->kq_lock);
if (KQ_COUNT(kq) != 0) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
selrecord(curlwp, &kq->kq_sel);
}
kq_check(kq);
mutex_spin_exit(&kq->kq_lock);
}
return revents;
}
/*
* fileops stat method for a kqueue descriptor.
* Returns dummy info, with st_size being number of events pending.
*/
static int
kqueue_stat(file_t *fp, struct stat *st)
{
struct kqueue *kq;
kq = fp->f_kqueue;
memset(st, 0, sizeof(*st));
st->st_size = KQ_COUNT(kq);
st->st_blksize = sizeof(struct kevent);
st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
st->st_blocks = 1;
st->st_uid = kauth_cred_geteuid(fp->f_cred);
st->st_gid = kauth_cred_getegid(fp->f_cred);
return 0;
}
static void
kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
{
struct knote *kn;
filedesc_t *fdp;
fdp = kq->kq_fdp;
KASSERT(mutex_owned(&fdp->fd_lock));
again:
for (kn = SLIST_FIRST(list); kn != NULL;) {
if (kq != kn->kn_kq) {
kn = SLIST_NEXT(kn, kn_link);
continue;
}
if (knote_detach_quiesce(kn)) {
mutex_enter(&fdp->fd_lock);
goto again;
}
knote_detach(kn, fdp, true);
mutex_enter(&fdp->fd_lock);
kn = SLIST_FIRST(list);
}
}
/*
* fileops close method for a kqueue descriptor.
*/
static int
kqueue_close(file_t *fp)
{
struct kqueue *kq;
filedesc_t *fdp;
fdfile_t *ff;
int i;
kq = fp->f_kqueue;
fp->f_kqueue = NULL;
fp->f_type = 0;
fdp = curlwp->l_fd;
KASSERT(kq->kq_fdp == fdp);
mutex_enter(&fdp->fd_lock);
/*
* We're going to drop the fd_lock multiple times while
* we detach knotes. During this time, attempts to register
* knotes via the back door (e.g. knote_proc_fork_track())
* need to fail, lest they sneak in to attach a knote after
* we've already drained the list it's destined for.
*
* We must acquire kq_lock here to set KQ_CLOSING (to serialize
* with other code paths that modify kq_count without holding
* the fd_lock), but once this bit is set, it's only safe to
* test it while holding the fd_lock, and holding kq_lock while
* doing so is not necessary.
*/
mutex_enter(&kq->kq_lock);
kq->kq_count |= KQ_CLOSING;
mutex_exit(&kq->kq_lock);
for (i = 0; i <= fdp->fd_lastkqfile; i++) {
if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
continue;
kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
}
if (fdp->fd_knhashmask != 0) {
for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
}
}
mutex_exit(&fdp->fd_lock);
#if defined(DEBUG)
mutex_enter(&kq->kq_lock);
kq_check(kq);
mutex_exit(&kq->kq_lock);
#endif /* DEBUG */
KASSERT(TAILQ_EMPTY(&kq->kq_head));
KASSERT(KQ_COUNT(kq) == 0);
mutex_destroy(&kq->kq_lock);
cv_destroy(&kq->kq_cv);
seldestroy(&kq->kq_sel);
kmem_free(kq, sizeof(*kq));
return (0);
}
/*
* struct fileops kqfilter method for a kqueue descriptor.
* Event triggered when monitored kqueue changes.
*/
static int
kqueue_kqfilter(file_t *fp, struct knote *kn)
{
struct kqueue *kq;
kq = ((file_t *)kn->kn_obj)->f_kqueue;
KASSERT(fp == kn->kn_obj);
if (kn->kn_filter != EVFILT_READ)
return EINVAL;
kn->kn_fop = &kqread_filtops;
mutex_enter(&kq->kq_lock);
selrecord_knote(&kq->kq_sel, kn);
mutex_exit(&kq->kq_lock);
return 0;
}
/*
* Walk down a list of knotes, activating them if their event has
* triggered. The caller's object lock (e.g. device driver lock)
* must be held.
*/
void
knote(struct klist *list, long hint)
{
struct knote *kn, *tmpkn;
SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
/*
* We assume here that the backing object's lock is
* already held if we're traversing the klist, and
* so acquiring the knote foplock would create a
* deadlock scenario. But we also know that the klist
* won't disappear on us while we're here, so not
* acquiring it is safe.
*/
if (filter_event(kn, hint, true)) {
knote_activate(kn);
}
}
}
/*
* Remove all knotes referencing a specified fd
*/
void
knote_fdclose(int fd)
{
struct klist *list;
struct knote *kn;
filedesc_t *fdp;
again:
fdp = curlwp->l_fd;
mutex_enter(&fdp->fd_lock);
list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
while ((kn = SLIST_FIRST(list)) != NULL) {
if (knote_detach_quiesce(kn)) {
goto again;
}
knote_detach(kn, fdp, true);
mutex_enter(&fdp->fd_lock);
}
mutex_exit(&fdp->fd_lock);
}
/*
* Drop knote. Called with fdp->fd_lock held, and will drop it before
* returning.
*/
static void
knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
{
struct klist *list;
struct kqueue *kq;
kq = kn->kn_kq;
KASSERT((kn->kn_status & KN_MARKER) == 0);
KASSERT((kn->kn_status & KN_WILLDETACH) != 0);
KASSERT(kn->kn_fop != NULL);
KASSERT(mutex_owned(&fdp->fd_lock));
/* Remove from monitored object. */
if (dofop) {
knote_foplock_enter(kn);
filter_detach(kn);
knote_foplock_exit(kn);
}
/* Remove from descriptor table. */
if (kn->kn_fop->f_flags & FILTEROP_ISFD)
list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
else
list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
SLIST_REMOVE(list, kn, knote, kn_link);
/* Remove from kqueue. */
again:
mutex_spin_enter(&kq->kq_lock);
KASSERT(kn_in_flux(kn) == false);
if ((kn->kn_status & KN_QUEUED) != 0) {
kq_check(kq);
KASSERT(KQ_COUNT(kq) != 0);
kq->kq_count--;
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~KN_QUEUED;
kq_check(kq);
} else if (kn->kn_status & KN_BUSY) {
mutex_spin_exit(&kq->kq_lock);
goto again;
}
mutex_spin_exit(&kq->kq_lock);
mutex_exit(&fdp->fd_lock);
if (kn->kn_fop->f_flags & FILTEROP_ISFD)
fd_putfile(kn->kn_id);
atomic_dec_uint(&kn->kn_kfilter->refcnt);
knote_free(kn);
}
/*
* Queue new event for knote.
*/
static void
knote_enqueue(struct knote *kn)
{
struct kqueue *kq;
KASSERT((kn->kn_status & KN_MARKER) == 0);
kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
if (__predict_false(kn->kn_status & KN_WILLDETACH)) {
/* Don't bother enqueueing a dying knote. */
goto out;
}
if ((kn->kn_status & KN_DISABLED) != 0) {
kn->kn_status &= ~KN_DISABLED;
}
if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
kq_check(kq);
kn->kn_status |= KN_QUEUED;
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT);
kq->kq_count++;
kq_check(kq);
cv_broadcast(&kq->kq_cv);
selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
}
out:
mutex_spin_exit(&kq->kq_lock);
}
/*
* Activate a knote, queueing it on its kqueue if it is not already
* queued or disabled.
*/
static void
knote_activate_locked(struct knote *kn)
{
struct kqueue *kq;
KASSERT((kn->kn_status & KN_MARKER) == 0);
kq = kn->kn_kq;
if (__predict_false(kn->kn_status & KN_WILLDETACH)) {
/* Don't bother enqueueing a dying knote. */
return;
}
kn->kn_status |= KN_ACTIVE;
if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
kq_check(kq);
kn->kn_status |= KN_QUEUED;
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT);
kq->kq_count++;
kq_check(kq);
cv_broadcast(&kq->kq_cv);
selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
}
}
static void
knote_activate(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
knote_activate_locked(kn);
mutex_spin_exit(&kq->kq_lock);
}
static void
knote_deactivate_locked(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
if (kn->kn_status & KN_QUEUED) {
kq_check(kq);
kn->kn_status &= ~KN_QUEUED;
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
KASSERT(KQ_COUNT(kq) > 0);
kq->kq_count--;
kq_check(kq);
}
kn->kn_status &= ~KN_ACTIVE;
}
/*
* Set EV_EOF on the specified knote. Also allows additional
* EV_* flags to be set (e.g. EV_ONESHOT).
*/
void
knote_set_eof(struct knote *kn, uint32_t flags)
{
struct kqueue *kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_flags |= EV_EOF | flags;
mutex_spin_exit(&kq->kq_lock);
}
/*
* Clear EV_EOF on the specified knote.
*/
void
knote_clear_eof(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_flags &= ~EV_EOF;
mutex_spin_exit(&kq->kq_lock);
}
/*
* Initialize a klist.
*/
void
klist_init(struct klist *list)
{
SLIST_INIT(list);
}
/*
* Finalize a klist.
*/
void
klist_fini(struct klist *list)
{
struct knote *kn;
/*
* Neuter all existing knotes on the klist because the list is
* being destroyed. The caller has guaranteed that no additional
* knotes will be added to the list, that the backing object's
* locks are not held (otherwise there is a locking order issue
* with acquiring the knote foplock), and that we can traverse
* the list safely in this state.
*/
SLIST_FOREACH(kn, list, kn_selnext) {
knote_foplock_enter(kn);
KASSERT(kn->kn_fop != NULL);
if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
kn->kn_fop = &nop_fd_filtops;
} else {
kn->kn_fop = &nop_filtops;
}
knote_foplock_exit(kn);
}
}
/*
* Insert a knote into a klist.
*/
void
klist_insert(struct klist *list, struct knote *kn)
{
SLIST_INSERT_HEAD(list, kn, kn_selnext);
}
/*
* Remove a knote from a klist. Returns true if the last
* knote was removed and the list is now empty.
*/
bool
klist_remove(struct klist *list, struct knote *kn)
{
SLIST_REMOVE(list, kn, knote, kn_selnext);
return SLIST_EMPTY(list);
}
/* $NetBSD: uvm_glue.c,v 1.182 2023/10/04 20:34:19 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_glue.c 8.6 (Berkeley) 1/5/94
* from: Id: uvm_glue.c,v 1.1.2.8 1998/02/07 01:16:54 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.182 2023/10/04 20:34:19 ad Exp $");
#include "opt_kgdb.h"
#include "opt_kstack.h"
#include "opt_uvmhist.h"
/*
* uvm_glue.c: glue functions
*/
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/buf.h>
#include <sys/syncobj.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/lwp.h>
#include <sys/asan.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pgflcache.h>
/*
* uvm_kernacc: test if kernel can access a memory region.
*
* => Currently used only by /dev/kmem driver (dev/mm.c).
*/
bool
uvm_kernacc(void *addr, size_t len, vm_prot_t prot)
{
vaddr_t saddr = trunc_page((vaddr_t)addr);
vaddr_t eaddr = round_page(saddr + len);
bool rv;
vm_map_lock_read(kernel_map);
rv = uvm_map_checkprot(kernel_map, saddr, eaddr, prot);
vm_map_unlock_read(kernel_map);
return rv;
}
#ifdef KGDB
/*
* Change protections on kernel pages from addr to addr+len
* (presumably so debugger can plant a breakpoint).
*
* We force the protection change at the pmap level. If we were
* to use vm_map_protect(), a change to allow writing would be lazily
* applied, meaning we would still take a protection fault, something
* we really don't want to do. It would also fragment the kernel
* map unnecessarily. We cannot use pmap_protect since it also won't
* enforce a write-enable request. Using pmap_enter is the only way
* we can ensure the change takes place properly.
*/
void
uvm_chgkprot(void *addr, size_t len, int rw)
{
vm_prot_t prot;
paddr_t pa;
vaddr_t sva, eva;
prot = rw == B_READ ? VM_PROT_READ : VM_PROT_READ|VM_PROT_WRITE;
eva = round_page((vaddr_t)addr + len);
for (sva = trunc_page((vaddr_t)addr); sva < eva; sva += PAGE_SIZE) {
/*
* Extract physical address for the page.
*/
if (pmap_extract(pmap_kernel(), sva, &pa) == false)
panic("%s: invalid page", __func__);
pmap_enter(pmap_kernel(), sva, pa, prot, PMAP_WIRED);
}
pmap_update(pmap_kernel());
}
#endif
/*
* uvm_vslock: wire user memory for I/O
*
* - called from physio and sys___sysctl
* - XXXCDC: consider nuking this (or making it a macro?)
*/
int
uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access_type)
{
struct vm_map *map;
vaddr_t start, end;
int error;
map = &vs->vm_map;
start = trunc_page((vaddr_t)addr);
end = round_page((vaddr_t)addr + len);
error = uvm_fault_wire(map, start, end, access_type, 0);
return error;
}
/*
* uvm_vsunlock: unwire user memory wired by uvm_vslock()
*
* - called from physio and sys___sysctl
* - XXXCDC: consider nuking this (or making it a macro?)
*/
void
uvm_vsunlock(struct vmspace *vs, void *addr, size_t len)
{
uvm_fault_unwire(&vs->vm_map, trunc_page((vaddr_t)addr),
round_page((vaddr_t)addr + len));
}
/*
* uvm_proc_fork: fork a virtual address space
*
* - the address space is copied as per parent map's inherit values
*/
void
uvm_proc_fork(struct proc *p1, struct proc *p2, bool shared)
{
if (shared == true) {
p2->p_vmspace = NULL;
uvmspace_share(p1, p2);
} else {
p2->p_vmspace = uvmspace_fork(p1->p_vmspace);
}
cpu_proc_fork(p1, p2);
}
/*
* uvm_lwp_fork: fork a thread
*
* - a new PCB structure is allocated for the child process,
* and filled in by MD layer
* - if specified, the child gets a new user stack described by
* stack and stacksize
* - NOTE: the kernel stack may be at a different location in the child
* process, and thus addresses of automatic variables may be invalid
* after cpu_lwp_fork returns in the child process. We do nothing here
* after cpu_lwp_fork returns.
*/
void
uvm_lwp_fork(struct lwp *l1, struct lwp *l2, void *stack, size_t stacksize,
void (*func)(void *), void *arg)
{
/* Fill stack with magic number. */
kstack_setup_magic(l2);
/*
* cpu_lwp_fork() copies and updates the pcb, and makes the child ready
* to run. If this is a normal user fork, the child will exit
* directly to user mode via child_return() on its first time
* slice and will not return here. If this is a kernel thread,
* the specified entry point will be executed.
*/
cpu_lwp_fork(l1, l2, stack, stacksize, func, arg);
}
#ifndef USPACE_ALIGN
#define USPACE_ALIGN 0
#endif
static pool_cache_t uvm_uarea_cache;
#if defined(__HAVE_CPU_UAREA_ROUTINES)
static pool_cache_t uvm_uarea_system_cache;
#else
#define uvm_uarea_system_cache uvm_uarea_cache
#endif
static void *
uarea_poolpage_alloc(struct pool *pp, int flags)
{
KASSERT((flags & PR_WAITOK) != 0);
#if defined(PMAP_MAP_POOLPAGE)
while (USPACE == PAGE_SIZE &&
(USPACE_ALIGN == 0 || USPACE_ALIGN == PAGE_SIZE)) {
struct vm_page *pg;
vaddr_t va;
#if defined(PMAP_ALLOC_POOLPAGE)
pg = PMAP_ALLOC_POOLPAGE(0);
#else
pg = uvm_pagealloc(NULL, 0, NULL, 0);
#endif
if (pg == NULL) {
uvm_wait("uarea");
continue;
}
va = PMAP_MAP_POOLPAGE(VM_PAGE_TO_PHYS(pg));
KASSERT(va != 0);
return (void *)va;
}
#endif
#if defined(__HAVE_CPU_UAREA_ROUTINES)
void *va = cpu_uarea_alloc(false);
if (va)
return (void *)va;
#endif
return (void *)uvm_km_alloc(kernel_map, pp->pr_alloc->pa_pagesz,
USPACE_ALIGN, UVM_KMF_WIRED | UVM_KMF_WAITVA);
}
static void
uarea_poolpage_free(struct pool *pp, void *addr)
{
#if defined(PMAP_MAP_POOLPAGE)
if (USPACE == PAGE_SIZE &&
(USPACE_ALIGN == 0 || USPACE_ALIGN == PAGE_SIZE)) {
paddr_t pa;
pa = PMAP_UNMAP_POOLPAGE((vaddr_t) addr);
KASSERT(pa != 0);
uvm_pagefree(PHYS_TO_VM_PAGE(pa));
return;
}
#endif
#if defined(__HAVE_CPU_UAREA_ROUTINES)
if (cpu_uarea_free(addr))
return;
#endif
uvm_km_free(kernel_map, (vaddr_t)addr, pp->pr_alloc->pa_pagesz,
UVM_KMF_WIRED);
}
static struct pool_allocator uvm_uarea_allocator = {
.pa_alloc = uarea_poolpage_alloc,
.pa_free = uarea_poolpage_free,
.pa_pagesz = USPACE,
};
#if defined(__HAVE_CPU_UAREA_ROUTINES)
static void *
uarea_system_poolpage_alloc(struct pool *pp, int flags)
{
void * const va = cpu_uarea_alloc(true);
if (va != NULL)
return va;
return (void *)uvm_km_alloc(kernel_map, pp->pr_alloc->pa_pagesz,
USPACE_ALIGN, UVM_KMF_WIRED |
((flags & PR_WAITOK) ? UVM_KMF_WAITVA :
(UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)));
}
static void
uarea_system_poolpage_free(struct pool *pp, void *addr)
{
if (cpu_uarea_free(addr))
return;
uvm_km_free(kernel_map, (vaddr_t)addr, pp->pr_alloc->pa_pagesz,
UVM_KMF_WIRED);
}
static struct pool_allocator uvm_uarea_system_allocator = {
.pa_alloc = uarea_system_poolpage_alloc,
.pa_free = uarea_system_poolpage_free,
.pa_pagesz = USPACE,
};
#endif /* __HAVE_CPU_UAREA_ROUTINES */
void
uvm_uarea_init(void)
{
int flags = PR_NOTOUCH;
/*
* specify PR_NOALIGN unless the alignment provided by
* the backend (USPACE_ALIGN) is sufficient to provide
* pool page size (USPACE) alignment.
*/
if ((USPACE_ALIGN == 0 && USPACE != PAGE_SIZE) ||
(USPACE_ALIGN % USPACE) != 0) {
flags |= PR_NOALIGN;
}
uvm_uarea_cache = pool_cache_init(USPACE, USPACE_ALIGN, 0, flags,
"uarea", &uvm_uarea_allocator, IPL_NONE, NULL, NULL, NULL);
#if defined(__HAVE_CPU_UAREA_ROUTINES)
uvm_uarea_system_cache = pool_cache_init(USPACE, USPACE_ALIGN,
0, flags, "uareasys", &uvm_uarea_system_allocator,
IPL_NONE, NULL, NULL, NULL);
#endif
}
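/*
 * Worked example of the PR_NOALIGN test above (values are illustrative):
 * with USPACE == 2 * PAGE_SIZE and USPACE_ALIGN == 0 the first clause is
 * true, so PR_NOALIGN is set and the pool does not rely on USPACE-aligned
 * items; with USPACE == PAGE_SIZE and USPACE_ALIGN == 0, page alignment
 * already implies USPACE alignment and the flag stays clear.
 */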
/*
* uvm_uarea_alloc: allocate a u-area
*/
vaddr_t
uvm_uarea_alloc(void)
{
return (vaddr_t)pool_cache_get(uvm_uarea_cache, PR_WAITOK);
}
vaddr_t
uvm_uarea_system_alloc(struct cpu_info *ci)
{
#ifdef __HAVE_CPU_UAREA_ALLOC_IDLELWP
if (__predict_false(ci != NULL))
return cpu_uarea_alloc_idlelwp(ci);
#endif
return (vaddr_t)pool_cache_get(uvm_uarea_system_cache, PR_WAITOK);
}
/*
* uvm_uarea_free: free a u-area
*/
void
uvm_uarea_free(vaddr_t uaddr)
{
kasan_mark((void *)uaddr, USPACE, USPACE, 0);
pool_cache_put(uvm_uarea_cache, (void *)uaddr);
}
void
uvm_uarea_system_free(vaddr_t uaddr)
{
kasan_mark((void *)uaddr, USPACE, USPACE, 0);
pool_cache_put(uvm_uarea_system_cache, (void *)uaddr);
}
vaddr_t
uvm_lwp_getuarea(lwp_t *l)
{
return (vaddr_t)l->l_addr - UAREA_PCB_OFFSET;
}
void
uvm_lwp_setuarea(lwp_t *l, vaddr_t addr)
{
l->l_addr = (void *)(addr + UAREA_PCB_OFFSET);
}
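/*
 * Note (illustrative): uvm_lwp_getuarea() and uvm_lwp_setuarea() are
 * inverses; for any u-area address "ua":
 *
 *	uvm_lwp_setuarea(l, ua);
 *	KASSERT(uvm_lwp_getuarea(l) == ua);
 *
 * l->l_addr always points UAREA_PCB_OFFSET bytes into the u-area, at
 * the MD PCB.
 */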
/*
* uvm_proc_exit: exit a virtual address space
*
* - borrow proc0's address space because freeing the vmspace
* of the dead process may block.
*/
void
uvm_proc_exit(struct proc *p)
{
struct lwp *l = curlwp; /* XXX */
struct vmspace *ovm;
KASSERT(p == l->l_proc);
ovm = p->p_vmspace;
KASSERT(ovm != NULL);
if (__predict_false(ovm == proc0.p_vmspace))
return;
/*
* borrow proc0's address space.
*/
kpreempt_disable();
pmap_deactivate(l);
p->p_vmspace = proc0.p_vmspace;
pmap_activate(l);
kpreempt_enable();
uvmspace_free(ovm);
}
void
uvm_lwp_exit(struct lwp *l)
{
vaddr_t va = uvm_lwp_getuarea(l);
bool system = (l->l_flag & LW_SYSTEM) != 0;
if (system)
uvm_uarea_system_free(va);
else
uvm_uarea_free(va);
#ifdef DIAGNOSTIC
uvm_lwp_setuarea(l, (vaddr_t)NULL);
#endif
}
/*
* uvm_init_limits: init per-process VM limits
*
* - called for process 0 and then inherited by all others.
*/
void
uvm_init_limits(struct proc *p)
{
/*
* Set up the initial limits on process VM. Set the maximum
* resident set size to be all of (reasonably) available memory.
* This causes any single, large process to start random page
* replacement once it fills memory.
*/
p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
p->p_rlimit[RLIMIT_STACK].rlim_max = maxsmap;
p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ;
p->p_rlimit[RLIMIT_DATA].rlim_max = maxdmap;
p->p_rlimit[RLIMIT_AS].rlim_cur = RLIM_INFINITY;
p->p_rlimit[RLIMIT_AS].rlim_max = RLIM_INFINITY;
p->p_rlimit[RLIMIT_RSS].rlim_cur = MIN(VM_MAXUSER_ADDRESS,
ctob((rlim_t)uvm_availmem(false)));
}
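/*
 * Worked example (numbers are illustrative): on a machine with 1 GiB of
 * available memory and 4 KiB pages, uvm_availmem(false) is about 262144
 * pages, ctob() of that is 1 GiB, and the initial RLIMIT_RSS soft limit
 * becomes MIN(VM_MAXUSER_ADDRESS, 1 GiB).
 */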
/*
* uvm_scheduler: process zero main loop.
*/
extern struct loadavg averunnable;
void
uvm_scheduler(void)
{
lwp_t *l = curlwp;
lwp_lock(l);
l->l_class = SCHED_FIFO;
lwp_changepri(l, PRI_VM);
lwp_unlock(l);
/* Start the freelist cache. */
uvm_pgflcache_start();
for (;;) {
/* Update legacy stats for post-mortem debugging. */
uvm_update_uvmexp();
/* See if the pagedaemon needs to generate some free pages. */
uvm_kick_pdaemon();
/* Calculate process statistics. */
sched_pstats();
(void)kpause("uvm", false, hz, NULL);
}
}
/*
* uvm_idle: called from the idle loop.
*/
void
uvm_idle(void)
{
struct cpu_info *ci = curcpu();
struct uvm_cpu *ucpu = ci->ci_data.cpu_uvm;
KASSERT(kpreempt_disabled());
uvmpdpol_idle(ucpu);
}
/* $NetBSD: ipsec.h,v 1.93 2022/10/28 05:23:09 ozaki-r Exp $ */
/* $FreeBSD: ipsec.h,v 1.2.4.2 2004/02/14 22:23:23 bms Exp $ */
/* $KAME: ipsec.h,v 1.53 2001/11/20 08:32:38 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _NETIPSEC_IPSEC_H_
#define _NETIPSEC_IPSEC_H_
#if defined(_KERNEL_OPT)
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <net/pfkeyv2.h>
#ifdef _KERNEL
#include <sys/socketvar.h>
#include <sys/localcount.h>
#include <netinet/in_pcb.h>
#include <netipsec/keydb.h>
/*
* Security Policy Index
* Ensure that both address families in the "src" and "dst" are same.
* When the value of the ul_proto is ICMPv6, the port field in "src"
* specifies ICMPv6 type, and the port field in "dst" specifies ICMPv6 code.
*/
struct secpolicyindex {
u_int8_t dir; /* direction of packet flow, see below */
union sockaddr_union src; /* IP src address for SP */
union sockaddr_union dst; /* IP dst address for SP */
u_int8_t prefs; /* prefix length in bits for src */
u_int8_t prefd; /* prefix length in bits for dst */
u_int16_t ul_proto; /* upper layer Protocol */
};
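/*
 * Illustrative sketch: when ul_proto is ICMPv6 the "port" fields carry
 * the ICMPv6 type and code rather than real ports, so a selector for
 * echo requests (type 128, code 0) would be filled in roughly as:
 */
#if 0
spidx.ul_proto = IPPROTO_ICMPV6;
spidx.src.sin6.sin6_port = htons(128); /* ICMPv6 type */
spidx.dst.sin6.sin6_port = htons(0); /* ICMPv6 code */
#endif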
/* Security Policy Data Base */
struct secpolicy {
struct pslist_entry pslist_entry;
struct localcount localcount; /* reference count */
struct secpolicyindex spidx; /* selector */
u_int32_t id; /* unique number of this SP on the system */
u_int state; /* 0: dead, others: alive */
#define IPSEC_SPSTATE_DEAD 0
#define IPSEC_SPSTATE_ALIVE 1
u_int origin; /* who generated this SP */
#define IPSEC_SPORIGIN_USER 0
#define IPSEC_SPORIGIN_KERNEL 1
u_int policy; /* DISCARD, NONE or IPSEC, see keyv2.h */
struct ipsecrequest *req;
/* pointer to the ipsec request tree; */
/* NULL unless policy == IPSEC. */
/*
* lifetime handler.
* the policy can be used without limitation if both lifetime and
* validtime are zero.
* "lifetime" is passed by sadb_lifetime.sadb_lifetime_addtime.
* "validtime" is passed by sadb_lifetime.sadb_lifetime_usetime.
*/
time_t created; /* time created the policy */
time_t lastused; /* updated whenever the kernel sends a packet */
time_t lifetime; /* duration of the lifetime of this policy */
time_t validtime; /* duration this policy is valid without use */
};
/* Request for IPsec */
struct ipsecrequest {
struct ipsecrequest *next;
/* pointer to next structure */
/* If NULL, it means the end of chain. */
struct secasindex saidx;/* hint for search proper SA */
/* if __ss_len == 0 then no address specified.*/
u_int level; /* IPsec level defined below. */
struct secpolicy *sp; /* back pointer to SP */
};
/* security policy in PCB */
struct inpcbpolicy {
struct secpolicy *sp_in;
struct secpolicy *sp_out;
int priv; /* privileged socket ? */
/* cached policy */
struct {
struct secpolicy *cachesp;
struct secpolicyindex cacheidx;
int cachehint; /* processing requirement hint: */
#define IPSEC_PCBHINT_UNKNOWN 0 /* Unknown */
#define IPSEC_PCBHINT_YES 1 /* IPsec processing is required */
#define IPSEC_PCBHINT_NO 2 /* IPsec processing not required */
u_int cachegen; /* spdgen when cache filled */
} sp_cache[3]; /* XXX 3 == IPSEC_DIR_MAX */
int sp_cacheflags;
#define IPSEC_PCBSP_CONNECTED 1
struct inpcb *sp_inp; /* back pointer */
};
extern u_int ipsec_spdgen;
static __inline bool
ipsec_pcb_skip_ipsec(struct inpcbpolicy *pcbsp, int dir)
{
KASSERT(inp_locked(pcbsp->sp_inp));
return pcbsp->sp_cache[(dir)].cachehint == IPSEC_PCBHINT_NO &&
pcbsp->sp_cache[(dir)].cachegen == ipsec_spdgen;
}
/* SP acquiring list table. */
struct secspacq {
LIST_ENTRY(secspacq) chain;
struct secpolicyindex spidx;
time_t created; /* for lifetime */
int count; /* for lifetime */
/* XXX: here is mbuf place holder to be sent ? */
};
#endif /* _KERNEL */
/* buffer size for formatted output of ipsec address (addr + '%' + scope_id?) */
#define IPSEC_ADDRSTRLEN (INET6_ADDRSTRLEN + 11)
/* buffer size for ipsec_logsastr() */
#define IPSEC_LOGSASTRLEN 192
/* according to IANA assignment, port 0x0000 and proto 0xff are reserved. */
#define IPSEC_PORT_ANY 0
#define IPSEC_ULPROTO_ANY 255
#define IPSEC_PROTO_ANY 255
/* mode of security protocol */
/* NOTE: DON'T use IPSEC_MODE_ANY in the SPD. It is only used in the SAD. */
#define IPSEC_MODE_ANY 0 /* i.e. wildcard. */
#define IPSEC_MODE_TRANSPORT 1
#define IPSEC_MODE_TUNNEL 2
#define IPSEC_MODE_TCPMD5 3 /* TCP MD5 mode */
/*
* Direction of security policy.
* NOTE: INVALID is used only as a flag;
* the others are also used as loop counters.
*/
#define IPSEC_DIR_ANY 0
#define IPSEC_DIR_INBOUND 1
#define IPSEC_DIR_OUTBOUND 2
#define IPSEC_DIR_MAX 3
#define IPSEC_DIR_INVALID 4
#define IPSEC_DIR_IS_VALID(dir) ((dir) >= 0 && (dir) <= IPSEC_DIR_MAX)
#define IPSEC_DIR_IS_INOROUT(dir) ((dir) == IPSEC_DIR_INBOUND || \
(dir) == IPSEC_DIR_OUTBOUND)
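/*
 * Illustrative sketch: IPSEC_DIR_INBOUND and IPSEC_DIR_OUTBOUND double
 * as array indices, which is why per-direction state is usually walked
 * like this:
 */
#if 0
int dir;

for (dir = IPSEC_DIR_INBOUND; dir <= IPSEC_DIR_OUTBOUND; dir++) {
/* operate on the per-direction entry for "dir" */
}
#endif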
/* Policy level */
/*
* IPSEC, ENTRUST and BYPASS are allowed for setsockopt() in PCB,
* DISCARD, IPSEC and NONE are allowed for setkey() in SPD.
* DISCARD and NONE are allowed for system default.
*/
#define IPSEC_POLICY_DISCARD 0 /* discarding packet */
#define IPSEC_POLICY_NONE 1 /* through IPsec engine */
#define IPSEC_POLICY_IPSEC 2 /* do IPsec */
#define IPSEC_POLICY_ENTRUST 3 /* consulting SPD if present. */
#define IPSEC_POLICY_BYPASS 4 /* only for privileged socket. */
/* Security protocol level */
#define IPSEC_LEVEL_DEFAULT 0 /* reference to system default */
#define IPSEC_LEVEL_USE 1 /* use SA if present. */
#define IPSEC_LEVEL_REQUIRE 2 /* require SA. */
#define IPSEC_LEVEL_UNIQUE 3 /* unique SA. */
#define IPSEC_MANUAL_REQID_MAX 0x3fff
/*
* if the security policy level is "unique", this id
* identifies the SA to use; otherwise it is zero.
* 1 - 0x3fff are reserved for manual keying and
* 0 is reserved for the reason above; the rest are
* for kernel use.
* Note that this id does not identify an SA
* by itself.
*/
#define IPSEC_REPLAYWSIZE 32
#ifdef _KERNEL
extern int ipsec_debug;
#ifdef IPSEC_DEBUG
extern int ipsec_replay;
extern int ipsec_integrity;
#endif
extern struct secpolicy ip4_def_policy;
extern int ip4_esp_trans_deflev;
extern int ip4_esp_net_deflev;
extern int ip4_ah_trans_deflev;
extern int ip4_ah_net_deflev;
extern int ip4_ah_cleartos;
extern int ip4_ah_offsetmask;
extern int ip4_ipsec_dfbit;
extern int ip4_ipsec_ecn;
extern int crypto_support;
#include <sys/syslog.h>
#define DPRINTF(fmt, args...) \
do { \
if (ipsec_debug) \
log(LOG_DEBUG, "%s: " fmt, __func__, ##args); \
} while (/*CONSTCOND*/0)
#define IPSECLOG(level, fmt, args...) \
do { \
if (ipsec_debug) \
log(level, "%s: " fmt, __func__, ##args); \
} while (/*CONSTCOND*/0)
#define ipsec_indone(m) \
((m->m_flags & M_AUTHIPHDR) || (m->m_flags & M_DECRYPTED))
#define ipsec_outdone(m) \
(m_tag_find((m), PACKET_TAG_IPSEC_OUT_DONE) != NULL)
static __inline bool
ipsec_skip_pfil(struct mbuf *m)
{
bool rv;
if (ipsec_indone(m) &&
((m->m_pkthdr.pkthdr_flags & PKTHDR_FLAG_IPSEC_SKIP_PFIL) != 0)) {
m->m_pkthdr.pkthdr_flags &= ~PKTHDR_FLAG_IPSEC_SKIP_PFIL;
rv = true;
} else {
rv = false;
}
return rv;
}
void ipsec_pcbconn(struct inpcbpolicy *);
void ipsec_pcbdisconn(struct inpcbpolicy *);
void ipsec_invalpcbcacheall(void);
struct inpcb;
int ipsec4_output(struct mbuf *, struct inpcb *, int, u_long *, bool *, bool *, bool *);
int ipsec_ip_input_checkpolicy(struct mbuf *, bool);
void ipsec_mtu(struct mbuf *, int *);
#ifdef INET6
void ipsec6_udp_cksum(struct mbuf *);
#endif
struct inpcb;
int ipsec_init_pcbpolicy(struct socket *so, struct inpcbpolicy **);
int ipsec_copy_policy(const struct inpcbpolicy *, struct inpcbpolicy *);
u_int ipsec_get_reqlevel(const struct ipsecrequest *);
int ipsec_set_policy(struct inpcb *, const void *, size_t, kauth_cred_t);
int ipsec_get_policy(struct inpcb *, const void *, size_t, struct mbuf **);
int ipsec_delete_pcbpolicy(struct inpcb *);
int ipsec_in_reject(struct mbuf *, struct inpcb *);
struct secasvar *ipsec_lookup_sa(const struct ipsecrequest *,
const struct mbuf *);
struct secas;
struct tcpcb;
int ipsec_chkreplay(u_int32_t, const struct secasvar *);
int ipsec_updatereplay(u_int32_t, const struct secasvar *);
size_t ipsec_hdrsiz(struct mbuf *, u_int, struct inpcb *);
size_t ipsec4_hdrsiz_tcp(struct tcpcb *);
union sockaddr_union;
const char *ipsec_address(const union sockaddr_union* sa, char *, size_t);
const char *ipsec_logsastr(const struct secasvar *, char *, size_t);
/* NetBSD protosw ctlin entrypoint */
void *esp4_ctlinput(int, const struct sockaddr *, void *);
void *ah4_ctlinput(int, const struct sockaddr *, void *);
void ipsec_output_init(void);
struct m_tag;
void ipsec4_common_input(struct mbuf *m, int, int);
int ipsec4_common_input_cb(struct mbuf *, struct secasvar *, int, int);
int ipsec4_process_packet(struct mbuf *, const struct ipsecrequest *, u_long *);
int ipsec_process_done(struct mbuf *, const struct ipsecrequest *,
struct secasvar *, int);
struct mbuf *m_clone(struct mbuf *);
struct mbuf *m_makespace(struct mbuf *, int, int, int *);
void *m_pad(struct mbuf *, int);
int m_striphdr(struct mbuf *, int, int);
extern int ipsec_used __read_mostly;
extern int ipsec_enabled __read_mostly;
#endif /* _KERNEL */
#ifndef _KERNEL
char *ipsec_set_policy(const char *, int);
int ipsec_get_policylen(char *);
char *ipsec_dump_policy(char *, const char *);
const char *ipsec_strerror(void);
#endif /* !_KERNEL */
#ifdef _KERNEL
/* External declarations of per-file init functions */
void ah_attach(void);
void esp_attach(void);
void ipcomp_attach(void);
void ipe4_attach(void);
void tcpsignature_attach(void);
void ipsec_attach(void);
void sysctl_net_inet_ipsec_setup(struct sysctllog **);
#ifdef INET6
void sysctl_net_inet6_ipsec6_setup(struct sysctllog **);
#endif
#endif /* _KERNEL */
#endif /* !_NETIPSEC_IPSEC_H_ */
/* $NetBSD: strncmp.c,v 1.3 2018/02/04 20:22:17 mrg Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
#if 0
static char sccsid[] = "@(#)strncmp.c 8.1 (Berkeley) 6/4/93";
#else
__RCSID("$NetBSD: strncmp.c,v 1.3 2018/02/04 20:22:17 mrg Exp $");
#endif
#endif /* LIBC_SCCS and not lint */
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <assert.h>
#include <string.h>
#else
#include <lib/libkern/libkern.h>
#endif
int
strncmp(const char *s1, const char *s2, size_t n)
{
if (n == 0)
return (0);
do {
if (*s1 != *s2++)
return (*(const unsigned char *)s1 -
*(const unsigned char *)--s2);
if (*s1++ == 0)
break;
} while (--n != 0);
return (0);
}
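/*
 * Usage sketch (illustrative): only the sign of the result matters, and
 * the comparison stops after n characters or at the first NUL:
 *
 *	strncmp("foobar", "foo", 3) == 0	first 3 chars equal
 *	strncmp("foo", "fop", 8) < 0		'o' < 'p'; NUL ends the scan
 *	strncmp("abc", "xyz", 0) == 0		n == 0 compares nothing
 */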
/* $NetBSD: userret.h,v 1.35 2024/01/28 10:06:19 skrll Exp $ */
/*-
* Copyright (c) 1998, 2000, 2003, 2006, 2008, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum, and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_USERRET_H_
#define _SYS_USERRET_H_
#include <sys/lockdebug.h>
#include <sys/intr.h>
#include <sys/psref.h>
/*
* Define the MI code needed before returning to user mode, for trap and
* syscall.
*
* We handle "exceptional" events: pending signals, stop/exit actions, etc.
* Note that the event must be flagged BEFORE any AST is posted as we are
* reading unlocked.
*/
static __inline void
mi_userret(struct lwp *l)
{
int exception;
KPREEMPT_DISABLE(l);
KASSERTMSG(l->l_cpu->ci_biglock_count == 0, "kernel_lock leaked");
KASSERT(l->l_blcnt == 0);
exception = l->l_cpu->ci_want_resched | (l->l_flag & LW_USERRET);
KPREEMPT_ENABLE(l);
if (__predict_false(exception)) {
lwp_userret(l);
}
LOCKDEBUG_BARRIER(NULL, 0);
KASSERT(l->l_nopreempt == 0);
PSREF_DEBUG_BARRIER();
KASSERT(l->l_psrefs == 0);
}
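/*
 * Illustrative sketch (hypothetical MD code): every path back to user
 * mode ends by calling mi_userret() on curlwp, e.g. at the tail of an
 * MD syscall or trap handler:
 */
#if 0
/* ... handle the syscall/trap ... */
mi_userret(l);
/* return to user mode */
#endif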
#endif /* !_SYS_USERRET_H_ */
/* $NetBSD: kern_sysctl.c,v 1.270 2023/09/09 16:01:09 christos Exp $ */
/*-
* Copyright (c) 2003, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Brown.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Mike Karels at Berkeley Software Design, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sysctl.c 8.9 (Berkeley) 5/20/95
*/
/*
* sysctl system call.
*/
#define __COMPAT_SYSCTL
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_sysctl.c,v 1.270 2023/09/09 16:01:09 christos Exp $");
#ifdef _KERNEL_OPT
#include "opt_defcorename.h"
#endif
#include "ksyms.h"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/buf.h>
#include <sys/cprng.h>
#include <sys/kauth.h>
#include <sys/ksyms.h>
#include <sys/ktrace.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/once.h>
#include <sys/rndsource.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <crypto/blake2/blake2s.h>
#define MAXDESCLEN 1024
MALLOC_DEFINE(M_SYSCTLNODE, "sysctlnode", "sysctl node structures");
MALLOC_DEFINE(M_SYSCTLDATA, "sysctldata", "misc sysctl data");
static int sysctl_mmap(SYSCTLFN_PROTO);
static int sysctl_alloc(struct sysctlnode *, int);
static int sysctl_realloc(struct sysctlnode *);
static int sysctl_cvt_in(struct lwp *, int *, const void *, size_t,
struct sysctlnode *);
static int sysctl_cvt_out(struct lwp *, int, const struct sysctlnode *,
void *, size_t, size_t *);
static int sysctl_log_add(struct sysctllog **, const struct sysctlnode *);
static int sysctl_log_realloc(struct sysctllog *);
typedef void sysctl_setup_func(struct sysctllog **);
#ifdef SYSCTL_DEBUG
#define DPRINTF(a) printf a
#else
#define DPRINTF(a)
#endif
struct sysctllog {
const struct sysctlnode *log_root;
int *log_num;
int log_size, log_left;
};
/*
* the "root" of the new sysctl tree
*/
struct sysctlnode sysctl_root = {
.sysctl_flags = SYSCTL_VERSION|
CTLFLAG_ROOT|CTLFLAG_READWRITE|
CTLTYPE_NODE,
.sysctl_num = 0,
.sysctl_size = sizeof(struct sysctlnode),
.sysctl_name = "(root)",
};
/*
* link set of functions that add nodes at boot time (see also
* sysctl_buildtree())
*/
__link_set_decl(sysctl_funcs, sysctl_setup_func);
/*
* The `sysctl_treelock' is intended to serialize access to the sysctl
* tree. XXX This has serious problems; allocating memory and
* copying data out with the lock held is insane.
*/
krwlock_t sysctl_treelock;
kmutex_t sysctl_file_marker_lock;
/*
* Attributes stored in the kernel.
*/
char hostname[MAXHOSTNAMELEN];
int hostnamelen;
char domainname[MAXHOSTNAMELEN];
int domainnamelen;
long hostid;
#ifndef DEFCORENAME
#define DEFCORENAME "%n.core"
#endif
char defcorename[MAXPATHLEN] = DEFCORENAME;
/*
* ********************************************************************
* Section 0: Some simple glue
* ********************************************************************
* By wrapping copyin(), copyout(), and copyinstr() like this, we can
* stop caring about who's calling us and simplify some code a bunch.
* ********************************************************************
*/
int
sysctl_copyin(struct lwp *l, const void *uaddr, void *kaddr, size_t len)
{
int error;
if (l != NULL) {
error = copyin(uaddr, kaddr, len);
ktrmibio(-1, UIO_WRITE, uaddr, len, error);
} else {
error = kcopy(uaddr, kaddr, len);
}
return error;
}
int
sysctl_copyout(struct lwp *l, const void *kaddr, void *uaddr, size_t len)
{
int error;
if (l != NULL) {
error = copyout(kaddr, uaddr, len);
ktrmibio(-1, UIO_READ, uaddr, len, error);
} else {
error = kcopy(kaddr, uaddr, len);
}
return error;
}
int
sysctl_copyinstr(struct lwp *l, const void *uaddr, void *kaddr,
size_t len, size_t *done)
{
int error;
if (l != NULL) {
error = copyinstr(uaddr, kaddr, len, done);
ktrmibio(-1, UIO_WRITE, uaddr, len, error);
} else {
error = copystr(uaddr, kaddr, len, done);
}
return error;
}
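/*
 * Illustrative sketch: these wrappers let one helper serve both user
 * requests (l != NULL, real copyin/copyout) and purely in-kernel
 * requests (l == NULL, plain kcopy):
 */
#if 0
int v;

/* from a node handler; works for either kind of caller */
error = sysctl_copyin(l, newp, &v, sizeof(v));
#endif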
/*
* ********************************************************************
* Initialize sysctl subsystem.
* ********************************************************************
*/
void
sysctl_init(void)
{
sysctl_setup_func *const *sysctl_setup;
rw_init(&sysctl_treelock);
/*
* dynamic mib numbers start here
*/
sysctl_root.sysctl_num = CREATE_BASE;
sysctl_basenode_init();
__link_set_foreach(sysctl_setup, sysctl_funcs) {
(**sysctl_setup)(NULL);
}
mutex_init(&sysctl_file_marker_lock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* Setting this means no more permanent nodes can be added,
* trees that claim to be readonly at the root now are, and if
* the main tree is readonly, *everything* is.
*
* Also starts up the PRNG used for the "random" sysctl: it's
* better to start it later than sooner.
*
* Call this at the end of kernel init.
*/
void
sysctl_finalize(void)
{
sysctl_root.sysctl_flags |= CTLFLAG_PERMANENT;
}
/*
* ********************************************************************
* The main native sysctl system call itself.
* ********************************************************************
*/
int
sys___sysctl(struct lwp *l, const struct sys___sysctl_args *uap, register_t *retval)
{
/* {
syscallarg(const int *) name;
syscallarg(u_int) namelen;
syscallarg(void *) old;
syscallarg(size_t *) oldlenp;
syscallarg(const void *) new;
syscallarg(size_t) newlen;
} */
int error, nerror, name[CTL_MAXNAME];
size_t oldlen, savelen, *oldlenp;
/*
* get oldlen
*/
oldlen = 0;
oldlenp = SCARG(uap, oldlenp);
if (oldlenp != NULL) {
error = copyin(oldlenp, &oldlen, sizeof(oldlen));
if (error)
return (error);
}
savelen = oldlen;
/*
* top-level sysctl names may or may not be non-terminal, but
* we don't care
*/
if (SCARG(uap, namelen) > CTL_MAXNAME || SCARG(uap, namelen) < 1)
return (EINVAL);
error = copyin(SCARG(uap, name), &name,
SCARG(uap, namelen) * sizeof(int));
if (error)
return (error);
ktrmib(name, SCARG(uap, namelen));
sysctl_lock(SCARG(uap, newv) != NULL);
/*
* do sysctl work (NULL means main built-in default tree)
*/
error = sysctl_dispatch(&name[0], SCARG(uap, namelen),
SCARG(uap, oldv), &oldlen,
SCARG(uap, newv), SCARG(uap, newlen),
&name[0], l, NULL);
/*
* release the sysctl lock
*/
sysctl_unlock();
/*
* set caller's oldlen to new value even in the face of an
* error (if this gets an error and they didn't have one, they
* get this one)
*/
if (oldlenp) {
nerror = copyout(&oldlen, oldlenp, sizeof(oldlen));
if (error == 0)
error = nerror;
}
/*
* if the only problem is that we weren't given enough space,
* that's an ENOMEM error
*/
if (error == 0 && SCARG(uap, oldv) != NULL && savelen < oldlen)
error = ENOMEM;
return (error);
}
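/*
 * Usage sketch from the userland side (illustrative, not kernel code):
 * the oldlen in/out convention above supports the usual "probe the
 * size, then fetch" pattern, and a buffer that turns out too small
 * yields ENOMEM:
 */
#if 0
int mib[2] = { CTL_KERN, KERN_HOSTNAME };
size_t len = 0;
char *buf;

sysctl(mib, 2, NULL, &len, NULL, 0); /* len gets the required size */
buf = malloc(len);
if (sysctl(mib, 2, buf, &len, NULL, 0) == -1)
err(1, "sysctl"); /* ENOMEM if the value grew */
#endif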
/*
* ********************************************************************
* Section 1: How the tree is used
* ********************************************************************
* Implementations of sysctl for emulations typically need only
* these three functions in this order: lock the tree, dispatch
* request into it, unlock the tree.
* ********************************************************************
*/
void
sysctl_lock(bool write)
{
if (write) {
rw_enter(&sysctl_treelock, RW_WRITER);
curlwp->l_pflag |= LP_SYSCTLWRITE;
} else {
rw_enter(&sysctl_treelock, RW_READER);
curlwp->l_pflag &= ~LP_SYSCTLWRITE;
}
}
void
sysctl_relock(void)
{
if ((curlwp->l_pflag & LP_SYSCTLWRITE) != 0) {
rw_enter(&sysctl_treelock, RW_WRITER);
} else {
rw_enter(&sysctl_treelock, RW_READER);
}
}
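/*
 * Illustrative sketch of the pattern described above: an emulation's
 * sysctl entry point typically just brackets a dispatch with the tree
 * lock ("emul_root" is a hypothetical overlay tree root):
 */
#if 0
sysctl_lock(newp != NULL);
error = sysctl_dispatch(name, namelen, oldp, oldlenp,
newp, newlen, name, l, emul_root);
sysctl_unlock();
#endif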
/*
* ********************************************************************
* the main sysctl dispatch routine. scans the given tree and picks a
* function to call based on what it finds.
* ********************************************************************
*/
int
sysctl_dispatch(SYSCTLFN_ARGS)
{
int error;
sysctlfn fn;
int ni;
KASSERT(rw_lock_held(&sysctl_treelock));
if (rnode && SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_dispatch: rnode %p wrong version\n", rnode);
error = EINVAL;
goto out;
}
fn = NULL;
error = sysctl_locate(l, name, namelen, &rnode, &ni);
if (rnode->sysctl_func != NULL) {
/*
* the node we ended up at has a function, so call it. it can
* hand off to query or create if it wants to.
*/
fn = rnode->sysctl_func;
} else if (error == 0) {
/*
* we found the node they were looking for, so do a lookup.
*/
fn = (sysctlfn)sysctl_lookup; /* XXX may write to rnode */
} else if (error == ENOENT && (ni + 1) == namelen && name[ni] < 0) {
/*
* prospective parent node found, but the terminal node was
* not. generic operations associate with the parent.
*/
switch (name[ni]) {
case CTL_QUERY:
fn = sysctl_query;
break;
case CTL_CREATE:
#if NKSYMS > 0
case CTL_CREATESYM:
#endif /* NKSYMS > 0 */
if (newp == NULL) {
error = EINVAL;
break;
}
KASSERT(rw_write_held(&sysctl_treelock));
fn = (sysctlfn)sysctl_create; /* we own the rnode */
break;
case CTL_DESTROY:
if (newp == NULL) {
error = EINVAL;
break;
}
KASSERT(rw_write_held(&sysctl_treelock));
fn = (sysctlfn)sysctl_destroy; /* we own the rnode */
break;
case CTL_MMAP:
fn = (sysctlfn)sysctl_mmap; /* we own the rnode */
break;
case CTL_DESCRIBE:
fn = sysctl_describe;
break;
default:
error = EOPNOTSUPP;
break;
}
}
/*
* after all of that, maybe we found someone who knows how to
* get us what we want?
*/
if (fn != NULL)
error = (*fn)(name + ni, namelen - ni, oldp, oldlenp,
newp, newlen, name, l, rnode);
else if (error == 0)
error = EOPNOTSUPP;
out:
return (error);
}
/*
* ********************************************************************
* Releases the tree lock.
* ********************************************************************
*/
void
sysctl_unlock(void)
{
rw_exit(&sysctl_treelock);
}
/*
* ********************************************************************
* Section 2: The main tree interfaces
* ********************************************************************
* This is how sysctl_dispatch() does its work, and you can too, by
* calling these routines from helpers (though typically only
* sysctl_lookup() will be used). The tree MUST BE LOCKED when these
* are called.
* ********************************************************************
*/
/*
* sysctl_locate -- Finds the node matching the given mib under the
* given tree (via rnode). If no tree is given, we fall back to the
* native tree. The current process (via l) is used for access
* control on the tree (some nodes may be traversable only by root) and
* on return, nip will show how many numbers in the mib were consumed.
*/
int
sysctl_locate(struct lwp *l, const int *name, u_int namelen,
const struct sysctlnode **rnode, int *nip)
{
const struct sysctlnode *node, *pnode;
int tn, si, ni, error, alias;
KASSERT(rw_lock_held(&sysctl_treelock));
/*
* basic checks and setup
*/
if (*rnode == NULL)
*rnode = &sysctl_root;
if (nip)
*nip = 0;
if (namelen == 0)
return (0);
/*
* search starts from "root"
*/
pnode = *rnode;
if (SYSCTL_VERS(pnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_locate: pnode %p wrong version\n", pnode);
return (EINVAL);
}
node = pnode->sysctl_child;
error = 0;
/*
* scan for node to which new node should be attached
*/
for (ni = 0; ni < namelen; ni++) {
/*
* walked off bottom of tree
*/
if (node == NULL) {
if (SYSCTL_TYPE(pnode->sysctl_flags) == CTLTYPE_NODE)
error = ENOENT;
else
error = ENOTDIR;
break;
}
/*
* can anyone traverse this node or only root?
*/
if (l != NULL && (pnode->sysctl_flags & CTLFLAG_PRIVATE) &&
(error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_PRVT,
NULL, NULL, NULL)) != 0)
return (error);
/*
* find a child node with the right number
*/
tn = name[ni];
alias = 0;
si = 0;
/*
* Note: ANYNUMBER only matches positive integers.
* Since ANYNUMBER is only permitted on single-node
* sub-trees (eg proc), check before the loop and skip
* it if we can.
*/
if ((node[si].sysctl_flags & CTLFLAG_ANYNUMBER) && (tn >= 0))
goto foundit;
for (; si < pnode->sysctl_clen; si++) {
if (node[si].sysctl_num == tn) {
if (node[si].sysctl_flags & CTLFLAG_ALIAS) {
if (alias++ == 4)
break;
else {
tn = node[si].sysctl_alias;
si = -1;
}
} else
goto foundit;
}
}
/*
* if we ran off the end, it obviously doesn't exist
*/
error = ENOENT;
break;
/*
* so far so good, move on down the line
*/
foundit:
pnode = &node[si];
if (SYSCTL_TYPE(pnode->sysctl_flags) == CTLTYPE_NODE)
node = node[si].sysctl_child;
else
node = NULL;
}
*rnode = pnode;
if (nip)
*nip = ni;
return (error);
}
/*
* sysctl_query -- The auto-discovery engine. Copies out the structs
* describing nodes under the given node and handles overlay trees.
*/
int
sysctl_query(SYSCTLFN_ARGS)
{
int error, ni, elim, v;
size_t out, left, t;
const struct sysctlnode *enode, *onode;
struct sysctlnode qnode;
KASSERT(rw_lock_held(&sysctl_treelock));
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_query: rnode %p wrong version\n", rnode);
return (EINVAL);
}
if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE)
return (ENOTDIR);
if (namelen != 1 || name[0] != CTL_QUERY)
return (EINVAL);
error = 0;
out = 0;
left = *oldlenp;
elim = 0;
enode = NULL;
/*
* translate the given request to a current node
*/
error = sysctl_cvt_in(l, &v, newp, newlen, &qnode);
if (error)
return (error);
/*
* if the request specifies a version, check it
*/
if (qnode.sysctl_ver != 0) {
enode = rnode;
if (qnode.sysctl_ver != enode->sysctl_ver &&
qnode.sysctl_ver != sysctl_rootof(enode)->sysctl_ver)
return (EINVAL);
}
/*
* process has overlay tree
*/
if (l && l->l_proc->p_emul->e_sysctlovly) {
enode = l->l_proc->p_emul->e_sysctlovly;
elim = (name - oname);
error = sysctl_locate(l, oname, elim, &enode, NULL);
if (error == 0) {
/* ah, found parent in overlay */
elim = enode->sysctl_clen;
enode = enode->sysctl_child;
} else {
error = 0;
elim = 0;
enode = NULL;
}
}
for (ni = 0; ni < rnode->sysctl_clen; ni++) {
onode = &rnode->sysctl_child[ni];
if (enode && enode->sysctl_num == onode->sysctl_num) {
if (SYSCTL_TYPE(enode->sysctl_flags) != CTLTYPE_NODE)
onode = enode;
if (--elim > 0)
enode++;
else
enode = NULL;
}
error = sysctl_cvt_out(l, v, onode, oldp, left, &t);
if (error)
return (error);
if (oldp != NULL)
oldp = (char*)oldp + t;
out += t;
left -= MIN(left, t);
}
/*
* overlay trees *MUST* be entirely consumed
*/
KASSERT(enode == NULL);
*oldlenp = out;
return (error);
}
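/*
 * Usage sketch (illustrative, userland side): a query appends CTL_QUERY
 * to the mib of the node to enumerate and reads back an array of
 * struct sysctlnode describing its children:
 */
#if 0
int mib[2] = { CTL_KERN, CTL_QUERY };
struct sysctlnode q, nodes[128];
size_t len = sizeof(nodes);

memset(&q, 0, sizeof(q));
q.sysctl_flags = SYSCTL_VERSION;
if (sysctl(mib, 2, nodes, &len, &q, sizeof(q)) == 0)
printf("%zu children\n", len / sizeof(nodes[0]));
#endif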
/*
* sysctl_create -- Adds a node (the description of which is taken
* from newp) to the tree, returning a copy of it in the space pointed
* to by oldp. In the event that the requested slot is already taken
* (either by name or by number), the offending node is returned
* instead. Yes, this is complex, but we want to make sure everything
* is proper.
*/
#ifdef SYSCTL_DEBUG_CREATE
int _sysctl_create(SYSCTLFN_ARGS);
int
_sysctl_create(SYSCTLFN_ARGS)
#else
int
sysctl_create(SYSCTLFN_ARGS)
#endif
{
struct sysctlnode nnode, *node, *pnode;
int error, ni, at, nm, type, nsz, sz, flags, anum, v;
void *own;
KASSERT(rw_write_held(&sysctl_treelock));
error = 0;
own = NULL;
anum = -1;
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_create: rnode %p wrong version\n", rnode);
return (EINVAL);
}
if (namelen != 1 || (name[namelen - 1] != CTL_CREATE
#if NKSYMS > 0
&& name[namelen - 1] != CTL_CREATESYM
#endif /* NKSYMS > 0 */
))
return (EINVAL);
/*
* processes can only add nodes at securelevel 0, must be
* root, and can't add nodes to a parent that's not writeable
*/
if (l != NULL) {
#ifndef SYSCTL_DISALLOW_CREATE
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_ADD, NULL, NULL, NULL);
if (error)
return (error);
if (!(rnode->sysctl_flags & CTLFLAG_READWRITE))
#endif /* SYSCTL_DISALLOW_CREATE */
return (EPERM);
}
/*
* nothing can add a node if:
* we've finished initial set up of this tree and
* (the tree itself is not writeable or
* the entire sysctl system is not writeable)
*/
if ((sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_PERMANENT) &&
(!(sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_READWRITE) ||
!(sysctl_root.sysctl_flags & CTLFLAG_READWRITE)))
return (EPERM);
/*
* it must be a "node", not a "int" or something
*/
if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE)
return (ENOTDIR);
if (rnode->sysctl_flags & CTLFLAG_ALIAS) {
printf("sysctl_create: attempt to add node to aliased "
"node %p\n", rnode);
return (EINVAL);
}
pnode = __UNCONST(rnode); /* we are adding children to this node */
if (newp == NULL)
return (EINVAL);
error = sysctl_cvt_in(l, &v, newp, newlen, &nnode);
if (error)
return (error);
/*
* nodes passed in don't *have* parents
*/
if (nnode.sysctl_parent != NULL)
return (EINVAL);
/*
* if we are indeed adding it, it should be a "good" name and
* number
*/
nm = nnode.sysctl_num;
#if NKSYMS > 0
if (nm == CTL_CREATESYM)
nm = CTL_CREATE;
#endif /* NKSYMS > 0 */
if (nm < 0 && nm != CTL_CREATE)
return (EINVAL);
/*
* the name can't start with a digit
*/
if (nnode.sysctl_name[0] >= '0' &&
nnode.sysctl_name[0] <= '9')
return (EINVAL);
/*
* the name must be only alphanumerics or - or _, longer than
* 0 bytes and less than SYSCTL_NAMELEN
*/
nsz = 0;
while (nsz < SYSCTL_NAMELEN && nnode.sysctl_name[nsz] != '\0') {
if ((nnode.sysctl_name[nsz] >= '0' &&
nnode.sysctl_name[nsz] <= '9') ||
(nnode.sysctl_name[nsz] >= 'A' &&
nnode.sysctl_name[nsz] <= 'Z') ||
(nnode.sysctl_name[nsz] >= 'a' &&
nnode.sysctl_name[nsz] <= 'z') ||
nnode.sysctl_name[nsz] == '-' ||
nnode.sysctl_name[nsz] == '_')
nsz++;
else
return (EINVAL);
}
if (nsz == 0 || nsz == SYSCTL_NAMELEN)
return (EINVAL);
/*
* various checks revolve around size vs type, etc
*/
type = SYSCTL_TYPE(nnode.sysctl_flags);
flags = SYSCTL_FLAGS(nnode.sysctl_flags);
sz = nnode.sysctl_size;
/*
* find out if there's a collision, and if so, let the caller
* know what they collided with
*/
node = pnode->sysctl_child;
at = 0;
if (node) {
if ((flags | node->sysctl_flags) & CTLFLAG_ANYNUMBER)
/* No siblings for a CTLFLAG_ANYNUMBER node */
return EINVAL;
for (ni = 0; ni < pnode->sysctl_clen; ni++) {
if (nm == node[ni].sysctl_num ||
strcmp(nnode.sysctl_name, node[ni].sysctl_name) == 0) {
/*
* ignore error here, since we
* are already fixed on EEXIST
*/
(void)sysctl_cvt_out(l, v, &node[ni], oldp,
*oldlenp, oldlenp);
return (EEXIST);
}
if (nm > node[ni].sysctl_num)
at++;
}
}
/*
* use sysctl_ver to add to the tree iff it hasn't changed
*/
if (nnode.sysctl_ver != 0) {
/*
* a specified value must match either the parent
* node's version or the root node's version
*/
if (nnode.sysctl_ver != sysctl_rootof(rnode)->sysctl_ver &&
nnode.sysctl_ver != rnode->sysctl_ver) {
return (EINVAL);
}
}
/*
* only the kernel can assign functions to entries
*/
if (l != NULL && nnode.sysctl_func != NULL)
return (EPERM);
/*
* only the kernel can create permanent entries, and only then
* before the kernel is finished setting itself up
*/
if (l != NULL && (flags & ~SYSCTL_USERFLAGS))
return (EPERM);
if ((flags & CTLFLAG_PERMANENT) &
(sysctl_root.sysctl_flags & CTLFLAG_PERMANENT))
return (EPERM);
if ((flags & (CTLFLAG_OWNDATA | CTLFLAG_IMMEDIATE)) ==
(CTLFLAG_OWNDATA | CTLFLAG_IMMEDIATE))
return (EINVAL);
if ((flags & CTLFLAG_IMMEDIATE) &&
type != CTLTYPE_INT && type != CTLTYPE_QUAD && type != CTLTYPE_BOOL)
return (EINVAL);
/*
* check size, or set it if unset and we can figure it out.
* kernel created nodes are allowed to have a function instead
* of a size (or a data pointer).
*/
switch (type) {
case CTLTYPE_NODE:
/*
* only *i* can assert the size of a node
*/
if (flags & CTLFLAG_ALIAS) {
anum = nnode.sysctl_alias;
if (anum < 0)
return (EINVAL);
nnode.sysctl_alias = 0;
}
if (sz != 0 || nnode.sysctl_data != NULL)
return (EINVAL);
if (nnode.sysctl_csize != 0 ||
nnode.sysctl_clen != 0 ||
nnode.sysctl_child != 0)
return (EINVAL);
if (flags & CTLFLAG_OWNDATA)
return (EINVAL);
sz = sizeof(struct sysctlnode);
break;
case CTLTYPE_INT:
/*
* since an int is an int, if the size is not given or
* is wrong, we can "int-uit" it.
*/
if (sz != 0 && sz != sizeof(int))
return (EINVAL);
sz = sizeof(int);
break;
case CTLTYPE_STRING:
/*
* strings are a little more tricky
*/
if (sz == 0) {
if (l == NULL) {
if (nnode.sysctl_func == NULL) {
if (nnode.sysctl_data == NULL)
return (EINVAL);
else
sz = strlen(nnode.sysctl_data) +
1;
}
} else if (nnode.sysctl_data == NULL &&
flags & CTLFLAG_OWNDATA) {
return (EINVAL);
} else {
char *vp, *e;
size_t s;
/*
* we want a rough idea of what the
* size is now
*/
vp = malloc(PAGE_SIZE, M_SYSCTLDATA, M_WAITOK);
if (vp == NULL)
return (ENOMEM);
e = nnode.sysctl_data;
do {
error = copyinstr(e, vp, PAGE_SIZE, &s);
if (error) {
if (error != ENAMETOOLONG) {
free(vp, M_SYSCTLDATA);
return (error);
}
e += PAGE_SIZE;
if ((e - 32 * PAGE_SIZE) >
(char*)nnode.sysctl_data) {
free(vp, M_SYSCTLDATA);
return (ERANGE);
}
}
} while (error != 0);
sz = s + (e - (char*)nnode.sysctl_data);
free(vp, M_SYSCTLDATA);
}
}
break;
case CTLTYPE_QUAD:
if (sz != 0 && sz != sizeof(u_quad_t))
return (EINVAL);
sz = sizeof(u_quad_t);
break;
case CTLTYPE_BOOL:
/*
* since a bool is a bool, if the size is not given or
* is wrong, we can "intuit" it.
*/
if (sz != 0 && sz != sizeof(bool))
return (EINVAL);
sz = sizeof(bool);
break;
case CTLTYPE_STRUCT:
if (sz == 0) {
if (l != NULL || nnode.sysctl_func == NULL)
return (EINVAL);
if (flags & CTLFLAG_OWNDATA)
return (EINVAL);
}
break;
default:
return (EINVAL);
}
/*
* at this point, if sz is zero, we *must* have a
* function to go with it and we can't own it.
*/
/*
* l ptr own
* 0 0 0 -> EINVAL (if no func)
* 0 0 1 -> own
* 0 1 0 -> kptr
* 0 1 1 -> kptr
* 1 0 0 -> EINVAL
* 1 0 1 -> own
* 1 1 0 -> kptr, no own (fault on lookup)
* 1 1 1 -> uptr, own
*/
if (type != CTLTYPE_NODE) {
if (sz != 0) {
if (flags & CTLFLAG_OWNDATA) {
own = malloc(sz, M_SYSCTLDATA, M_WAITOK);
if (own == NULL)
return ENOMEM;
if (nnode.sysctl_data == NULL)
memset(own, 0, sz);
else {
error = sysctl_copyin(l,
nnode.sysctl_data, own, sz);
if (error != 0) {
free(own, M_SYSCTLDATA);
return (error);
}
}
} else if ((nnode.sysctl_data != NULL) &&
!(flags & CTLFLAG_IMMEDIATE)) {
#if NKSYMS > 0
if (name[namelen - 1] == CTL_CREATESYM) {
char symname[128]; /* XXX enough? */
u_long symaddr;
size_t symlen;
error = sysctl_copyinstr(l,
nnode.sysctl_data, symname,
sizeof(symname), &symlen);
if (error)
return (error);
error = ksyms_getval(NULL, symname,
&symaddr, KSYMS_EXTERN);
if (error)
return (error); /* EINVAL? */
nnode.sysctl_data = (void*)symaddr;
}
#endif /* NKSYMS > 0 */
/*
* Ideally, we'd like to verify here
* that this address is acceptable,
* but...
*
* - it might be valid now, only to
* become invalid later
*
* - it might be invalid only for the
* moment and valid later
*
* - or something else.
*
* Since we can't get a good answer,
* we'll just accept the address as
* given, and fault on individual
* lookups.
*/
}
} else if (nnode.sysctl_func == NULL)
return (EINVAL);
}
/*
* a process can't assign a function to a node, and the kernel
* can't create a node that has no function or data.
* (XXX somewhat redundant check)
*/
if (l != NULL || nnode.sysctl_func == NULL) {
if (type != CTLTYPE_NODE &&
!(flags & CTLFLAG_IMMEDIATE) &&
nnode.sysctl_data == NULL &&
own == NULL)
return (EINVAL);
}
#ifdef SYSCTL_DISALLOW_KWRITE
/*
* a process can't create a writable node unless it refers to
* new data.
*/
if (l != NULL && own == NULL && type != CTLTYPE_NODE &&
(flags & CTLFLAG_READWRITE) != CTLFLAG_READONLY &&
!(flags & CTLFLAG_IMMEDIATE))
return (EPERM);
#endif /* SYSCTL_DISALLOW_KWRITE */
/*
* make sure there's somewhere to put the new stuff.
*/
if (pnode->sysctl_child == NULL) {
if (flags & CTLFLAG_ANYNUMBER)
error = sysctl_alloc(pnode, 1);
else
error = sysctl_alloc(pnode, 0);
if (error) {
if (own != NULL)
free(own, M_SYSCTLDATA);
return (error);
}
}
node = pnode->sysctl_child;
/*
* no collisions, so pick a good dynamic number if we need to.
*/
if (nm == CTL_CREATE) {
nm = ++sysctl_root.sysctl_num;
for (ni = 0; ni < pnode->sysctl_clen; ni++) {
if (nm == node[ni].sysctl_num) {
nm++;
ni = -1;
} else if (nm > node[ni].sysctl_num)
at = ni + 1;
}
}
/*
* oops...ran out of space
*/
if (pnode->sysctl_clen == pnode->sysctl_csize) {
error = sysctl_realloc(pnode);
if (error) {
if (own != NULL)
free(own, M_SYSCTLDATA);
return (error);
}
node = pnode->sysctl_child;
}
/*
* insert new node data
*/
if (at < pnode->sysctl_clen) {
int t;
/*
* move the nodes that should come after the new one
*/
memmove(&node[at + 1], &node[at],
(pnode->sysctl_clen - at) * sizeof(struct sysctlnode));
memset(&node[at], 0, sizeof(struct sysctlnode));
node[at].sysctl_parent = pnode;
/*
* and...reparent any children of any moved nodes
*/
for (ni = at; ni <= pnode->sysctl_clen; ni++)
if (node[ni].sysctl_child != NULL)
for (t = 0; t < node[ni].sysctl_csize; t++)
node[ni].sysctl_child[t].sysctl_parent =
&node[ni];
}
node = &node[at];
pnode->sysctl_clen++;
strlcpy(node->sysctl_name, nnode.sysctl_name,
sizeof(node->sysctl_name));
node->sysctl_num = nm;
node->sysctl_size = sz;
node->sysctl_flags = SYSCTL_VERSION|type|flags; /* XXX other trees */
node->sysctl_csize = 0;
node->sysctl_clen = 0;
if (own) {
node->sysctl_data = own;
node->sysctl_flags |= CTLFLAG_OWNDATA;
} else if (flags & CTLFLAG_ALIAS) {
node->sysctl_alias = anum;
} else if (flags & CTLFLAG_IMMEDIATE) {
switch (type) {
case CTLTYPE_BOOL:
node->sysctl_bdata = nnode.sysctl_bdata;
break;
case CTLTYPE_INT:
node->sysctl_idata = nnode.sysctl_idata;
break;
case CTLTYPE_QUAD:
node->sysctl_qdata = nnode.sysctl_qdata;
break;
}
} else {
node->sysctl_data = nnode.sysctl_data;
node->sysctl_flags &= ~CTLFLAG_OWNDATA;
}
node->sysctl_func = nnode.sysctl_func;
node->sysctl_child = NULL;
/* node->sysctl_parent should already be done */
/*
* update "version" on path to "root"
*/
for (; rnode->sysctl_parent != NULL; rnode = rnode->sysctl_parent)
;
pnode = node;
for (nm = rnode->sysctl_ver + 1; pnode != NULL;
pnode = pnode->sysctl_parent)
pnode->sysctl_ver = nm;
/* If this fails, the node is already added - the user won't know! */
error = sysctl_cvt_out(l, v, node, oldp, *oldlenp, oldlenp);
return (error);
}
/*
* ********************************************************************
* A wrapper around sysctl_create() that prints the thing we're trying
* to add.
* ********************************************************************
*/
#ifdef SYSCTL_DEBUG_CREATE
int
sysctl_create(SYSCTLFN_ARGS)
{
const struct sysctlnode *node;
int k, v, rc, ni, nl = namelen + (name - oname);
struct sysctlnode nnode;
if (newp == NULL)
return EINVAL;
int error = sysctl_cvt_in(l, &v, newp, newlen, &nnode);
if (error)
return error;
node = &nnode;
printf("namelen %d (", nl);
for (ni = 0; ni < nl - 1; ni++)
printf(" %d", oname[ni]);
printf(" %d )\t[%s]\tflags %08x (%08x %d %zu)\n",
k = node->sysctl_num,
node->sysctl_name,
node->sysctl_flags,
SYSCTL_FLAGS(node->sysctl_flags),
SYSCTL_TYPE(node->sysctl_flags),
node->sysctl_size);
node = rnode;
rc = _sysctl_create(SYSCTLFN_CALL(rnode));
printf("sysctl_create(");
for (ni = 0; ni < nl - 1; ni++)
printf(" %d", oname[ni]);
printf(" %d ) returned %d\n", k, rc);
return (rc);
}
#endif /* SYSCTL_DEBUG_CREATE */
/*
* sysctl_destroy -- Removes a node (as described by newp) from the
* given tree, returning (if successful) a copy of the dead node in
* oldp. Since we're removing stuff, there's not much to check.
*/
int
sysctl_destroy(SYSCTLFN_ARGS)
{
struct sysctlnode *node, *pnode, onode, nnode;
int ni, error, v;
KASSERT(rw_write_held(&sysctl_treelock));
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_destroy: rnode %p wrong version\n", rnode);
return (EINVAL);
}
error = 0;
if (namelen != 1 || name[namelen - 1] != CTL_DESTROY)
return (EINVAL);
/*
* processes can only destroy nodes at securelevel 0, must be
* root, and can't remove nodes from a parent that's not
* writeable
*/
if (l != NULL) {
#ifndef SYSCTL_DISALLOW_CREATE
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_DELETE, NULL, NULL, NULL);
if (error)
return (error);
if (!(rnode->sysctl_flags & CTLFLAG_READWRITE))
#endif /* SYSCTL_DISALLOW_CREATE */
return (EPERM);
}
/*
* nothing can remove a node if:
* the node is permanent (checked later) or
* the tree itself is not writeable or
* the entire sysctl system is not writeable
*
* note that we ignore whether setup is complete or not,
* because these rules always apply.
*/
if (!(sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_READWRITE) ||
!(sysctl_root.sysctl_flags & CTLFLAG_READWRITE))
return (EPERM);
if (newp == NULL)
return (EINVAL);
error = sysctl_cvt_in(l, &v, newp, newlen, &nnode);
if (error)
return (error);
memset(&onode, 0, sizeof(struct sysctlnode));
node = rnode->sysctl_child;
for (ni = 0; ni < rnode->sysctl_clen; ni++) {
if (nnode.sysctl_num == node[ni].sysctl_num) {
/*
* if name specified, must match
*/
if (nnode.sysctl_name[0] != '\0' &&
strcmp(nnode.sysctl_name, node[ni].sysctl_name))
continue;
/*
* if version specified, must match
*/
if (nnode.sysctl_ver != 0 &&
nnode.sysctl_ver != node[ni].sysctl_ver)
continue;
/*
* this must be the one
*/
break;
}
}
if (ni == rnode->sysctl_clen)
return (ENOENT);
node = &node[ni];
pnode = node->sysctl_parent;
/*
* if the kernel says permanent, it is, so there. nyah.
*/
if (SYSCTL_FLAGS(node->sysctl_flags) & CTLFLAG_PERMANENT)
return (EPERM);
/*
* can't delete non-empty nodes
*/
if (SYSCTL_TYPE(node->sysctl_flags) == CTLTYPE_NODE &&
node->sysctl_clen != 0)
return (ENOTEMPTY);
/*
* if the node "owns" data, release it now
*/
if (node->sysctl_flags & CTLFLAG_OWNDATA) {
if (node->sysctl_data != NULL)
free(node->sysctl_data, M_SYSCTLDATA);
node->sysctl_data = NULL;
}
if (node->sysctl_flags & CTLFLAG_OWNDESC) {
if (node->sysctl_desc != NULL)
/*XXXUNCONST*/
free(__UNCONST(node->sysctl_desc), M_SYSCTLDATA);
node->sysctl_desc = NULL;
}
/*
* if the node to be removed is not the last one on the list,
* move the remaining nodes up, and reparent any grandchildren
*/
onode = *node;
if (ni < pnode->sysctl_clen - 1) {
int t;
memmove(&pnode->sysctl_child[ni], &pnode->sysctl_child[ni + 1],
(pnode->sysctl_clen - ni - 1) *
sizeof(struct sysctlnode));
for (; ni < pnode->sysctl_clen - 1; ni++)
if (SYSCTL_TYPE(pnode->sysctl_child[ni].sysctl_flags) ==
CTLTYPE_NODE)
for (t = 0;
t < pnode->sysctl_child[ni].sysctl_clen;
t++)
pnode->sysctl_child[ni].sysctl_child[t].
sysctl_parent =
&pnode->sysctl_child[ni];
ni = pnode->sysctl_clen - 1;
node = &pnode->sysctl_child[ni];
}
/*
* reset the space we just vacated
*/
memset(node, 0, sizeof(struct sysctlnode));
node->sysctl_parent = pnode;
pnode->sysctl_clen--;
/*
* if this parent just lost its last child, nuke the creche
*/
if (pnode->sysctl_clen == 0) {
free(pnode->sysctl_child, M_SYSCTLNODE);
pnode->sysctl_csize = 0;
pnode->sysctl_child = NULL;
}
/*
* update "version" on path to "root"
*/
for (; rnode->sysctl_parent != NULL; rnode = rnode->sysctl_parent)
;
for (ni = rnode->sysctl_ver + 1; pnode != NULL;
pnode = pnode->sysctl_parent)
pnode->sysctl_ver = ni;
error = sysctl_cvt_out(l, v, &onode, oldp, *oldlenp, oldlenp);
return (error);
}
/*
* sysctl_lookup -- Handles copyin/copyout of new and old values.
* Partial reads are globally allowed. Only root can write to things
* unless the node says otherwise.
*/
int
sysctl_lookup(SYSCTLFN_ARGS)
{
int error, rw;
size_t sz, len;
void *d;
KASSERT(rw_lock_held(&sysctl_treelock));
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("%s: rnode %p wrong version\n", __func__, rnode);
return EINVAL;
}
if (newlen == 0)
newp = NULL;
error = 0;
/*
* you can't "look up" a node. you can "query" it, but you
* can't "look it up".
*/
if (SYSCTL_TYPE(rnode->sysctl_flags) == CTLTYPE_NODE || namelen != 0) {
DPRINTF(("%s: can't lookup a node\n", __func__));
return EINVAL;
}
/*
* some nodes are private, so only root can look into them.
*/
if (l != NULL && (rnode->sysctl_flags & CTLFLAG_PRIVATE) &&
(error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_PRVT, NULL, NULL, NULL)) != 0) {
DPRINTF(("%s: private node\n", __func__));
return error;
}
/*
* if a node wants to be writable according to different rules
* other than "only root can write to stuff unless a flag is
* set", then it needs its own function which should have been
* called and not us.
*/
if (l != NULL && newp != NULL && !(rnode->sysctl_flags & CTLFLAG_ANYWRITE) &&
(error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_MODIFY, NULL, NULL,
NULL)) != 0) {
DPRINTF(("%s: can't modify\n", __func__));
return error;
}
/*
* is this node supposedly writable?
*/
rw = (rnode->sysctl_flags & CTLFLAG_READWRITE) ? 1 : 0;
/*
* it appears not to be writable at this time, so if someone
* tried to write to it, we must tell them to go away
*/
if (!rw && newp != NULL) {
DPRINTF(("%s: not writable\n", __func__));
return EPERM;
}
/*
* step one, copy out the stuff we have presently
*/
if (rnode->sysctl_flags & CTLFLAG_IMMEDIATE) {
/*
* note that we discard const here because we are
* modifying the contents of the node (which is okay
* because it's ours)
*
* It also doesn't matter which field of the union we pick.
*/
d = __UNCONST(&rnode->sysctl_qdata);
} else
d = rnode->sysctl_data;
if (SYSCTL_TYPE(rnode->sysctl_flags) == CTLTYPE_STRING)
sz = strlen(d) + 1; /* XXX@@@ possible fault here */
else
sz = rnode->sysctl_size;
if (oldp != NULL) {
error = sysctl_copyout(l, d, oldp, MIN(sz, *oldlenp));
if (error) {
DPRINTF(("%s: bad copyout %d\n", __func__, error));
return error;
}
}
*oldlenp = sz;
/*
* are we done?
*/
if (newp == NULL)
return 0;
/*
* hmm...not done. must now "copy in" new value. re-adjust
* sz to maximum value (strings are "weird").
*/
sz = rnode->sysctl_size;
switch (SYSCTL_TYPE(rnode->sysctl_flags)) {
case CTLTYPE_BOOL: {
bool tmp;
/*
* these data must be *exactly* the same size coming
* in. bool may only be true or false.
*/
if (newlen != sz) {
DPRINTF(("%s: bad size %zu != %zu\n", __func__, newlen,
sz));
return EINVAL;
}
error = sysctl_copyin(l, newp, &tmp, sz);
if (error)
break;
if (tmp != true && tmp != false) {
DPRINTF(("%s: tmp %d\n", __func__, tmp));
return EINVAL;
}
*(bool *)d = tmp;
break;
}
case CTLTYPE_INT:
case CTLTYPE_QUAD:
case CTLTYPE_STRUCT:
/*
* these data must be *exactly* the same size coming
* in.
*/
if (newlen != sz)
goto bad_size;
error = sysctl_copyin(l, newp, d, sz);
rnd_add_data(NULL, d, sz, 0);
break;
case CTLTYPE_STRING: {
/*
* strings, on the other hand, can be shorter, and we
* let userland be sloppy about the trailing nul.
*/
char *newbuf;
/*
* too much new string?
*/
if (newlen > sz)
goto bad_size;
/*
* temporary copy of new inbound string
*/
len = MIN(sz, newlen);
newbuf = malloc(len, M_SYSCTLDATA, M_WAITOK);
if (newbuf == NULL) {
DPRINTF(("%s: oomem %zu\n", __func__, len));
return ENOMEM;
}
error = sysctl_copyin(l, newp, newbuf, len);
if (error) {
free(newbuf, M_SYSCTLDATA);
DPRINTF(("%s: copyin %d\n", __func__, error));
return error;
}
/*
* did they NUL terminate it, or do we have space
* left to do it ourselves?
*/
if (newbuf[len - 1] != '\0' && len == sz) {
free(newbuf, M_SYSCTLDATA);
DPRINTF(("%s: string too long\n", __func__));
return EINVAL;
}
/*
* looks good, so pop it into place and zero the rest.
*/
if (len > 0) {
memcpy(d, newbuf, len);
rnd_add_data(NULL, d, len, 0);
}
if (sz != len)
memset((char *)d + len, 0, sz - len);
free(newbuf, M_SYSCTLDATA);
break;
}
default:
DPRINTF(("%s: bad type\n", __func__));
return EINVAL;
}
if (error) {
DPRINTF(("%s: copyin %d\n", __func__, error));
}
return error;
bad_size:
DPRINTF(("%s: bad size %zu > %zu\n", __func__, newlen, sz));
return EINVAL;
}
/*
* sysctl_mmap -- Dispatches sysctl mmap requests to those nodes that
* purport to handle it. This interface isn't fully fleshed out yet,
* unfortunately.
*/
static int
sysctl_mmap(SYSCTLFN_ARGS)
{
const struct sysctlnode *node;
struct sysctlnode nnode;
int error;
int sysctl_num;
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_mmap: rnode %p wrong version\n", rnode);
return (EINVAL);
}
/*
* let's just pretend that didn't happen, m'kay?
*/
if (l == NULL)
return (EPERM);
/*
* is this a sysctlnode description of an mmap request?
*/
if (newp == NULL || newlen != sizeof(struct sysctlnode))
return (EINVAL);
error = sysctl_copyin(l, newp, &nnode, sizeof(nnode));
if (error)
return (error);
/*
* does the node they asked for exist?
*/
if (namelen != 1)
return (EOPNOTSUPP);
node = rnode;
sysctl_num = nnode.sysctl_num;
error = sysctl_locate(l, &sysctl_num, 1, &node, NULL);
if (error)
return (error);
/*
* does this node that we have found purport to handle mmap?
*/
if (node->sysctl_func == NULL ||
!(node->sysctl_flags & CTLFLAG_MMAP))
return (EOPNOTSUPP);
/*
* well...okay, they asked for it.
*/
return ((*node->sysctl_func)(SYSCTLFN_CALL(node)));
}
int
sysctl_describe(SYSCTLFN_ARGS)
{
struct sysctldesc *d;
void *bf;
size_t sz, left, tot;
int i, error, v = -1;
struct sysctlnode *node;
struct sysctlnode dnode;
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_query: rnode %p wrong version\n", rnode);
return (EINVAL);
}
if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE)
return (ENOTDIR);
if (namelen != 1 || name[0] != CTL_DESCRIBE)
return (EINVAL);
/*
* get ready...
*/
error = 0;
d = bf = malloc(MAXDESCLEN, M_TEMP, M_WAITOK);
if (bf == NULL)
return ENOMEM;
tot = 0;
node = rnode->sysctl_child;
left = *oldlenp;
/*
* no request -> all descriptions at this level
* request with desc unset -> just this node
* request with desc set -> set descr for this node
*/
if (newp != NULL) {
error = sysctl_cvt_in(l, &v, newp, newlen, &dnode);
if (error)
goto out;
if (dnode.sysctl_desc != NULL) {
/*
* processes cannot set descriptions above
* securelevel 0. and must be root. blah
* blah blah. a couple more checks are made
* once we find the node we want.
*/
if (l != NULL) {
#ifndef SYSCTL_DISALLOW_CREATE
error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_DESC, NULL,
NULL, NULL);
if (error)
goto out;
#else /* SYSCTL_DISALLOW_CREATE */
error = EPERM;
goto out;
#endif /* SYSCTL_DISALLOW_CREATE */
}
/*
* find node and try to set the description on it
*/
for (i = 0; i < rnode->sysctl_clen; i++)
if (node[i].sysctl_num == dnode.sysctl_num)
break;
if (i == rnode->sysctl_clen) {
error = ENOENT;
goto out;
}
node = &node[i];
/*
* did the caller specify a node version?
*/
if (dnode.sysctl_ver != 0 &&
dnode.sysctl_ver != node->sysctl_ver) {
error = EINVAL;
goto out;
}
/*
* okay...some rules:
* (1) if setup is done and either the tree or
* the whole system is read-only, no one can
* set a description
* (2) no one can set a description on a
* permanent node (it must be set when
* using createv)
* (3) processes cannot *change* a description
* (4) processes *can*, however, set a
* description on a read-only node so that
* one can be created and then described
* in two steps
* anything else come to mind?
*/
if ((sysctl_root.sysctl_flags & CTLFLAG_PERMANENT) &&
(!(sysctl_rootof(node)->sysctl_flags &
CTLFLAG_READWRITE) ||
!(sysctl_root.sysctl_flags & CTLFLAG_READWRITE))) {
error = EPERM;
goto out;
}
if (node->sysctl_flags & CTLFLAG_PERMANENT) {
error = EPERM;
goto out;
}
if (l != NULL && node->sysctl_desc != NULL) {
error = EPERM;
goto out;
}
/*
* right, let's go ahead. the first step is
* making the description into something the
* node can "own", if need be.
*/
if (l != NULL ||
dnode.sysctl_flags & CTLFLAG_OWNDESC) {
char *nd, *k;
k = malloc(MAXDESCLEN, M_TEMP, M_WAITOK);
if (k == NULL) {
error = ENOMEM;
goto out;
}
error = sysctl_copyinstr(l, dnode.sysctl_desc,
k, MAXDESCLEN, &sz);
if (error) {
free(k, M_TEMP);
goto out;
}
nd = malloc(sz, M_SYSCTLDATA, M_WAITOK);
if (nd == NULL) {
free(k, M_TEMP);
error = ENOMEM;
goto out;
}
memcpy(nd, k, sz);
dnode.sysctl_flags |= CTLFLAG_OWNDESC;
dnode.sysctl_desc = nd;
free(k, M_TEMP);
}
/*
* now "release" the old description and
* attach the new one. ta-da.
*/
if ((node->sysctl_flags & CTLFLAG_OWNDESC) &&
node->sysctl_desc != NULL)
/*XXXUNCONST*/
free(__UNCONST(node->sysctl_desc), M_SYSCTLDATA);
node->sysctl_desc = dnode.sysctl_desc;
node->sysctl_flags |=
(dnode.sysctl_flags & CTLFLAG_OWNDESC);
/*
* now we "fall out" and into the loop which
* will copy the new description back out for
* those interested parties
*/
}
}
/*
* scan for one description or just retrieve all descriptions
*/
for (i = 0; i < rnode->sysctl_clen; i++) {
/*
* did they ask for the description of only one node?
*/
if (v != -1 && node[i].sysctl_num != dnode.sysctl_num)
continue;
/*
* don't describe "private" nodes to non-suser users
*/
if ((node[i].sysctl_flags & CTLFLAG_PRIVATE) && (l != NULL) &&
!(kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_PRVT, NULL, NULL, NULL)))
continue;
/*
* is this description "valid"?
*/
memset(bf, 0, MAXDESCLEN);
if (node[i].sysctl_desc == NULL)
sz = 1;
else if (copystr(node[i].sysctl_desc, &d->descr_str[0],
MAXDESCLEN - sizeof(*d), &sz) != 0) {
/*
* erase possible partial description
*/
memset(bf, 0, MAXDESCLEN);
sz = 1;
}
/*
* we've got it, stuff it into the caller's buffer
*/
d->descr_num = node[i].sysctl_num;
d->descr_ver = node[i].sysctl_ver;
d->descr_len = sz; /* includes trailing nul */
sz = (char *)NEXT_DESCR(d) - (char *)d;
if (oldp != NULL && left >= sz) {
error = sysctl_copyout(l, d, oldp, sz);
if (error)
goto out;
left -= sz;
oldp = (void *)__sysc_desc_adv(oldp, d->descr_len);
}
tot += sz;
/*
* if we get this far with v not "unset", they asked
* for a specific node and we found it
*/
if (v != -1)
break;
}
/*
* did we find it after all?
*/
if (v != -1 && tot == 0)
error = ENOENT;
else
*oldlenp = tot;
out:
free(bf, M_TEMP);
return (error);
}
/*
* ********************************************************************
* Section 3: Create and destroy from inside the kernel
* ********************************************************************
* sysctl_createv() and sysctl_destroyv() are simpler-to-use
* interfaces for the kernel to fling new entries into the mib and rip
* them out later. In the case of sysctl_createv(), the returned copy
* of the node (see sysctl_create()) will be translated back into a
* pointer to the actual node.
*
* Note that sysctl_createv() will return 0 if the create request
* matches an existing node (ala mkdir -p), and that sysctl_destroyv()
* will return 0 if the node to be destroyed already does not exist
* (aka rm -f) or if it is a parent of other nodes.
*
* This allows two (or more) different subsystems to assert sub-tree
* existence before populating their own nodes, and to remove their
* own nodes without orphaning the others when they are done.
* ********************************************************************
*/
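/*
 * An illustrative sketch (kept in a comment, not compiled): a typical
 * in-kernel caller first asserts the parent node and then hangs a leaf
 * off it, letting CTL_CREATE pick the child's number.  The "example"
 * subtree name, example_value, and example_log below are hypothetical
 * placeholders, not nodes defined elsewhere in the tree.
 *
 *	static int example_value;
 *	static struct sysctllog *example_log;
 *	const struct sysctlnode *rnode;
 *
 *	sysctl_createv(&example_log, 0, NULL, &rnode,
 *	    CTLFLAG_PERMANENT,
 *	    CTLTYPE_NODE, "example", SYSCTL_DESCR("example subtree"),
 *	    NULL, 0, NULL, 0,
 *	    CTL_KERN, CTL_CREATE, CTL_EOL);
 *	sysctl_createv(&example_log, 0, &rnode, NULL,
 *	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 *	    CTLTYPE_INT, "value", SYSCTL_DESCR("example leaf"),
 *	    NULL, 0, &example_value, 0,
 *	    CTL_CREATE, CTL_EOL);
 *
 * When the caller is done, sysctl_teardown(&example_log) removes the
 * logged nodes again.
 */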
#undef sysctl_createv
int
sysctl_createv(struct sysctllog **log, int cflags,
const struct sysctlnode **rnode, const struct sysctlnode **cnode,
int flags, int type, const char *namep, const char *descr,
sysctlfn func, u_quad_t qv, void *newp, size_t newlen,
...)
{
va_list ap;
int error, ni, namelen, name[CTL_MAXNAME];
const struct sysctlnode *root, *pnode;
struct sysctlnode nnode, onode, *dnode;
size_t sz;
const struct sysctlnode *snode __diagused;
/*
* where are we putting this?
*/
if (rnode != NULL && *rnode == NULL) {
printf("sysctl_createv: rnode NULL\n");
return (EINVAL);
}
root = rnode ? *rnode : NULL;
if (cnode != NULL)
*cnode = NULL;
if (cflags != 0)
return (EINVAL);
/*
* what is it?
*/
flags = SYSCTL_VERSION|SYSCTL_TYPE(type)|SYSCTL_FLAGS(flags);
if (log != NULL)
flags &= ~CTLFLAG_PERMANENT;
/*
* where do we put it?
*/
va_start(ap, newlen);
namelen = 0;
error = 0;
ni = -1;
do {
if (++ni == CTL_MAXNAME) {
error = ENAMETOOLONG;
break;
}
name[ni] = va_arg(ap, int);
/*
* sorry, this is not supported from here
*/
if (name[ni] == CTL_CREATESYM) {
error = EINVAL;
break;
}
} while (name[ni] != CTL_EOL && name[ni] != CTL_CREATE);
va_end(ap);
if (error)
return error;
namelen = ni + (name[ni] == CTL_CREATE ? 1 : 0);
/*
* what's it called
*/
if (strlcpy(nnode.sysctl_name, namep, sizeof(nnode.sysctl_name)) >=
sizeof(nnode.sysctl_name))
return (ENAMETOOLONG);
/*
* cons up the description of the new node
*/
nnode.sysctl_num = name[namelen - 1];
name[namelen - 1] = CTL_CREATE;
nnode.sysctl_size = newlen;
nnode.sysctl_flags = flags;
if (type == CTLTYPE_NODE) {
nnode.sysctl_csize = 0;
nnode.sysctl_clen = 0;
nnode.sysctl_child = NULL;
if (flags & CTLFLAG_ALIAS)
nnode.sysctl_alias = qv;
} else if (flags & CTLFLAG_IMMEDIATE) {
switch (type) {
case CTLTYPE_BOOL:
nnode.sysctl_bdata = qv;
break;
case CTLTYPE_INT:
nnode.sysctl_idata = qv;
break;
case CTLTYPE_QUAD:
nnode.sysctl_qdata = qv;
break;
default:
return (EINVAL);
}
} else {
nnode.sysctl_data = newp;
}
nnode.sysctl_func = func;
nnode.sysctl_parent = NULL;
nnode.sysctl_ver = 0;
/*
* initialize lock state -- we need locks if the main tree has
* been marked as complete, but since we could be called from
* either there, or from a device driver (say, at device
* insertion), or from a module (at module load time, say), we
* don't really want to "wait"...
*/
sysctl_lock(true);
/*
* locate the prospective parent of the new node, and if we
* find it, add the new node.
*/
sz = sizeof(onode);
pnode = root;
error = sysctl_locate(NULL, &name[0], namelen - 1, &pnode, &ni);
if (error) {
/*
* XXX: If you are seeing this printf in early bringup
* stages, perhaps your setfault is not functioning and
* thus kcopy() is mis-behaving.
*/
printf("sysctl_createv: sysctl_locate(%s) returned %d\n",
nnode.sysctl_name, error);
sysctl_unlock();
return (error);
}
error = sysctl_create(&name[ni], namelen - ni, &onode, &sz,
&nnode, sizeof(nnode), &name[0], NULL,
pnode);
/*
* unfortunately the node we wanted to create is already
* there. if the node that's already there is a reasonable
* facsimile of the node we wanted to create, just pretend
* (for the caller's benefit) that we managed to create the
* node they wanted.
*/
if (error == EEXIST) {
/* name is the same as requested... */
if (strcmp(nnode.sysctl_name, onode.sysctl_name) == 0 &&
/* they want the same function... */
nnode.sysctl_func == onode.sysctl_func &&
/* number is the same as requested, or... */
(nnode.sysctl_num == onode.sysctl_num ||
/* they didn't pick a number... */
nnode.sysctl_num == CTL_CREATE)) {
/*
* collision here from trying to create
* something that already existed; let's give
* our customers a hand and tell them they got
* what they wanted.
*/
#ifdef SYSCTL_DEBUG_CREATE
printf("cleared\n");
#endif /* SYSCTL_DEBUG_CREATE */
error = 0;
}
}
if (error == 0 &&
(cnode != NULL || log != NULL || descr != NULL)) {
/*
* sysctl_create() gave us back a copy of the node,
* but we need to know where it actually is...
*/
pnode = root;
error = sysctl_locate(NULL, &name[0], namelen - 1, &pnode, &ni);
snode = pnode;
/*
* manual scan of last layer so that aliased nodes
* aren't followed.
*/
if (error == 0) {
for (ni = 0; ni < pnode->sysctl_clen; ni++)
if (pnode->sysctl_child[ni].sysctl_num ==
onode.sysctl_num)
break;
if (ni < pnode->sysctl_clen)
pnode = &pnode->sysctl_child[ni];
else
error = ENOENT;
}
/*
* not expecting an error here, but...
*/
if (error == 0) {
KASSERTMSG(pnode->sysctl_parent == snode,
"sysctl parent mis-match pnode %s, snode %s",
pnode->sysctl_name, snode->sysctl_name);
if (log != NULL)
sysctl_log_add(log, pnode);
if (cnode != NULL)
*cnode = pnode;
if (descr != NULL) {
/*
* allow only the first caller that tries to
* *set* a description to actually set it
*
* discard const here so we can attach
* the description
*/
dnode = __UNCONST(pnode);
if (pnode->sysctl_desc != NULL)
/* skip it...we've got one */;
else if (flags & CTLFLAG_OWNDESC) {
size_t l = strlen(descr) + 1;
char *d = malloc(l, M_SYSCTLDATA,
M_WAITOK);
if (d != NULL) {
memcpy(d, descr, l);
dnode->sysctl_desc = d;
dnode->sysctl_flags |=
CTLFLAG_OWNDESC;
}
} else
dnode->sysctl_desc = descr;
}
} else {
printf("sysctl_create succeeded but node not found?!\n");
/*
* confusing, but the create said it
* succeeded, so...
*/
error = 0;
}
}
/*
* now it should be safe to release the lock state. note that
* the pointer to the newly created node being passed back may
* not be "good" for very long.
*/
sysctl_unlock();
if (error != 0) {
printf("sysctl_createv: sysctl_create(%s) returned %d\n",
nnode.sysctl_name, error);
#if 0
if (error != ENOENT)
sysctl_dump(&onode);
#endif
}
return (error);
}
int
sysctl_destroyv(struct sysctlnode *rnode, ...)
{
va_list ap;
int error, name[CTL_MAXNAME], namelen, ni;
const struct sysctlnode *pnode, *node;
struct sysctlnode dnode, *onode;
size_t sz;
va_start(ap, rnode);
namelen = 0;
ni = 0;
do {
if (ni == CTL_MAXNAME) {
va_end(ap);
return (ENAMETOOLONG);
}
name[ni] = va_arg(ap, int);
} while (name[ni++] != CTL_EOL);
namelen = ni - 1;
va_end(ap);
/*
* i can't imagine why we'd be destroying a node when the tree
* wasn't complete, but who knows?
*/
sysctl_lock(true);
/*
* where is it?
*/
node = rnode;
error = sysctl_locate(NULL, &name[0], namelen - 1, &node, &ni);
if (error) {
/* they want it gone and it's not there, so... */
sysctl_unlock();
return (error == ENOENT ? 0 : error);
}
/*
* set up the deletion
*/
pnode = node;
node = &dnode;
memset(&dnode, 0, sizeof(dnode));
dnode.sysctl_flags = SYSCTL_VERSION;
dnode.sysctl_num = name[namelen - 1];
/*
* we found it, now let's nuke it
*/
name[namelen - 1] = CTL_DESTROY;
sz = 0;
error = sysctl_destroy(&name[namelen - 1], 1, NULL, &sz,
node, sizeof(*node), &name[0], NULL,
pnode);
if (error == ENOTEMPTY) {
/*
* think of trying to delete "foo" when "foo.bar"
* (which someone else put there) is still in
* existence
*/
error = 0;
/*
* dunno who put the description there, but if this
* node can ever be removed, we need to make sure the
* string doesn't go out of context. that means we
* need to find the node that's still there (don't use
* sysctl_locate() because that follows aliasing).
*/
node = pnode->sysctl_child;
for (ni = 0; ni < pnode->sysctl_clen; ni++)
if (node[ni].sysctl_num == dnode.sysctl_num)
break;
node = (ni < pnode->sysctl_clen) ? &node[ni] : NULL;
/*
* if we found it, and this node has a description,
* and this node can be released, and it doesn't
* already own its own description...sigh. :)
*/
if (node != NULL && node->sysctl_desc != NULL &&
!(node->sysctl_flags & CTLFLAG_PERMANENT) &&
!(node->sysctl_flags & CTLFLAG_OWNDESC)) {
char *d;
sz = strlen(node->sysctl_desc) + 1;
d = malloc(sz, M_SYSCTLDATA, M_WAITOK);
if (d != NULL) {
/*
* discard const so that we can
* re-attach the description
*/
memcpy(d, node->sysctl_desc, sz);
onode = __UNCONST(node);
onode->sysctl_desc = d;
onode->sysctl_flags |= CTLFLAG_OWNDESC;
} else {
/*
* XXX drop the description? be
* afraid? don't care?
*/
}
}
}
sysctl_unlock();
return (error);
}
/*
* ********************************************************************
* Deletes an entire n-ary tree. Not recommended unless you know why
* you're doing it. Personally, I don't know why you'd even think
* about it.
* ********************************************************************
*/
void
sysctl_free(struct sysctlnode *rnode)
{
struct sysctlnode *node, *pnode;
rw_enter(&sysctl_treelock, RW_WRITER);
if (rnode == NULL)
rnode = &sysctl_root;
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_free: rnode %p wrong version\n", rnode);
rw_exit(&sysctl_treelock);
return;
}
pnode = rnode;
node = pnode->sysctl_child;
do {
while (node != NULL && pnode->sysctl_csize > 0) {
while (node <
&pnode->sysctl_child[pnode->sysctl_clen] &&
(SYSCTL_TYPE(node->sysctl_flags) !=
CTLTYPE_NODE ||
node->sysctl_csize == 0)) {
if (SYSCTL_FLAGS(node->sysctl_flags) &
CTLFLAG_OWNDATA) {
if (node->sysctl_data != NULL) {
free(node->sysctl_data,
M_SYSCTLDATA);
node->sysctl_data = NULL;
}
}
if (SYSCTL_FLAGS(node->sysctl_flags) &
CTLFLAG_OWNDESC) {
if (node->sysctl_desc != NULL) {
/*XXXUNCONST*/
free(__UNCONST(node->sysctl_desc),
M_SYSCTLDATA);
node->sysctl_desc = NULL;
}
}
node++;
}
if (node < &pnode->sysctl_child[pnode->sysctl_clen]) {
pnode = node;
node = node->sysctl_child;
} else
break;
}
if (pnode->sysctl_child != NULL)
free(pnode->sysctl_child, M_SYSCTLNODE);
pnode->sysctl_clen = 0;
pnode->sysctl_csize = 0;
pnode->sysctl_child = NULL;
node = pnode;
pnode = node->sysctl_parent;
} while (pnode != NULL && node != rnode);
rw_exit(&sysctl_treelock);
}
void
sysctl_log_print(const struct sysctllog *slog)
{
int i, len;
printf("root %p left %d size %d content", (const void *)slog->log_root,
slog->log_left, slog->log_size);
for (len = 0, i = slog->log_left; i < slog->log_size; i++) {
switch (len) {
case 0:
len = -1;
printf(" version %d", slog->log_num[i]);
break;
case -1:
len = -2;
printf(" type %d", slog->log_num[i]);
break;
case -2:
len = slog->log_num[i];
printf(" len %d:", slog->log_num[i]);
if (len <= 0)
len = -1;
break;
default:
len--;
printf(" %d", slog->log_num[i]);
break;
}
}
printf(" end\n");
}
int
sysctl_log_add(struct sysctllog **logp, const struct sysctlnode *node)
{
const int size0 = 16;
int name[CTL_MAXNAME], namelen, i;
const struct sysctlnode *pnode;
struct sysctllog *log;
if (node->sysctl_flags & CTLFLAG_PERMANENT)
return (0);
if (logp == NULL)
return (0);
if (*logp == NULL) {
log = malloc(sizeof(struct sysctllog),
M_SYSCTLDATA, M_WAITOK);
if (log == NULL) {
/* XXX print error message? */
return (-1);
}
log->log_num = malloc(size0 * sizeof(int),
M_SYSCTLDATA, M_WAITOK);
if (log->log_num == NULL) {
/* XXX print error message? */
free(log, M_SYSCTLDATA);
return (-1);
}
memset(log->log_num, 0, size0 * sizeof(int));
log->log_root = NULL;
log->log_size = size0;
log->log_left = size0;
*logp = log;
} else
log = *logp;
/*
* check that the root is proper. it's okay to record the
* address of the root of a tree. it's the only thing that's
* guaranteed not to shift around as nodes come and go.
*/
if (log->log_root == NULL)
log->log_root = sysctl_rootof(node);
else if (log->log_root != sysctl_rootof(node)) {
printf("sysctl: log %p root mismatch (%p)\n",
log->log_root, sysctl_rootof(node));
return (-1);
}
/*
* we will copy out name in reverse order
*/
for (pnode = node, namelen = 0;
pnode != NULL && !(pnode->sysctl_flags & CTLFLAG_ROOT);
pnode = pnode->sysctl_parent)
name[namelen++] = pnode->sysctl_num;
/*
* do we have space?
*/
if (log->log_left < (namelen + 3))
sysctl_log_realloc(log);
if (log->log_left < (namelen + 3))
return (-1);
/*
* stuff name in, then namelen, then node type, and finally,
* the version for non-node nodes.
*/
for (i = 0; i < namelen && i < CTL_MAXNAME; i++)
log->log_num[--log->log_left] = name[i];
log->log_num[--log->log_left] = namelen;
log->log_num[--log->log_left] = SYSCTL_TYPE(node->sysctl_flags);
if (log->log_num[log->log_left] != CTLTYPE_NODE)
log->log_num[--log->log_left] = node->sysctl_ver;
else
log->log_num[--log->log_left] = 0;
return (0);
}
void
sysctl_teardown(struct sysctllog **logp)
{
const struct sysctlnode *rnode;
struct sysctlnode node;
struct sysctllog *log;
uint namelen;
int *name, t, v, error, ni;
size_t sz;
if (logp == NULL || *logp == NULL)
return;
log = *logp;
rw_enter(&sysctl_treelock, RW_WRITER);
memset(&node, 0, sizeof(node));
while (log->log_left < log->log_size) {
KASSERT(log->log_left + 3 < log->log_size);
KASSERT(log->log_left + log->log_num[log->log_left + 2] <=
log->log_size);
v = log->log_num[log->log_left++];
t = log->log_num[log->log_left++];
namelen = log->log_num[log->log_left++];
name = &log->log_num[log->log_left];
node.sysctl_num = name[namelen - 1];
node.sysctl_flags = SYSCTL_VERSION|t;
node.sysctl_ver = v;
rnode = log->log_root;
error = sysctl_locate(NULL, &name[0], namelen, &rnode, &ni);
if (error == 0) {
name[namelen - 1] = CTL_DESTROY;
rnode = rnode->sysctl_parent;
sz = 0;
(void)sysctl_destroy(&name[namelen - 1], 1, NULL,
&sz, &node, sizeof(node),
&name[0], NULL, rnode);
}
log->log_left += namelen;
}
KASSERT(log->log_size == log->log_left);
free(log->log_num, M_SYSCTLDATA);
free(log, M_SYSCTLDATA);
*logp = NULL;
rw_exit(&sysctl_treelock);
}
/*
* ********************************************************************
* old_sysctl -- A routine to bridge old-style internal calls to the
* new infrastructure.
* ********************************************************************
*/
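/*
 * A minimal sketch of a legacy-style internal caller (the buffer and
 * the surrounding lwp `l' are hypothetical; this is not compiled):
 *
 *	int name[2] = { CTL_KERN, KERN_OSTYPE };
 *	char buf[32];
 *	size_t len = sizeof(buf);
 *	int error;
 *
 *	error = old_sysctl(name, 2, buf, &len, NULL, 0, l);
 *
 * On success, buf holds the kern.ostype string and len its length,
 * exactly as if the new-style dispatch had been used directly.
 */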
int
old_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen, struct lwp *l)
{
int error;
size_t oldlen = 0;
size_t savelen;
if (oldlenp) {
oldlen = *oldlenp;
}
savelen = oldlen;
sysctl_lock(newp != NULL);
error = sysctl_dispatch(name, namelen, oldp, &oldlen,
newp, newlen, name, l, NULL);
sysctl_unlock();
if (error == 0 && oldp != NULL && savelen < oldlen)
error = ENOMEM;
if (oldlenp) {
*oldlenp = oldlen;
}
return (error);
}
/*
* ********************************************************************
* Section 4: Generic helper routines
* ********************************************************************
* "helper" routines that can do more finely grained access control,
* construct structures from disparate information, create the
* appearance of more nodes and sub-trees, etc. for example, if
* CTL_PROC wanted a helper function, it could respond to a CTL_QUERY
* with a dynamically created list of nodes that represented the
* currently running processes at that instant.
* ********************************************************************
*/
/*
* first, a few generic helpers that provide:
*
* sysctl_needfunc() a readonly interface that emits a warning
* sysctl_notavail() returns EOPNOTSUPP (generic error)
* sysctl_null() an empty return buffer with no error
*/
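/*
 * A hedged example of how such a helper is usually attached: a
 * subsystem that has reserved a name but not written its handler yet
 * can point the node's function at sysctl_notavail, so queries still
 * work while everything else gets EOPNOTSUPP.  The "stub" node below
 * is a hypothetical placeholder.
 *
 *	sysctl_createv(clog, 0, NULL, NULL,
 *	    CTLFLAG_PERMANENT,
 *	    CTLTYPE_NODE, "stub", SYSCTL_DESCR("not yet implemented"),
 *	    sysctl_notavail, 0, NULL, 0,
 *	    CTL_KERN, CTL_CREATE, CTL_EOL);
 */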
int
sysctl_needfunc(SYSCTLFN_ARGS)
{
int error;
printf("!!SYSCTL_NEEDFUNC!!\n");
if (newp != NULL || namelen != 0)
return (EOPNOTSUPP);
error = 0;
if (oldp != NULL)
error = sysctl_copyout(l, rnode->sysctl_data, oldp,
MIN(rnode->sysctl_size, *oldlenp));
*oldlenp = rnode->sysctl_size;
return (error);
}
int
sysctl_notavail(SYSCTLFN_ARGS)
{
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
return (EOPNOTSUPP);
}
int
sysctl_null(SYSCTLFN_ARGS)
{
*oldlenp = 0;
return (0);
}
u_int
sysctl_map_flags(const u_int *map, u_int word)
{
u_int rv;
for (rv = 0; *map != 0; map += 2)
if ((word & map[0]) != 0)
rv |= map[1];
return rv;
}
/*
* ********************************************************************
* Section 5: The machinery that makes it all go
* ********************************************************************
* Memory "manglement" routines. Not much to this, eh?
* ********************************************************************
*/
static int
sysctl_alloc(struct sysctlnode *p, int x)
{
int i;
struct sysctlnode *n;
assert(p->sysctl_child == NULL);
if (x == 1)
n = malloc(sizeof(struct sysctlnode),
M_SYSCTLNODE, M_WAITOK);
else
n = malloc(SYSCTL_DEFSIZE * sizeof(struct sysctlnode),
M_SYSCTLNODE, M_WAITOK);
if (n == NULL)
return (ENOMEM);
if (x == 1) {
memset(n, 0, sizeof(struct sysctlnode));
p->sysctl_csize = 1;
} else {
memset(n, 0, SYSCTL_DEFSIZE * sizeof(struct sysctlnode));
p->sysctl_csize = SYSCTL_DEFSIZE;
}
p->sysctl_clen = 0;
for (i = 0; i < p->sysctl_csize; i++)
n[i].sysctl_parent = p;
p->sysctl_child = n;
return (0);
}
static int
sysctl_realloc(struct sysctlnode *p)
{
int i, j, olen;
struct sysctlnode *n;
assert(p->sysctl_csize == p->sysctl_clen);
/*
* how many do we have...how many should we make?
*/
olen = p->sysctl_clen;
n = malloc(2 * olen * sizeof(struct sysctlnode), M_SYSCTLNODE,
M_WAITOK);
if (n == NULL)
return (ENOMEM);
/*
* move old children over...initialize new children
*/
memcpy(n, p->sysctl_child, olen * sizeof(struct sysctlnode));
memset(&n[olen], 0, olen * sizeof(struct sysctlnode));
p->sysctl_csize = 2 * olen;
/*
* reattach moved (and new) children to parent; if a moved
* child node has children, reattach the parent pointers of
* grandchildren
*/
for (i = 0; i < p->sysctl_csize; i++) {
n[i].sysctl_parent = p;
if (n[i].sysctl_child != NULL) {
for (j = 0; j < n[i].sysctl_csize; j++)
n[i].sysctl_child[j].sysctl_parent = &n[i];
}
}
/*
* get out with the old and in with the new
*/
free(p->sysctl_child, M_SYSCTLNODE);
p->sysctl_child = n;
return (0);
}
static int
sysctl_log_realloc(struct sysctllog *log)
{
int *n, s, d;
s = log->log_size * 2;
d = log->log_size;
n = malloc(s * sizeof(int), M_SYSCTLDATA, M_WAITOK);
if (n == NULL)
return (-1);
memset(n, 0, s * sizeof(int));
memcpy(&n[d], log->log_num, d * sizeof(int));
free(log->log_num, M_SYSCTLDATA);
log->log_num = n;
if (d)
log->log_left += d;
else
log->log_left = s;
log->log_size = s;
return (0);
}
/*
* ********************************************************************
* Section 6: Conversion between API versions wrt the sysctlnode
* ********************************************************************
*/
static int
sysctl_cvt_in(struct lwp *l, int *vp, const void *i, size_t sz,
struct sysctlnode *node)
{
int error, flags;
if (i == NULL || sz < sizeof(flags))
return (EINVAL);
error = sysctl_copyin(l, i, &flags, sizeof(flags));
if (error)
return (error);
#if (SYSCTL_VERSION != SYSCTL_VERS_1)
#error sysctl_cvt_in: no support for SYSCTL_VERSION
#endif /* (SYSCTL_VERSION != SYSCTL_VERS_1) */
if (sz == sizeof(*node) &&
SYSCTL_VERS(flags) == SYSCTL_VERSION) {
error = sysctl_copyin(l, i, node, sizeof(*node));
if (error)
return (error);
*vp = SYSCTL_VERSION;
return (0);
}
return (EINVAL);
}
static int
sysctl_cvt_out(struct lwp *l, int v, const struct sysctlnode *i,
void *ovp, size_t left, size_t *szp)
{
size_t sz = sizeof(*i);
const void *src = i;
int error;
switch (v) {
case SYSCTL_VERS_0:
return (EINVAL);
#if (SYSCTL_VERSION != SYSCTL_VERS_1)
#error sysctl_cvt_out: no support for SYSCTL_VERSION
#endif /* (SYSCTL_VERSION != SYSCTL_VERS_1) */
case SYSCTL_VERSION:
/* nothing more to do here */
break;
}
if (ovp != NULL && left >= sz) {
error = sysctl_copyout(l, src, ovp, sz);
if (error)
return (error);
}
if (szp != NULL)
*szp = sz;
return (0);
}
static uint8_t address_key[32]; /* key used in address hashing */
static ONCE_DECL(random_inithook);
static int
random_address_init(void)
{
cprng_strong(kern_cprng, address_key, sizeof(address_key), 0);
return 0;
}
void
hash_value_ensure_initialized(void)
{
RUN_ONCE(&random_inithook, random_address_init);
}
void
hash_value(void *d, size_t ds, const void *s, size_t ss)
{
blake2s(d, ds, address_key, sizeof(address_key), s, ss);
}
/* $NetBSD: umap_vfsops.c,v 1.104 2022/11/04 11:20:40 hannken Exp $ */
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* the UCLA Ficus project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)null_vfsops.c 1.5 (Berkeley) 7/10/92
* @(#)umap_vfsops.c 8.8 (Berkeley) 5/14/95
*/
/*
* Umap Layer
* (See mount_umap(8) for a description of this layer.)
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: umap_vfsops.c,v 1.104 2022/11/04 11:20:40 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <miscfs/umapfs/umap.h>
#include <miscfs/genfs/layer_extern.h>
MODULE(MODULE_CLASS_VFS, umap, "layerfs");
VFS_PROTOS(umapfs);
/*
* Mount umap layer
*/
int
umapfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
struct pathbuf *pb;
struct nameidata nd;
struct umap_args *args = data;
struct vnode *lowerrootvp, *vp;
struct umap_mount *amp;
int error;
#ifdef UMAPFS_DIAGNOSTIC
int i;
#endif
fsid_t tfsid;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args) {
#ifdef UMAPFS_DIAGNOSTIC
printf("mount_umap: data len %d < args %d\n",
(int)*data_len, (int)(sizeof *args));
#endif
return EINVAL;
}
if (mp->mnt_flag & MNT_GETARGS) {
amp = MOUNTTOUMAPMOUNT(mp);
if (amp == NULL)
return EIO;
args->la.target = NULL;
args->nentries = amp->info_nentries;
args->gnentries = amp->info_gnentries;
*data_len = sizeof *args;
return 0;
}
/* only for root */
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
KAUTH_REQ_SYSTEM_MOUNT_UMAP, NULL, NULL, NULL);
if (error)
return error;
#ifdef UMAPFS_DIAGNOSTIC
printf("umapfs_mount(mp = %p)\n", mp);
#endif
/*
* Update is not supported
*/
if (mp->mnt_flag & MNT_UPDATE)
return EOPNOTSUPP;
/*
* Find lower node
*/
error = pathbuf_copyin(args->umap_target, &pb);
if (error) {
return error;
}
NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, pb);
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
return error;
}
/*
* Sanity check on lower vnode
*/
lowerrootvp = nd.ni_vp;
pathbuf_destroy(pb);
#ifdef UMAPFS_DIAGNOSTIC
printf("vp = %p, check for VDIR...\n", lowerrootvp);
#endif
if (lowerrootvp->v_type != VDIR) {
vput(lowerrootvp);
return (EINVAL);
}
#ifdef UMAPFS_DIAGNOSTIC
printf("mp = %p\n", mp);
#endif
amp = kmem_zalloc(sizeof(struct umap_mount), KM_SLEEP);
mp->mnt_data = amp;
/*
* Now copy in the number of entries and maps for umap mapping.
*/
if (args->nentries < 0 || args->nentries > MAPFILEENTRIES ||
args->gnentries < 0 || args->gnentries > GMAPFILEENTRIES) {
vput(lowerrootvp);
return (EINVAL);
}
amp->info_nentries = args->nentries;
amp->info_gnentries = args->gnentries;
error = copyin(args->mapdata, amp->info_mapdata,
2*sizeof(u_long)*args->nentries);
if (error) {
vput(lowerrootvp);
return (error);
}
#ifdef UMAPFS_DIAGNOSTIC
printf("umap_mount:nentries %d\n",args->nentries);
for (i = 0; i < args->nentries; i++)
printf(" %ld maps to %ld\n", amp->info_mapdata[i][0],
amp->info_mapdata[i][1]);
#endif
error = copyin(args->gmapdata, amp->info_gmapdata,
2*sizeof(u_long)*args->gnentries);
if (error) {
vput(lowerrootvp);
return (error);
}
#ifdef UMAPFS_DIAGNOSTIC
printf("umap_mount:gnentries %d\n",args->gnentries);
for (i = 0; i < args->gnentries; i++)
printf("\tgroup %ld maps to %ld\n",
amp->info_gmapdata[i][0],
amp->info_gmapdata[i][1]);
#endif
/*
* Make sure the mount point's sufficiently initialized
* that the node create call will work.
*/
tfsid.__fsid_val[0] = (int32_t)args->fsid;
tfsid.__fsid_val[1] = makefstype(MOUNT_UMAP);
if (tfsid.__fsid_val[0] == 0) {
log(LOG_WARNING, "umapfs: fsid given as 0, ignoring\n");
vfs_getnewfsid(mp);
} else if (vfs_getvfs(&tfsid)) {
log(LOG_WARNING, "umapfs: fsid %x already mounted\n",
tfsid.__fsid_val[0]);
vfs_getnewfsid(mp);
} else {
mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
mp->mnt_stat.f_fsidx.__fsid_val[1] = tfsid.__fsid_val[1];
mp->mnt_stat.f_fsid = tfsid.__fsid_val[0];
}
log(LOG_DEBUG, "umapfs: using fsid %x/%x\n",
mp->mnt_stat.f_fsidx.__fsid_val[0],
mp->mnt_stat.f_fsidx.__fsid_val[1]);
error = vfs_set_lowermount(mp, lowerrootvp->v_mount);
if (error) {
vput(lowerrootvp);
kmem_free(amp, sizeof(struct umap_mount));
return error;
}
amp->umapm_size = sizeof(struct umap_node);
amp->umapm_tag = VT_UMAP;
amp->umapm_bypass = umap_bypass;
amp->umapm_vnodeop_p = umap_vnodeop_p;
/*
* fix up umap node for root vnode.
*/
VOP_UNLOCK(lowerrootvp);
error = layer_node_create(mp, lowerrootvp, &vp);
/*
* Make sure the node alias worked
*/
if (error) {
vrele(lowerrootvp);
kmem_free(amp, sizeof(struct umap_mount));
return error;
}
/*
* Keep a held reference to the root vnode.
* It is vrele'd in umapfs_unmount.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_ROOT;
amp->umapm_rootvp = vp;
VOP_UNLOCK(vp);
error = set_statvfs_info(path, UIO_USERSPACE, args->umap_target,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
if (error)
return error;
if (mp->mnt_lower->mnt_flag & MNT_LOCAL)
mp->mnt_flag |= MNT_LOCAL;
#ifdef UMAPFS_DIAGNOSTIC
printf("umapfs_mount: lower %s, alias at %s\n",
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
#endif
return 0;
}
/*
* Free reference to umap layer
*/
int
umapfs_unmount(struct mount *mp, int mntflags)
{
struct umap_mount *amp = MOUNTTOUMAPMOUNT(mp);
struct vnode *rtvp = amp->umapm_rootvp;
int error;
int flags = 0;
#ifdef UMAPFS_DIAGNOSTIC
printf("umapfs_unmount(mp = %p)\n", mp);
#endif
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if (vrefcnt(rtvp) > 1 && (mntflags & MNT_FORCE) == 0)
return (EBUSY);
if ((error = vflush(mp, rtvp, flags)) != 0)
return (error);
#ifdef UMAPFS_DIAGNOSTIC
vprint("alias root of lower", rtvp);
#endif
/*
* Blow it away for future re-use
*/
vgone(rtvp);
/*
* Finally, throw away the umap_mount structure
*/
kmem_free(amp, sizeof(struct umap_mount));
mp->mnt_data = NULL;
return 0;
}
extern const struct vnodeopv_desc umapfs_vnodeop_opv_desc;
const struct vnodeopv_desc * const umapfs_vnodeopv_descs[] = {
&umapfs_vnodeop_opv_desc,
NULL,
};
struct vfsops umapfs_vfsops = {
.vfs_name = MOUNT_UMAP,
.vfs_min_mount_data = sizeof (struct umap_args),
.vfs_mount = umapfs_mount,
.vfs_start = layerfs_start,
.vfs_unmount = umapfs_unmount,
.vfs_root = layerfs_root,
.vfs_quotactl = layerfs_quotactl,
.vfs_statvfs = layerfs_statvfs,
.vfs_sync = layerfs_sync,
.vfs_loadvnode = layerfs_loadvnode,
.vfs_vget = layerfs_vget,
.vfs_fhtovp = layerfs_fhtovp,
.vfs_vptofh = layerfs_vptofh,
.vfs_init = layerfs_init,
.vfs_done = layerfs_done,
.vfs_snapshot = layerfs_snapshot,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = layerfs_suspendctl,
.vfs_renamelock_enter = layerfs_renamelock_enter,
.vfs_renamelock_exit = layerfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = umapfs_vnodeopv_descs
};
SYSCTL_SETUP(umapfs_sysctl_setup, "umapfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "umap",
SYSCTL_DESCR("UID/GID remapping file system"),
NULL, 0, NULL, 0,
CTL_VFS, 10, CTL_EOL);
/*
* XXX the "10" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "10" is the order as taken from sys/mount.h
*/
}
static int
umap_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&umapfs_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&umapfs_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/* $NetBSD: kern_malloc.c,v 1.158 2019/11/14 16:23:52 maxv Exp $ */
/*
* Copyright (c) 1987, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_malloc.c 8.4 (Berkeley) 5/20/95
*/
/*
* Copyright (c) 1996 Christopher G. Demetriou. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_malloc.c 8.4 (Berkeley) 5/20/95
*/
/*
* Wrapper interface for obsolete malloc(9).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_malloc.c,v 1.158 2019/11/14 16:23:52 maxv Exp $");
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/kmem.h>
#include <sys/asan.h>
#include <sys/msan.h>
/*
* Built-in malloc types. Note: ought to be removed.
*/
MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
MALLOC_DEFINE(M_DMAMAP, "DMA map", "bus_dma(9) structures");
MALLOC_DEFINE(M_FREE, "free", "should be on free list");
MALLOC_DEFINE(M_TEMP, "temp", "misc. temporary data buffers");
MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
MALLOC_DEFINE(M_FTABLE, "fragtbl", "fragment reassembly header");
MALLOC_DEFINE(M_UFSMNT, "UFS mount", "UFS mount structure");
MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
MALLOC_DEFINE(M_MRTABLE, "mrt", "multicast routing tables");
/*
* Header contains total size, including the header itself.
*/
struct malloc_header {
size_t mh_size;
#ifdef KASAN
size_t mh_rqsz;
#endif
} __aligned(ALIGNBYTES + 1);
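/*
 * Layout sketch (descriptive only): for a small request the header
 * sits immediately in front of the pointer handed back to the caller,
 * so an allocation of `size' bytes really consumes
 * sizeof(struct malloc_header) + size from kmem.  For requests of a
 * page or more, the allocation is padded by a full page and the
 * header is placed at the end of that leading page, so the returned
 * pointer stays page aligned:
 *
 *	small:	[ header | caller data ... ]
 *	large:	[ ... padding ... | header ][ caller data (page aligned) ]
 *
 * kern_free() recovers the header by stepping one header back from
 * the pointer it is given, in either case.
 */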
void *
kern_malloc(unsigned long reqsize, int flags)
{
const int kmflags = (flags & M_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
#ifdef KASAN
const size_t origsize = reqsize;
#endif
size_t size = reqsize;
size_t allocsize, hdroffset;
struct malloc_header *mh;
void *p;
kasan_add_redzone(&size);
if (size >= PAGE_SIZE) {
if (size > (ULONG_MAX-PAGE_SIZE))
allocsize = ULONG_MAX; /* this will fail later */
else
allocsize = PAGE_SIZE + size; /* for page alignment */
hdroffset = PAGE_SIZE - sizeof(struct malloc_header);
} else {
allocsize = sizeof(struct malloc_header) + size;
hdroffset = 0;
}
p = kmem_intr_alloc(allocsize, kmflags);
if (p == NULL)
return NULL;
kmsan_mark(p, allocsize, KMSAN_STATE_UNINIT);
kmsan_orig(p, allocsize, KMSAN_TYPE_MALLOC, __RET_ADDR);
if ((flags & M_ZERO) != 0) {
memset(p, 0, allocsize);
}
mh = (void *)((char *)p + hdroffset);
mh->mh_size = allocsize - hdroffset;
#ifdef KASAN
mh->mh_rqsz = origsize;
#endif
mh++;
kasan_mark(mh, origsize, size, KASAN_MALLOC_REDZONE);
return mh;
}
void
kern_free(void *addr)
{
struct malloc_header *mh;
mh = addr;
mh--;
kasan_mark(addr, mh->mh_size - sizeof(struct malloc_header),
mh->mh_size - sizeof(struct malloc_header), KASAN_MALLOC_REDZONE);
if (mh->mh_size >= PAGE_SIZE + sizeof(struct malloc_header)) {
kmsan_mark((char *)addr - PAGE_SIZE,
mh->mh_size + PAGE_SIZE - sizeof(struct malloc_header),
KMSAN_STATE_INITED);
kmem_intr_free((char *)addr - PAGE_SIZE,
mh->mh_size + PAGE_SIZE - sizeof(struct malloc_header));
} else {
kmsan_mark(mh, mh->mh_size, KMSAN_STATE_INITED);
kmem_intr_free(mh, mh->mh_size);
}
}
void *
kern_realloc(void *curaddr, unsigned long newsize, int flags)
{
struct malloc_header *mh;
unsigned long cursize;
void *newaddr;
/*
* realloc() with a NULL pointer is the same as malloc().
*/
if (curaddr == NULL)
return malloc(newsize, ksp, flags);
/*
* realloc() with zero size is the same as free().
*/
if (newsize == 0) {
free(curaddr, ksp);
return NULL;
}
if ((flags & M_NOWAIT) == 0) {
ASSERT_SLEEPABLE();
}
mh = curaddr;
mh--;
#ifdef KASAN
cursize = mh->mh_rqsz;
#else
cursize = mh->mh_size - sizeof(struct malloc_header);
#endif
/*
* If we already actually have as much as they want, we're done.
*/
if (newsize <= cursize)
return curaddr;
/*
* Can't satisfy the allocation with the existing block.
* Allocate a new one and copy the data.
*/
newaddr = malloc(newsize, ksp, flags);
if (__predict_false(newaddr == NULL)) {
/*
* malloc() failed, because flags included M_NOWAIT.
* Return NULL to indicate that failure. The old
* pointer is still valid.
*/
return NULL;
}
memcpy(newaddr, curaddr, cursize);
/*
* We were successful: free the old allocation and return
* the new one.
*/
free(curaddr, ksp);
return newaddr;
}
/* $NetBSD: subr_copy.c,v 1.19 2023/05/22 14:07:24 riastradh Exp $ */
/*-
* Copyright (c) 1997, 1998, 1999, 2002, 2007, 2008, 2019
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratory.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_subr.c 8.4 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_copy.c,v 1.19 2023/05/22 14:07:24 riastradh Exp $");
#define __UFETCHSTORE_PRIVATE
#define __UCAS_PRIVATE
#include <sys/param.h>
#include <sys/fcntl.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <uvm/uvm_extern.h>
void
uio_setup_sysspace(struct uio *uio)
{
uio->uio_vmspace = vmspace_kernel();
}
int
uiomove(void *buf, size_t n, struct uio *uio)
{
struct vmspace *vm = uio->uio_vmspace;
struct iovec *iov;
size_t cnt;
int error = 0;
char *cp = buf;
ASSERT_SLEEPABLE();
KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE);
while (n > 0 && uio->uio_resid) {
KASSERT(uio->uio_iovcnt > 0);
iov = uio->uio_iov;
cnt = iov->iov_len;
if (cnt == 0) {
KASSERT(uio->uio_iovcnt > 1);
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
if (cnt > n)
cnt = n;
if (!VMSPACE_IS_KERNEL_P(vm)) {
preempt_point();
}
if (uio->uio_rw == UIO_READ) {
error = copyout_vmspace(vm, cp, iov->iov_base,
cnt);
} else {
error = copyin_vmspace(vm, iov->iov_base, cp,
cnt);
}
if (error) {
break;
}
iov->iov_base = (char *)iov->iov_base + cnt;
iov->iov_len -= cnt;
uio->uio_resid -= cnt;
uio->uio_offset += cnt;
cp += cnt;
KDASSERT(cnt <= n);
n -= cnt;
}
return (error);
}
/*
* Wrapper for uiomove() that validates the arguments against a known-good
* kernel buffer.
*/
int
uiomove_frombuf(void *buf, size_t buflen, struct uio *uio)
{
size_t offset;
if (uio->uio_offset < 0 || /* uio->uio_resid < 0 || */
(offset = uio->uio_offset) != uio->uio_offset)
return (EINVAL);
if (offset >= buflen)
return (0);
return (uiomove((char *)buf + offset, buflen - offset, uio));
}
int
uiopeek(void *buf, size_t n, struct uio *uio)
{
struct vmspace *vm = uio->uio_vmspace;
struct iovec *iov;
size_t cnt;
int error = 0;
char *cp = buf;
size_t resid = uio->uio_resid;
int iovcnt = uio->uio_iovcnt;
char *base;
size_t len;
KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE);
if (n == 0 || resid == 0)
return 0;
iov = uio->uio_iov;
base = iov->iov_base;
len = iov->iov_len;
while (n > 0 && resid > 0) {
KASSERT(iovcnt > 0);
cnt = len;
if (cnt == 0) {
KASSERT(iovcnt > 1);
iov++;
iovcnt--;
base = iov->iov_base;
len = iov->iov_len;
continue;
}
if (cnt > n)
cnt = n;
if (!VMSPACE_IS_KERNEL_P(vm)) {
preempt_point();
}
if (uio->uio_rw == UIO_READ) {
error = copyout_vmspace(vm, cp, base, cnt);
} else {
error = copyin_vmspace(vm, base, cp, cnt);
}
if (error) {
break;
}
base += cnt;
len -= cnt;
resid -= cnt;
cp += cnt;
KDASSERT(cnt <= n);
n -= cnt;
}
return error;
}
void
uioskip(size_t n, struct uio *uio)
{
struct iovec *iov;
size_t cnt;
KASSERTMSG(n <= uio->uio_resid, "n=%zu resid=%zu", n, uio->uio_resid);
KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE);
while (n > 0 && uio->uio_resid) {
KASSERT(uio->uio_iovcnt > 0);
iov = uio->uio_iov;
cnt = iov->iov_len;
if (cnt == 0) {
KASSERT(uio->uio_iovcnt > 1);
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
if (cnt > n)
cnt = n;
iov->iov_base = (char *)iov->iov_base + cnt;
iov->iov_len -= cnt;
uio->uio_resid -= cnt;
uio->uio_offset += cnt;
KDASSERT(cnt <= n);
n -= cnt;
}
}
/*
* Give next character to user as result of read.
*/
int
ureadc(int c, struct uio *uio)
{
struct iovec *iov;
if (uio->uio_resid <= 0)
panic("ureadc: non-positive resid");
again:
if (uio->uio_iovcnt <= 0)
panic("ureadc: non-positive iovcnt");
iov = uio->uio_iov;
if (iov->iov_len <= 0) {
uio->uio_iovcnt--;
uio->uio_iov++;
goto again;
}
if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) {
int error;
if ((error = ustore_char(iov->iov_base, c)) != 0)
return (error);
} else {
*(char *)iov->iov_base = c;
}
iov->iov_base = (char *)iov->iov_base + 1;
iov->iov_len--;
uio->uio_resid--;
uio->uio_offset++;
return (0);
}
/*
* Like copyin(), but operates on an arbitrary vmspace.
*/
int
copyin_vmspace(struct vmspace *vm, const void *uaddr, void *kaddr, size_t len)
{
struct iovec iov;
struct uio uio;
int error;
if (len == 0)
return (0);
if (VMSPACE_IS_KERNEL_P(vm)) {
return kcopy(uaddr, kaddr, len);
}
if (__predict_true(vm == curproc->p_vmspace)) {
return copyin(uaddr, kaddr, len);
}
iov.iov_base = kaddr;
iov.iov_len = len;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)(uintptr_t)uaddr;
uio.uio_resid = len;
uio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&uio);
error = uvm_io(&vm->vm_map, &uio, 0);
return (error);
}
/*
* Like copyout(), but operates on an arbitrary vmspace.
*/
int
copyout_vmspace(struct vmspace *vm, const void *kaddr, void *uaddr, size_t len)
{
struct iovec iov;
struct uio uio;
int error;
if (len == 0)
return (0);
if (VMSPACE_IS_KERNEL_P(vm)) {
return kcopy(kaddr, uaddr, len);
}
if (__predict_true(vm == curproc->p_vmspace)) {
return copyout(kaddr, uaddr, len);
}
iov.iov_base = __UNCONST(kaddr); /* XXXUNCONST cast away const */
iov.iov_len = len;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)(uintptr_t)uaddr;
uio.uio_resid = len;
uio.uio_rw = UIO_WRITE;
UIO_SETUP_SYSSPACE(&uio);
error = uvm_io(&vm->vm_map, &uio, 0);
return (error);
}
/*
* Like copyin(), but operates on an arbitrary process.
*/
int
copyin_proc(struct proc *p, const void *uaddr, void *kaddr, size_t len)
{
struct vmspace *vm;
int error;
error = proc_vmspace_getref(p, &vm);
if (error) {
return error;
}
error = copyin_vmspace(vm, uaddr, kaddr, len);
uvmspace_free(vm);
return error;
}
/*
* Like copyout(), but operates on an arbitrary process.
*/
int
copyout_proc(struct proc *p, const void *kaddr, void *uaddr, size_t len)
{
struct vmspace *vm;
int error;
error = proc_vmspace_getref(p, &vm);
if (error) {
return error;
}
error = copyout_vmspace(vm, kaddr, uaddr, len);
uvmspace_free(vm);
return error;
}
/*
* Like copyin(), but operates on an arbitrary pid.
*/
int
copyin_pid(pid_t pid, const void *uaddr, void *kaddr, size_t len)
{
struct proc *p;
struct vmspace *vm;
int error;
mutex_enter(&proc_lock);
p = proc_find(pid);
if (p == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
mutex_enter(p->p_lock);
error = proc_vmspace_getref(p, &vm);
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
if (error == 0) {
error = copyin_vmspace(vm, uaddr, kaddr, len);
uvmspace_free(vm);
}
return error;
}
/*
* Like copyin(), except it operates on kernel addresses when the FKIOCTL
* flag is passed in `ioctlflags' from the ioctl call.
*/
int
ioctl_copyin(int ioctlflags, const void *src, void *dst, size_t len)
{
if (ioctlflags & FKIOCTL)
return kcopy(src, dst, len);
return copyin(src, dst, len);
}
/*
* Like copyout(), except it operates on kernel addresses when the FKIOCTL
* flag is passed in `ioctlflags' from the ioctl call.
*/
int
ioctl_copyout(int ioctlflags, const void *src, void *dst, size_t len)
{
if (ioctlflags & FKIOCTL)
return kcopy(src, dst, len);
return copyout(src, dst, len);
}
/*
* User-space CAS / fetch / store
*/
#ifdef __NO_STRICT_ALIGNMENT
#define CHECK_ALIGNMENT() __nothing
#else /* ! __NO_STRICT_ALIGNMENT */
static bool
ufetchstore_aligned(uintptr_t uaddr, size_t size)
{
return (uaddr & (size - 1)) == 0;
}
#define CHECK_ALIGNMENT() \
do { \
if (!ufetchstore_aligned((uintptr_t)uaddr, sizeof(*uaddr))) \
return EFAULT; \
} while (/*CONSTCOND*/0)
#endif /* __NO_STRICT_ALIGNMENT */
/*
* __HAVE_UCAS_FULL platforms provide _ucas_32() and _ucas_64() themselves.
* _RUMPKERNEL also provides its own _ucas_32() and _ucas_64().
*
* In all other cases, we provide generic implementations that work on
* all platforms.
*/
#if !defined(__HAVE_UCAS_FULL) && !defined(_RUMPKERNEL)
#if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/once.h>
#include <sys/mutex.h>
#include <sys/ipi.h>
static int ucas_critical_splcookie;
static volatile u_int ucas_critical_pausing_cpus;
static u_int ucas_critical_ipi;
static ONCE_DECL(ucas_critical_init_once)
static void
ucas_critical_cpu_gate(void *arg __unused)
{
int count = SPINLOCK_BACKOFF_MIN;
KASSERT(atomic_load_relaxed(&ucas_critical_pausing_cpus) > 0);
/*
* Notify ucas_critical_wait that we have stopped. Using
* store-release ensures all our memory operations up to the
* IPI happen before the ucas -- no buffered stores on our end
* can clobber it later on, for instance.
*
* Matches atomic_load_acquire in ucas_critical_wait -- turns
* the following atomic_dec_uint into a store-release.
*/
membar_release();
atomic_dec_uint(&ucas_critical_pausing_cpus);
/*
* Wait for ucas_critical_exit to reopen the gate and let us
* proceed. Using a load-acquire ensures the ucas happens
* before any of our memory operations when we return from the
* IPI and proceed -- we won't observe any stale cached value
* that the ucas overwrote, for instance.
*
* Matches atomic_store_release in ucas_critical_exit.
*/
while (atomic_load_acquire(&ucas_critical_pausing_cpus) != (u_int)-1) {
SPINLOCK_BACKOFF(count);
}
}
static int
ucas_critical_init(void)
{
ucas_critical_ipi = ipi_register(ucas_critical_cpu_gate, NULL);
return 0;
}
static void
ucas_critical_wait(void)
{
int count = SPINLOCK_BACKOFF_MIN;
/*
* Wait for all CPUs to stop at the gate. Using a load-acquire
* ensures all memory operations before they stop at the gate
* happen before the ucas -- no buffered stores in other CPUs
* can clobber it later on, for instance.
*
* Matches membar_release/atomic_dec_uint (store-release) in
* ucas_critical_cpu_gate.
*/
while (atomic_load_acquire(&ucas_critical_pausing_cpus) > 0) {
SPINLOCK_BACKOFF(count);
}
}
#endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */
static inline void
ucas_critical_enter(lwp_t * const l)
{
#if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)
if (ncpu > 1) {
RUN_ONCE(&ucas_critical_init_once, ucas_critical_init);
/*
* Acquire the mutex first, then go to splhigh() and
* broadcast the IPI to lock all of the other CPUs
* behind the gate.
*
* N.B. Going to splhigh() implicitly disables preemption,
* so there's no need to do it explicitly.
*/
mutex_enter(&cpu_lock);
ucas_critical_splcookie = splhigh();
ucas_critical_pausing_cpus = ncpu - 1;
ipi_trigger_broadcast(ucas_critical_ipi, true);
ucas_critical_wait();
return;
}
#endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */
KPREEMPT_DISABLE(l);
}
static inline void
ucas_critical_exit(lwp_t * const l)
{
#if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)
if (ncpu > 1) {
/*
* Open the gate and notify all CPUs in
* ucas_critical_cpu_gate that they can now proceed.
* Using a store-release ensures the ucas happens
* before any memory operations they issue after the
* IPI -- they won't observe any stale cache of the
* target word, for instance.
*
* Matches atomic_load_acquire in ucas_critical_cpu_gate.
*/
atomic_store_release(&ucas_critical_pausing_cpus, (u_int)-1);
splx(ucas_critical_splcookie);
mutex_exit(&cpu_lock);
return;
}
#endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */
KPREEMPT_ENABLE(l);
}
int
_ucas_32(volatile uint32_t *uaddr, uint32_t old, uint32_t new, uint32_t *ret)
{
lwp_t * const l = curlwp;
uint32_t *uva = ((void *)(uintptr_t)uaddr);
int error;
/*
* Wire the user address down to avoid taking a page fault during
* the critical section.
*/
error = uvm_vslock(l->l_proc->p_vmspace, uva, sizeof(*uaddr),
VM_PROT_READ | VM_PROT_WRITE);
if (error)
return error;
ucas_critical_enter(l);
error = _ufetch_32(uva, ret);
if (error == 0 && *ret == old) {
error = _ustore_32(uva, new);
}
ucas_critical_exit(l);
uvm_vsunlock(l->l_proc->p_vmspace, uva, sizeof(*uaddr));
return error;
}
#ifdef _LP64
int
_ucas_64(volatile uint64_t *uaddr, uint64_t old, uint64_t new, uint64_t *ret)
{
lwp_t * const l = curlwp;
uint64_t *uva = ((void *)(uintptr_t)uaddr);
int error;
/*
* Wire the user address down to avoid taking a page fault during
* the critical section.
*/
error = uvm_vslock(l->l_proc->p_vmspace, uva, sizeof(*uaddr),
VM_PROT_READ | VM_PROT_WRITE);
if (error)
return error;
ucas_critical_enter(l);
error = _ufetch_64(uva, ret);
if (error == 0 && *ret == old) {
error = _ustore_64(uva, new);
}
ucas_critical_exit(l);
uvm_vsunlock(l->l_proc->p_vmspace, uva, sizeof(*uaddr));
return error;
}
#endif /* _LP64 */
#endif /* ! __HAVE_UCAS_FULL && ! _RUMPKERNEL */
int
ucas_32(volatile uint32_t *uaddr, uint32_t old, uint32_t new, uint32_t *ret)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
#if (defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)) && \
!defined(_RUMPKERNEL)
if (ncpu > 1) {
return _ucas_32_mp(uaddr, old, new, ret);
}
#endif /* __HAVE_UCAS_MP && MULTIPROCESSOR */
return _ucas_32(uaddr, old, new, ret);
}
#ifdef _LP64
int
ucas_64(volatile uint64_t *uaddr, uint64_t old, uint64_t new, uint64_t *ret)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
#if (defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)) && \
!defined(_RUMPKERNEL)
if (ncpu > 1) {
return _ucas_64_mp(uaddr, old, new, ret);
}
#endif /* __HAVE_UCAS_MP && MULTIPROCESSOR */
return _ucas_64(uaddr, old, new, ret);
}
#endif /* _LP64 */
__strong_alias(ucas_int,ucas_32);
#ifdef _LP64
__strong_alias(ucas_ptr,ucas_64);
#else
__strong_alias(ucas_ptr,ucas_32);
#endif /* _LP64 */
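/*
* Illustrative sketch (not part of this file): a typical caller performs
* an atomic read-modify-write on a user word (here an increment) by
* retrying the compare-and-swap until it succeeds. `uaddr' is a
* hypothetical user-space pointer.
*
*	uint32_t cur, got;
*	int error;
*
*	error = ufetch_32(uaddr, &cur);
*	while (error == 0) {
*		error = ucas_32(uaddr, cur, cur + 1, &got);
*		if (error || got == cur)
*			break;		(swap done, or hard failure)
*		cur = got;		(lost the race; retry with new value)
*	}
*/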
int
ufetch_8(const uint8_t *uaddr, uint8_t *valp)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ufetch_8(uaddr, valp);
}
int
ufetch_16(const uint16_t *uaddr, uint16_t *valp)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ufetch_16(uaddr, valp);
}
int
ufetch_32(const uint32_t *uaddr, uint32_t *valp)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ufetch_32(uaddr, valp);
}
#ifdef _LP64
int
ufetch_64(const uint64_t *uaddr, uint64_t *valp)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ufetch_64(uaddr, valp);
}
#endif /* _LP64 */
__strong_alias(ufetch_char,ufetch_8);
__strong_alias(ufetch_short,ufetch_16);
__strong_alias(ufetch_int,ufetch_32);
#ifdef _LP64
__strong_alias(ufetch_long,ufetch_64);
__strong_alias(ufetch_ptr,ufetch_64);
#else
__strong_alias(ufetch_long,ufetch_32);
__strong_alias(ufetch_ptr,ufetch_32);
#endif /* _LP64 */
int
ustore_8(uint8_t *uaddr, uint8_t val)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ustore_8(uaddr, val);
}
int
ustore_16(uint16_t *uaddr, uint16_t val)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ustore_16(uaddr, val);
}
int
ustore_32(uint32_t *uaddr, uint32_t val)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ustore_32(uaddr, val);
}
#ifdef _LP64
int
ustore_64(uint64_t *uaddr, uint64_t val)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ustore_64(uaddr, val);
}
#endif /* _LP64 */
__strong_alias(ustore_char,ustore_8);
__strong_alias(ustore_short,ustore_16);
__strong_alias(ustore_int,ustore_32);
#ifdef _LP64
__strong_alias(ustore_long,ustore_64);
__strong_alias(ustore_ptr,ustore_64);
#else
__strong_alias(ustore_long,ustore_32);
__strong_alias(ustore_ptr,ustore_32);
#endif /* _LP64 */
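/*
* Illustrative sketch (not part of this file): fetching and updating a
* single user-space flag word from a sleepable syscall path. The
* pointer `uflagp' and the flag value are hypothetical. Note that a
* ufetch/ustore pair is not atomic; use the ucas_*() routines above
* when atomicity against other threads is required.
*
*	u_int flags;
*	int error;
*
*	error = ufetch_int(uflagp, &flags);
*	if (error == 0)
*		error = ustore_int(uflagp, flags | MY_FLAG);
*	if (error)
*		return error;	(EFAULT on bad address or alignment)
*/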
/* $NetBSD: kern_cfglock.c,v 1.1 2010/08/21 13:17:31 pgoyette Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_cfglock.c,v 1.1 2010/08/21 13:17:31 pgoyette Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/mutex.h>
#include <sys/lwp.h>
#include <sys/systm.h>
static kmutex_t kernconfig_mutex;
static lwp_t *kernconfig_lwp;
static int kernconfig_recurse;
/*
* Functions for manipulating the kernel configuration lock. This
* recursive lock should be used to protect all additions and removals
* of kernel functionality, such as device configuration and loading
* of modular kernel components.
*/
void
kernconfig_lock_init(void)
{
mutex_init(&kernconfig_mutex, MUTEX_DEFAULT, IPL_NONE);
kernconfig_lwp = NULL;
kernconfig_recurse = 0;
}
void
kernconfig_lock(void)
{
lwp_t *my_lwp;
/*
* It's OK to check this unlocked, since it could only be set to
* curlwp by the current thread itself, and not by an interrupt
* or any other LWP.
*/
KASSERT(!cpu_intr_p());
my_lwp = curlwp;
if (kernconfig_lwp == my_lwp) {
kernconfig_recurse++;
KASSERT(kernconfig_recurse > 1);
} else {
mutex_enter(&kernconfig_mutex);
kernconfig_lwp = my_lwp;
kernconfig_recurse = 1;
}
}
void
kernconfig_unlock(void)
{
KASSERT(kernconfig_is_held());
KASSERT(kernconfig_recurse != 0);
if (--kernconfig_recurse == 0) {
kernconfig_lwp = NULL;
mutex_exit(&kernconfig_mutex);
}
}
bool
kernconfig_is_held(void)
{
return mutex_owned(&kernconfig_mutex);
}
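/*
* Illustrative sketch (not part of this file): code that attaches or
* detaches kernel functionality brackets the operation with the
* recursive config lock; the attach function named here is hypothetical
* and may itself take the lock again without deadlocking.
*
*	kernconfig_lock();
*	KASSERT(kernconfig_is_held());
*	error = attach_some_component();
*	kernconfig_unlock();
*/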
/* $NetBSD: tsc.c,v 1.60 2024/02/19 20:10:09 mrg Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.60 2024/02/19 20:10:09 mrg Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/lwp.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <sys/lock.h>
#include <machine/cpu_counter.h>
#include <machine/cpuvar.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/cputypes.h>
#include "tsc.h"
#define TSC_SYNC_ROUNDS 1000
#define ABS(a) ((a) >= 0 ? (a) : -(a))
static u_int tsc_get_timecount(struct timecounter *);
static void tsc_delay(unsigned int);
static uint64_t tsc_dummy_cacheline __cacheline_aligned;
uint64_t tsc_freq __read_mostly; /* exported for sysctl */
static int64_t tsc_drift_max = 1000; /* max cycles */
static int64_t tsc_drift_observed;
uint64_t (*rdtsc)(void) = rdtsc_cpuid;
uint64_t (*cpu_counter)(void) = cpu_counter_cpuid;
uint32_t (*cpu_counter32)(void) = cpu_counter32_cpuid;
int tsc_user_enabled = 1;
static volatile int64_t tsc_sync_val;
static volatile struct cpu_info *tsc_sync_cpu;
static struct timecounter tsc_timecounter = {
.tc_get_timecount = tsc_get_timecount,
.tc_counter_mask = ~0U,
.tc_name = "TSC",
.tc_quality = 3000,
};
bool
tsc_is_invariant(void)
{
struct cpu_info *ci;
uint32_t descs[4];
uint32_t family;
bool invariant;
if (!cpu_hascounter())
return false;
ci = curcpu();
invariant = false;
if (cpu_vendor == CPUVENDOR_INTEL) {
/*
* From Intel(tm) 64 and IA-32 Architectures Software
* Developer's Manual Volume 3A: System Programming Guide,
* Part 1, 17.13 TIME_STAMP COUNTER, these are the processors
* where the TSC is known invariant:
*
* Pentium 4, Intel Xeon (family 0f, models 03 and higher)
* Core Solo and Core Duo processors (family 06, model 0e)
* Xeon 5100 series and Core 2 Duo (family 06, model 0f)
* Core 2 and Xeon (family 06, model 17)
* Atom (family 06, model 1c)
*
* We'll also assume that it's safe on the Pentium, and
* that it's safe on P-II and P-III Xeons due to the
* typical configuration of those systems.
*
*/
switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) {
case 0x05:
invariant = true;
break;
case 0x06:
invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e ||
CPUID_TO_MODEL(ci->ci_signature) == 0x0f ||
CPUID_TO_MODEL(ci->ci_signature) == 0x17 ||
CPUID_TO_MODEL(ci->ci_signature) == 0x1c;
break;
case 0x0f:
invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03;
break;
}
} else if (cpu_vendor == CPUVENDOR_AMD) {
/*
* TSC and Power Management Events on AMD Processors
* Nov 2, 2005 Rich Brunner, AMD Fellow
* http://lkml.org/lkml/2005/11/4/173
*
* See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power
* Management Features, AMD64 Architecture Programmer's
* Manual Volume 3: General-Purpose and System Instructions.
* The check is done below.
*/
/*
* AMD Errata 778: Processor Core Time Stamp Counters May
* Experience Drift
*
* This affects all family 15h and family 16h processors.
*/
switch (CPUID_TO_FAMILY(ci->ci_signature)) {
case 0x15:
case 0x16:
return false;
}
}
/*
* The best way to check whether the TSC counter is invariant or not
* is to check CPUID 80000007.
*/
family = CPUID_TO_BASEFAMILY(ci->ci_signature);
if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD))
&& ((family == 0x06) || (family == 0x0f))) {
x86_cpuid(0x80000000, descs);
if (descs[0] >= 0x80000007) {
x86_cpuid(0x80000007, descs);
invariant = (descs[3] & CPUID_APM_ITSC) != 0;
}
}
return invariant;
}
/* Setup function pointers for rdtsc() and timecounter(9). */
void
tsc_setfunc(struct cpu_info *ci)
{
bool use_lfence, use_mfence;
use_lfence = use_mfence = false;
/*
* XXX On AMD, we might be able to use lfence for some cases:
* a) if MSR_DE_CFG exists and bit 1 is set.
* b) family == 0x0f or 0x11. Those have no MSR_DE_CFG and
* lfence is always serializing.
*
* We don't use lfence here because test results showed mfence performed
* better than lfence even with MSR_DE_CFG set.
*/
if (cpu_vendor == CPUVENDOR_AMD)
use_mfence = true;
else if (cpu_vendor == CPUVENDOR_INTEL)
use_lfence = true;
/* LFENCE and MFENCE are applicable if SSE2 is set. */
if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0)
use_lfence = use_mfence = false;
#define TSC_SETFUNC(fence) \
do { \
rdtsc = rdtsc_##fence; \
cpu_counter = cpu_counter_##fence; \
cpu_counter32 = cpu_counter32_##fence; \
} while (/* CONSTCOND */ 0)
if (use_lfence)
TSC_SETFUNC(lfence);
else if (use_mfence)
TSC_SETFUNC(mfence);
else
TSC_SETFUNC(cpuid);
aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n",
use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid"));
}
/*
* Initialize timecounter(9) and DELAY() function of TSC.
*
* This function is called after all secondary processors were brought up
* and drift has been measured, and after any other potential delay funcs
* have been installed (e.g. lapic_delay()).
*/
void
tsc_tc_init(void)
{
struct cpu_info *ci;
bool invariant;
if (!cpu_hascounter())
return;
ci = curcpu();
tsc_freq = ci->ci_data.cpu_cc_freq;
invariant = tsc_is_invariant();
if (!invariant) {
aprint_debug("TSC not known invariant on this CPU\n");
tsc_timecounter.tc_quality = -100;
} else if (tsc_drift_observed > tsc_drift_max) {
aprint_error("ERROR: %lld cycle TSC drift observed\n",
(long long)tsc_drift_observed);
tsc_timecounter.tc_quality = -100;
invariant = false;
} else if (vm_guest == VM_GUEST_NO) {
delay_func = tsc_delay;
} else if (vm_guest == VM_GUEST_VIRTUALBOX) {
tsc_timecounter.tc_quality = -100;
}
if (tsc_freq != 0) {
tsc_timecounter.tc_frequency = tsc_freq;
tc_init(&tsc_timecounter);
}
}
/*
* Record drift (in clock cycles). Called during AP startup.
*/
void
tsc_sync_drift(int64_t drift)
{
if (drift < 0)
drift = -drift;
if (drift > tsc_drift_observed)
tsc_drift_observed = drift;
}
/*
* Called during startup of APs, by the boot processor. Interrupts
* are disabled on entry.
*/
static void __noinline
tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
{
uint64_t bptsc;
if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) {
panic("tsc_sync_bp: 1");
}
/* Prepare a cache miss for the other side. */
(void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0);
/* Flag our readiness. */
atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC);
/* Wait for other side then read our TSC. */
while ((ci->ci_flags & CPUF_SYNCTSC) != 0) {
__insn_barrier();
}
bptsc = rdtsc();
/* Wait for the results to come in. */
while (tsc_sync_cpu == ci) {
x86_pause();
}
if (tsc_sync_cpu != NULL) {
panic("tsc_sync_bp: 2");
}
*bptscp = bptsc;
*aptscp = tsc_sync_val;
}
void
tsc_sync_bp(struct cpu_info *ci)
{
int64_t bptsc, aptsc, val, diff;
if (!cpu_hascounter())
return;
val = INT64_MAX;
for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
tsc_read_bp(ci, &bptsc, &aptsc);
diff = bptsc - aptsc;
if (ABS(diff) < ABS(val)) {
val = diff;
}
}
ci->ci_data.cpu_cc_skew = val;
}
/*
* Called during startup of AP, by the AP itself. Interrupts are
* disabled on entry.
*/
static void __noinline
tsc_post_ap(struct cpu_info *ci)
{
uint64_t tsc;
/* Wait for go-ahead from primary. */
while ((ci->ci_flags & CPUF_SYNCTSC) == 0) {
__insn_barrier();
}
/* Instruct primary to read its counter. */
atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC);
/* Suffer a cache miss, then read TSC. */
__insn_barrier();
tsc = tsc_dummy_cacheline;
__insn_barrier();
tsc += rdtsc();
/* Post result. Ensure the whole value goes out atomically. */
(void)atomic_swap_64(&tsc_sync_val, tsc);
if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) {
panic("tsc_sync_ap");
}
}
void
tsc_sync_ap(struct cpu_info *ci)
{
if (!cpu_hascounter())
return;
for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
tsc_post_ap(ci);
}
}
static void
tsc_apply_cpu(void *arg1, void *arg2)
{
bool enable = arg1 != NULL;
if (enable) {
lcr4(rcr4() & ~CR4_TSD);
} else {
lcr4(rcr4() | CR4_TSD);
}
}
void
tsc_user_enable(void)
{
uint64_t xc;
xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL);
xc_wait(xc);
}
void
tsc_user_disable(void)
{
uint64_t xc;
xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL);
xc_wait(xc);
}
uint64_t
cpu_frequency(struct cpu_info *ci)
{
return ci->ci_data.cpu_cc_freq;
}
int
cpu_hascounter(void)
{
return cpu_feature[0] & CPUID_TSC;
}
static void
tsc_delay(unsigned int us)
{
uint64_t start, delta;
start = cpu_counter();
delta = (uint64_t)us * tsc_freq / 1000000;
while ((cpu_counter() - start) < delta) {
x86_pause();
}
}
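/*
* For example, with tsc_freq == 2000000000 (a 2 GHz TSC), a request to
* delay 10 microseconds spins until roughly 10 * 2000000000 / 1000000
* = 20000 counter ticks have elapsed.
*/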
static u_int
tsc_get_timecount(struct timecounter *tc)
{
#if defined(_LP64) && defined(DIAGNOSTIC) /* requires atomic 64-bit store */
static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED;
static int lastwarn;
uint64_t cur, prev;
lwp_t *l = curlwp;
int ticks;
/*
* Previous value must be read before the counter and stored to
* after, because this routine can be called from interrupt context
* and may run over the top of an existing invocation. Ordering is
* guaranteed by "volatile" on md_tsc.
*/
prev = l->l_md.md_tsc;
cur = cpu_counter();
if (__predict_false(cur < prev) && (cur >> 63) == (prev >> 63) &&
__cpu_simple_lock_try(&lock)) {
ticks = getticks();
if (ticks - lastwarn >= hz) {
printf(
"WARNING: %s TSC went backwards by %u - "
"change sysctl(7) kern.timecounter?\n",
cpu_name(curcpu()), (unsigned)(prev - cur));
lastwarn = ticks;
}
__cpu_simple_unlock(&lock);
}
l->l_md.md_tsc = cur;
return (uint32_t)cur;
#else
return cpu_counter32();
#endif
}
/*
* tsc has been reset; zero the cached tsc of every lwp in the system
* so we don't spuriously report that the tsc has gone backward.
* Caller must ensure all LWPs are quiescent (except the current one,
* obviously) and interrupts are blocked while we update this.
*/
void
tsc_tc_reset(void)
{
struct lwp *l;
LIST_FOREACH(l, &alllwp, l_list)
l->l_md.md_tsc = 0;
}
/* $NetBSD: coda_vnops.c,v 1.118 2022/03/27 16:24:58 christos Exp $ */
/*
*
* Coda: an Experimental Distributed File System
* Release 3.1
*
* Copyright (c) 1987-1998 Carnegie Mellon University
* All Rights Reserved
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation, and
* that credit is given to Carnegie Mellon University in all documents
* and publicity pertaining to direct or indirect use of this code or its
* derivatives.
*
* CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS,
* SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS
* FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON
* DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
* RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF
* ANY DERIVATIVE WORK.
*
* Carnegie Mellon encourages users of this software to return any
* improvements or extensions that they make, and to grant Carnegie
* Mellon the rights to redistribute these changes without encumbrance.
*
* @(#) coda/coda_vnops.c,v 1.1.1.1 1998/08/29 21:26:46 rvb Exp $
*/
/*
* Mach Operating System
* Copyright (c) 1990 Carnegie-Mellon University
* Copyright (c) 1989 Carnegie-Mellon University
* All rights reserved. The CMU software License Agreement specifies
* the terms and conditions for use and redistribution.
*/
/*
* This code was written for the Coda file system at Carnegie Mellon
* University. Contributors include David Steere, James Kistler, and
* M. Satyanarayanan.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: coda_vnops.c,v 1.118 2022/03/27 16:24:58 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/errno.h>
#include <sys/acct.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/namei.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/select.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/dirent.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#include <coda/coda.h>
#include <coda/cnode.h>
#include <coda/coda_vnops.h>
#include <coda/coda_venus.h>
#include <coda/coda_opstats.h>
#include <coda/coda_subr.h>
#include <coda/coda_namecache.h>
#include <coda/coda_pioctl.h>
/*
* These flags select various performance enhancements.
*/
int coda_attr_cache = 1; /* Set to cache attributes in the kernel */
int coda_symlink_cache = 1; /* Set to cache symbolic link information */
int coda_access_cache = 1; /* Set to handle some access checks directly */
/* structure to keep track of vfs calls */
struct coda_op_stats coda_vnodeopstats[CODA_VNODEOPS_SIZE];
#define MARK_ENTRY(op) (coda_vnodeopstats[op].entries++)
#define MARK_INT_SAT(op) (coda_vnodeopstats[op].sat_intrn++)
#define MARK_INT_FAIL(op) (coda_vnodeopstats[op].unsat_intrn++)
#define MARK_INT_GEN(op) (coda_vnodeopstats[op].gen_intrn++)
/* What we are delaying for in printf */
static int coda_lockdebug = 0;
#define ENTRY if(coda_vnop_print_entry) myprintf(("Entered %s\n",__func__))
/* Definition of the vnode operation vector */
const struct vnodeopv_entry_desc coda_vnodeop_entries[] = {
{ &vop_default_desc, coda_vop_error },
{ &vop_parsepath_desc, genfs_parsepath }, /* parsepath */
{ &vop_lookup_desc, coda_lookup }, /* lookup */
{ &vop_create_desc, coda_create }, /* create */
{ &vop_mknod_desc, coda_vop_error }, /* mknod */
{ &vop_open_desc, coda_open }, /* open */
{ &vop_close_desc, coda_close }, /* close */
{ &vop_access_desc, coda_access }, /* access */
{ &vop_accessx_desc, genfs_accessx }, /* access */
{ &vop_getattr_desc, coda_getattr }, /* getattr */
{ &vop_setattr_desc, coda_setattr }, /* setattr */
{ &vop_read_desc, coda_read }, /* read */
{ &vop_write_desc, coda_write }, /* write */
{ &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */
{ &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */
{ &vop_fcntl_desc, genfs_fcntl }, /* fcntl */
{ &vop_ioctl_desc, coda_ioctl }, /* ioctl */
{ &vop_mmap_desc, genfs_mmap }, /* mmap */
{ &vop_fsync_desc, coda_fsync }, /* fsync */
{ &vop_remove_desc, coda_remove }, /* remove */
{ &vop_link_desc, coda_link }, /* link */
{ &vop_rename_desc, coda_rename }, /* rename */
{ &vop_mkdir_desc, coda_mkdir }, /* mkdir */
{ &vop_rmdir_desc, coda_rmdir }, /* rmdir */
{ &vop_symlink_desc, coda_symlink }, /* symlink */
{ &vop_readdir_desc, coda_readdir }, /* readdir */
{ &vop_readlink_desc, coda_readlink }, /* readlink */
{ &vop_abortop_desc, coda_abortop }, /* abortop */
{ &vop_inactive_desc, coda_inactive }, /* inactive */
{ &vop_reclaim_desc, coda_reclaim }, /* reclaim */
{ &vop_lock_desc, coda_lock }, /* lock */
{ &vop_unlock_desc, coda_unlock }, /* unlock */
{ &vop_bmap_desc, coda_bmap }, /* bmap */
{ &vop_strategy_desc, coda_strategy }, /* strategy */
{ &vop_print_desc, coda_vop_error }, /* print */
{ &vop_islocked_desc, coda_islocked }, /* islocked */
{ &vop_pathconf_desc, coda_pathconf }, /* pathconf */
{ &vop_advlock_desc, coda_vop_nop }, /* advlock */
{ &vop_bwrite_desc, coda_vop_error }, /* bwrite */
{ &vop_seek_desc, genfs_seek }, /* seek */
{ &vop_poll_desc, genfs_poll }, /* poll */
{ &vop_getpages_desc, coda_getpages }, /* getpages */
{ &vop_putpages_desc, coda_putpages }, /* putpages */
{ NULL, NULL }
};
static void coda_print_vattr(struct vattr *);
int (**coda_vnodeop_p)(void *);
const struct vnodeopv_desc coda_vnodeop_opv_desc =
{ &coda_vnodeop_p, coda_vnodeop_entries };
/* Definitions of NetBSD vnodeop interfaces */
/*
* A generic error routine. Return EIO without looking at arguments.
*/
int
coda_vop_error(void *anon) {
struct vnodeop_desc **desc = (struct vnodeop_desc **)anon;
if (codadebug) {
myprintf(("%s: Vnode operation %s called (error).\n",
__func__, (*desc)->vdesc_name));
}
return EIO;
}
/* A generic do-nothing. */
int
coda_vop_nop(void *anon) {
struct vnodeop_desc **desc = (struct vnodeop_desc **)anon;
if (codadebug) {
myprintf(("Vnode operation %s called, but unsupported\n",
(*desc)->vdesc_name));
}
return (0);
}
int
coda_vnodeopstats_init(void)
{
int i;
for(i=0;i<CODA_VNODEOPS_SIZE;i++) {
coda_vnodeopstats[i].opcode = i;
coda_vnodeopstats[i].entries = 0;
coda_vnodeopstats[i].sat_intrn = 0;
coda_vnodeopstats[i].unsat_intrn = 0;
coda_vnodeopstats[i].gen_intrn = 0;
}
return 0;
}
/*
* XXX The entire relationship between VOP_OPEN and having a container
* file (via venus_open) needs to be reexamined. In particular, it's
* valid to open/mmap/close and then reference. Instead of doing
* VOP_OPEN when getpages needs a container, we should do the
* venus_open part, and record that the vnode has opened the container
* for getpages, and do the matching logical close on coda_inactive.
* Further, coda_rdwr needs a container file, and sometimes needs to
* do the equivalent of open (core dumps).
*/
/*
* coda_open calls Venus to return the device and inode of the
* container file, and then obtains a vnode for that file. The
* container vnode is stored in the coda vnode, and a reference is
* added for each open file.
*/
int
coda_open(void *v)
{
/*
* NetBSD can pass the O_EXCL flag in mode, even though the check
* has already happened. Venus defensively assumes that if open
* is passed O_EXCL, it must be a bug. We strip the flag here.
*/
/* true args */
struct vop_open_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
int flag = ap->a_mode & (~O_EXCL);
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
dev_t dev; /* container file device, inode, vnode */
ino_t inode;
vnode_t *container_vp;
MARK_ENTRY(CODA_OPEN_STATS);
KASSERT(VOP_ISLOCKED(vp));
/* Check for open of control file. */
if (IS_CTL_VP(vp)) {
/* if (WRITABLE(flag)) */
if (flag & (FWRITE | O_TRUNC | O_CREAT | O_EXCL)) {
MARK_INT_FAIL(CODA_OPEN_STATS);
return(EACCES);
}
MARK_INT_SAT(CODA_OPEN_STATS);
return(0);
}
error = venus_open(vtomi(vp), &cp->c_fid, flag, cred, curlwp, &dev, &inode);
if (error)
return (error);
if (!error) {
CODADEBUG(CODA_OPEN, myprintf((
"%s: dev 0x%llx inode %llu result %d\n", __func__,
(unsigned long long)dev, (unsigned long long)inode, error));)
}
/*
* Obtain locked and referenced container vnode from container
* device/inode.
*/
error = coda_grab_vnode(vp, dev, inode, &container_vp);
if (error)
return (error);
/* Save the vnode pointer for the container file. */
if (cp->c_ovp == NULL) {
cp->c_ovp = container_vp;
} else {
if (cp->c_ovp != container_vp)
/*
* Perhaps venus returned a different container, or
* something else went wrong.
*/
panic("%s: cp->c_ovp != container_vp", __func__);
}
cp->c_ocount++;
/* Flush the attribute cache if writing the file. */
if (flag & FWRITE) {
cp->c_owrite++;
cp->c_flags &= ~C_VATTR;
}
/*
* Save the <device, inode> pair for the container file to speed
* up subsequent reads while closed (mmap, program execution).
* This is perhaps safe because venus will invalidate the node
* before changing the container file mapping.
*/
cp->c_device = dev;
cp->c_inode = inode;
/* Open the container file. */
error = VOP_OPEN(container_vp, flag, cred);
/*
* Drop the lock on the container, after we have done VOP_OPEN
* (which requires a locked vnode).
*/
VOP_UNLOCK(container_vp);
return(error);
}
/*
* Close the cache file used for I/O and notify Venus.
*/
int
coda_close(void *v)
{
/* true args */
struct vop_close_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
int flag = ap->a_fflag;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
MARK_ENTRY(CODA_CLOSE_STATS);
/* Check for close of control file. */
if (IS_CTL_VP(vp)) {
MARK_INT_SAT(CODA_CLOSE_STATS);
return(0);
}
/*
* XXX The IS_UNMOUNTING part of this is very suspect.
*/
if (IS_UNMOUNTING(cp)) {
if (cp->c_ovp) {
#ifdef CODA_VERBOSE
printf("%s: destroying container %d, ufs vp %p of vp %p/cp %p\n",
__func__, vrefcnt(vp), cp->c_ovp, vp, cp);
#endif
#ifdef hmm
vgone(cp->c_ovp);
#else
vn_lock(cp->c_ovp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(cp->c_ovp, flag, cred); /* Do errors matter here? */
vput(cp->c_ovp);
#endif
} else {
#ifdef CODA_VERBOSE
printf("%s: NO container vp %p/cp %p\n", __func__, vp, cp);
#endif
}
return ENODEV;
}
/* Lock the container node, and VOP_CLOSE it. */
vn_lock(cp->c_ovp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(cp->c_ovp, flag, cred); /* Do errors matter here? */
/*
* Drop the lock we just obtained, and vrele the container vnode.
* Decrement reference counts, and clear container vnode pointer on
* last close.
*/
vput(cp->c_ovp);
if (flag & FWRITE)
--cp->c_owrite;
if (--cp->c_ocount == 0)
cp->c_ovp = NULL;
error = venus_close(vtomi(vp), &cp->c_fid, flag, cred, curlwp);
CODADEBUG(CODA_CLOSE, myprintf(("%s: result %d\n", __func__, error)); )
return(error);
}
int
coda_read(void *v)
{
struct vop_read_args *ap = v;
ENTRY;
return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_READ,
ap->a_ioflag, ap->a_cred, curlwp));
}
int
coda_write(void *v)
{
struct vop_write_args *ap = v;
ENTRY;
return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_WRITE,
ap->a_ioflag, ap->a_cred, curlwp));
}
int
coda_rdwr(vnode_t *vp, struct uio *uiop, enum uio_rw rw, int ioflag,
kauth_cred_t cred, struct lwp *l)
{
/* upcall decl */
/* NOTE: container file operation!!! */
/* locals */
struct cnode *cp = VTOC(vp);
vnode_t *cfvp = cp->c_ovp;
struct proc *p = l->l_proc;
int opened_internally = 0;
int error = 0;
MARK_ENTRY(CODA_RDWR_STATS);
CODADEBUG(CODA_RDWR, myprintf(("coda_rdwr(%d, %p, %lu, %lld)\n", rw,
uiop->uio_iov->iov_base, (unsigned long) uiop->uio_resid,
(long long) uiop->uio_offset)); )
/* Check for rdwr of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_RDWR_STATS);
return(EINVAL);
}
/* Redirect the request to UFS. */
/*
* If file is not already open this must be a page
* {read,write} request. Iget the cache file's inode
* pointer if we still have its <device, inode> pair.
* Otherwise, we must do an internal open to derive the
* pair.
* XXX Integrate this into a coherent strategy for container
* file acquisition.
*/
if (cfvp == NULL) {
/*
* If we're dumping core, do the internal open. Otherwise
* venus won't have the correct size of the core when
* it's completely written.
*/
if (cp->c_inode != 0 && !(p && (p->p_acflag & ACORE))) {
#ifdef CODA_VERBOSE
printf("%s: grabbing container vnode, losing reference\n",
__func__);
#endif
/* Get locked and refed vnode. */
error = coda_grab_vnode(vp, cp->c_device, cp->c_inode, &cfvp);
if (error) {
MARK_INT_FAIL(CODA_RDWR_STATS);
return(error);
}
/*
* Drop lock.
* XXX Where is reference released.
*/
VOP_UNLOCK(cfvp);
}
else {
#ifdef CODA_VERBOSE
printf("%s: internal VOP_OPEN\n", __func__);
#endif
opened_internally = 1;
MARK_INT_GEN(CODA_OPEN_STATS);
error = VOP_OPEN(vp, (rw == UIO_READ ? FREAD : FWRITE), cred);
#ifdef CODA_VERBOSE
printf("%s: Internally Opening %p\n", __func__, vp);
#endif
if (error) {
MARK_INT_FAIL(CODA_RDWR_STATS);
return(error);
}
cfvp = cp->c_ovp;
}
}
/* Have UFS handle the call. */
CODADEBUG(CODA_RDWR, myprintf(("%s: fid = %s, refcnt = %d\n", __func__,
coda_f2s(&cp->c_fid), vrefcnt(CTOV(cp)))); )
if (rw == UIO_READ) {
error = VOP_READ(cfvp, uiop, ioflag, cred);
} else {
error = VOP_WRITE(cfvp, uiop, ioflag, cred);
}
if (error)
MARK_INT_FAIL(CODA_RDWR_STATS);
else
MARK_INT_SAT(CODA_RDWR_STATS);
/* Do an internal close if necessary. */
if (opened_internally) {
MARK_INT_GEN(CODA_CLOSE_STATS);
(void)VOP_CLOSE(vp, (rw == UIO_READ ? FREAD : FWRITE), cred);
}
/* Invalidate cached attributes if writing. */
if (rw == UIO_WRITE)
cp->c_flags &= ~C_VATTR;
return(error);
}
int
coda_ioctl(void *v)
{
/* true args */
struct vop_ioctl_args *ap = v;
vnode_t *vp = ap->a_vp;
int com = ap->a_command;
void *data = ap->a_data;
int flag = ap->a_fflag;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
vnode_t *tvp;
struct PioctlData *iap = (struct PioctlData *)data;
namei_simple_flags_t sflags;
MARK_ENTRY(CODA_IOCTL_STATS);
CODADEBUG(CODA_IOCTL, myprintf(("in coda_ioctl on %s\n", iap->path));)
/* Don't check for operation on a dying object, for ctlvp it
shouldn't matter */
/* Must be control object to succeed. */
if (!IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_IOCTL_STATS);
CODADEBUG(CODA_IOCTL, myprintf(("%s error: vp != ctlvp", __func__));)
return (EOPNOTSUPP);
}
/* Look up the pathname. */
/* Should we use the name cache here? It would get it from
lookupname sooner or later anyway, right? */
sflags = iap->follow ? NSM_FOLLOW_NOEMULROOT : NSM_NOFOLLOW_NOEMULROOT;
error = namei_simple_user(iap->path, sflags, &tvp);
if (error) {
MARK_INT_FAIL(CODA_IOCTL_STATS);
CODADEBUG(CODA_IOCTL, myprintf(("%s error: lookup returns %d\n",
__func__, error));)
return(error);
}
/*
* Make sure this is a coda style cnode, but it may be a
* different vfsp
*/
/* XXX: this totally violates the comment about vtagtype in vnode.h */
if (tvp->v_tag != VT_CODA) {
vrele(tvp);
MARK_INT_FAIL(CODA_IOCTL_STATS);
CODADEBUG(CODA_IOCTL, myprintf(("%s error: %s not a coda object\n",
__func__, iap->path));)
return(EINVAL);
}
if (iap->vi.in_size > VC_MAXDATASIZE || iap->vi.out_size > VC_MAXDATASIZE) {
vrele(tvp);
return(EINVAL);
}
error = venus_ioctl(vtomi(tvp), &((VTOC(tvp))->c_fid), com, flag, data,
cred, curlwp);
if (error)
MARK_INT_FAIL(CODA_IOCTL_STATS);
else
CODADEBUG(CODA_IOCTL, myprintf(("Ioctl returns %d \n", error)); )
vrele(tvp);
return(error);
}
/*
* To reduce the cost of a user-level venus, we cache attributes in
* the kernel. Each cnode has storage allocated for an attribute. If
* c_vattr is valid, return a reference to it. Otherwise, get the
* attributes from venus and store them in the cnode. There is some
* question if this method is a security leak. But I think that in
* order to make this call, the user must have done a lookup and
* opened the file, and therefore should already have access.
*/
int
coda_getattr(void *v)
{
/* true args */
struct vop_getattr_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
struct vattr *vap = ap->a_vap;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
MARK_ENTRY(CODA_GETATTR_STATS);
/* Check for getattr of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_GETATTR_STATS);
return(ENOENT);
}
/* Check to see if the attributes have already been cached */
if (VALID_VATTR(cp)) {
CODADEBUG(CODA_GETATTR, { myprintf(("%s: attr cache hit: %s\n",
__func__, coda_f2s(&cp->c_fid)));})
CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR))
coda_print_vattr(&cp->c_vattr); )
*vap = cp->c_vattr;
MARK_INT_SAT(CODA_GETATTR_STATS);
return(0);
}
error = venus_getattr(vtomi(vp), &cp->c_fid, cred, curlwp, vap);
if (!error) {
CODADEBUG(CODA_GETATTR, myprintf(("%s miss %s: result %d\n",
__func__, coda_f2s(&cp->c_fid), error)); )
CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR))
coda_print_vattr(vap); )
/* If not open for write, store attributes in cnode */
if ((cp->c_owrite == 0) && (coda_attr_cache)) {
cp->c_vattr = *vap;
cp->c_flags |= C_VATTR;
}
}
return(error);
}
int
coda_setattr(void *v)
{
/* true args */
struct vop_setattr_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
struct vattr *vap = ap->a_vap;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
MARK_ENTRY(CODA_SETATTR_STATS);
/* Check for setattr of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_SETATTR_STATS);
return(ENOENT);
}
if (codadebug & CODADBGMSK(CODA_SETATTR)) {
coda_print_vattr(vap);
}
error = venus_setattr(vtomi(vp), &cp->c_fid, vap, cred, curlwp);
if (!error)
cp->c_flags &= ~C_VATTR;
CODADEBUG(CODA_SETATTR, myprintf(("setattr %d\n", error)); )
return(error);
}
int
coda_access(void *v)
{
/* true args */
struct vop_access_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
accmode_t accmode = ap->a_accmode;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
MARK_ENTRY(CODA_ACCESS_STATS);
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0);
/* Check for access of control object. Only read access is
allowed on it. */
if (IS_CTL_VP(vp)) {
/* bogus hack - all will be marked as successes */
MARK_INT_SAT(CODA_ACCESS_STATS);
return(((accmode & VREAD) && !(accmode & (VWRITE | VEXEC)))
? 0 : EACCES);
}
/*
* if the file is a directory, and we are checking exec (eg lookup)
* access, and the file is in the namecache, then the user must have
* lookup access to it.
*/
if (coda_access_cache) {
if ((vp->v_type == VDIR) && (accmode & VEXEC)) {
if (coda_nc_lookup(cp, ".", 1, cred)) {
MARK_INT_SAT(CODA_ACCESS_STATS);
return(0); /* it was in the cache */
}
}
}
error = venus_access(vtomi(vp), &cp->c_fid, accmode, cred, curlwp);
return(error);
}
/*
* CODA abort op, called after namei() when a CREATE/DELETE isn't actually
* done. If a buffer has been saved in anticipation of a coda_create or
* a coda_remove, delete it.
*/
/* ARGSUSED */
int
coda_abortop(void *v)
{
/* true args */
struct vop_abortop_args /* {
vnode_t *a_dvp;
struct componentname *a_cnp;
} */ *ap = v;
(void)ap;
/* upcall decl */
/* locals */
return (0);
}
int
coda_readlink(void *v)
{
/* true args */
struct vop_readlink_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
struct uio *uiop = ap->a_uio;
kauth_cred_t cred = ap->a_cred;
/* locals */
struct lwp *l = curlwp;
int error;
char *str;
int len;
MARK_ENTRY(CODA_READLINK_STATS);
/* Check for readlink of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_READLINK_STATS);
return(ENOENT);
}
if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { /* symlink was cached */
uiop->uio_rw = UIO_READ;
error = uiomove(cp->c_symlink, (int)cp->c_symlen, uiop);
if (error)
MARK_INT_FAIL(CODA_READLINK_STATS);
else
MARK_INT_SAT(CODA_READLINK_STATS);
return(error);
}
error = venus_readlink(vtomi(vp), &cp->c_fid, cred, l, &str, &len);
if (!error) {
uiop->uio_rw = UIO_READ;
error = uiomove(str, len, uiop);
if (coda_symlink_cache) {
cp->c_symlink = str;
cp->c_symlen = len;
cp->c_flags |= C_SYMLINK;
} else
CODA_FREE(str, len);
}
CODADEBUG(CODA_READLINK, myprintf(("in readlink result %d\n",error));)
return(error);
}
int
coda_fsync(void *v)
{
/* true args */
struct vop_fsync_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
kauth_cred_t cred = ap->a_cred;
/* locals */
vnode_t *convp = cp->c_ovp;
int error;
MARK_ENTRY(CODA_FSYNC_STATS);
/* Check for fsync on an unmounting object */
/* The NetBSD kernel, in its infinite wisdom, can try to fsync
* after an unmount has been initiated. This is a Bad Thing,
* which we have to avoid. Not a legitimate failure for stats.
*/
if (IS_UNMOUNTING(cp)) {
return(ENODEV);
}
/* Check for fsync of control object or uninitialized cnode. */
if (IS_CTL_VP(vp) || vp->v_type == VNON) {
MARK_INT_SAT(CODA_FSYNC_STATS);
return(0);
}
if (convp)
VOP_FSYNC(convp, cred, MNT_WAIT, 0, 0);
/*
* We can expect fsync on any vnode at all if venus is purging it.
* Venus can't very well answer the fsync request, now can it?
* Hopefully, it won't have to, because hopefully, venus preserves
* the (possibly untrue) invariant that it never purges an open
* vnode. Hopefully.
*/
if (cp->c_flags & C_PURGING) {
return(0);
}
error = venus_fsync(vtomi(vp), &cp->c_fid, cred, curlwp);
CODADEBUG(CODA_FSYNC, myprintf(("in fsync result %d\n",error)); )
return(error);
}
/*
* vp is locked on entry, and we must unlock it.
* XXX This routine is suspect and probably needs rewriting.
*/
int
coda_inactive(void *v)
{
/* true args */
struct vop_inactive_v2_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
kauth_cred_t cred __unused = NULL;
/* We don't need to send inactive to venus - DCS */
MARK_ENTRY(CODA_INACTIVE_STATS);
if (IS_CTL_VP(vp)) {
MARK_INT_SAT(CODA_INACTIVE_STATS);
return 0;
}
CODADEBUG(CODA_INACTIVE, myprintf(("in inactive, %s, vfsp %p\n",
coda_f2s(&cp->c_fid), vp->v_mount));)
if (vp->v_mount->mnt_data == NULL) {
myprintf(("Help! vfsp->vfs_data was NULL, but vnode %p wasn't dying\n", vp));
panic("badness in coda_inactive");
}
#ifdef CODA_VERBOSE
/* Sanity checks that perhaps should be panic. */
if (vrefcnt(vp) > 1)
printf("%s: %p usecount %d\n", __func__, vp, vrefcnt(vp));
if (cp->c_ovp != NULL)
printf("%s: %p ovp != NULL\n", __func__, vp);
#endif
/* XXX Do we need to VOP_CLOSE container vnodes? */
if (!IS_UNMOUNTING(cp))
*ap->a_recycle = true;
MARK_INT_SAT(CODA_INACTIVE_STATS);
return(0);
}
/*
* Coda does not use the normal namecache, but a private version.
* Consider how to use the standard facility instead.
*/
int
coda_lookup(void *v)
{
/* true args */
struct vop_lookup_v2_args *ap = v;
/* (locked) vnode of dir in which to do lookup */
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
/* output variable for result */
vnode_t **vpp = ap->a_vpp;
/* name to lookup */
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
struct cnode *cp;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
CodaFid VFid;
int vtype;
int error = 0;
MARK_ENTRY(CODA_LOOKUP_STATS);
CODADEBUG(CODA_LOOKUP, myprintf(("%s: %s in %s\n", __func__,
nm, coda_f2s(&dcp->c_fid)));)
/*
* XXX componentname flags in MODMASK are not handled at all
*/
/*
* The overall strategy is to switch on the lookup type and get a
* result vnode that is vref'd but not locked.
*/
/* Check for lookup of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
*vpp = coda_ctlvp;
vref(*vpp);
MARK_INT_SAT(CODA_LOOKUP_STATS);
goto exit;
}
/* Avoid trying to hand venus an unreasonably long name. */
if (len+1 > CODA_MAXNAMLEN) {
MARK_INT_FAIL(CODA_LOOKUP_STATS);
CODADEBUG(CODA_LOOKUP, myprintf(("%s: name too long: %s (%s)\n",
__func__, coda_f2s(&dcp->c_fid), nm));)
*vpp = (vnode_t *)0;
error = EINVAL;
goto exit;
}
/*
* Try to resolve the lookup in the minicache. If that fails, ask
* venus to do the lookup. XXX The interaction between vnode
* locking and any locking that coda does is not clear.
*/
cp = coda_nc_lookup(dcp, nm, len, cred);
if (cp) {
*vpp = CTOV(cp);
vref(*vpp);
CODADEBUG(CODA_LOOKUP,
myprintf(("lookup result %d vpp %p\n",error,*vpp));)
} else {
/* The name wasn't cached, so ask Venus. */
error = venus_lookup(vtomi(dvp), &dcp->c_fid, nm, len, cred, l, &VFid,
&vtype);
if (error) {
MARK_INT_FAIL(CODA_LOOKUP_STATS);
CODADEBUG(CODA_LOOKUP, myprintf(("%s: lookup error on %s (%s)%d\n",
__func__, coda_f2s(&dcp->c_fid), nm, error));)
*vpp = (vnode_t *)0;
} else {
MARK_INT_SAT(CODA_LOOKUP_STATS);
CODADEBUG(CODA_LOOKUP, myprintf(("%s: %s type %o result %d\n",
__func__, coda_f2s(&VFid), vtype, error)); )
cp = make_coda_node(&VFid, dvp->v_mount, vtype);
*vpp = CTOV(cp);
/* vpp is now vrefed. */
/*
* Unless this vnode is marked CODA_NOCACHE, enter it into
* the coda name cache to avoid a future venus round-trip.
* XXX Interaction with componentname NOCACHE is unclear.
*/
if (!(vtype & CODA_NOCACHE))
coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
}
}
exit:
/*
* If we are creating, and this was the last name to be looked up,
* and the error was ENOENT, then make the leaf NULL and return
* success.
* XXX Check against new lookup rules.
*/
if (((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME))
&& (cnp->cn_flags & ISLASTCN)
&& (error == ENOENT))
{
error = EJUSTRETURN;
*ap->a_vpp = NULL;
}
return(error);
}
/*ARGSUSED*/
int
coda_create(void *v)
{
/* true args */
struct vop_create_v3_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
struct vattr *va = ap->a_vap;
int exclusive = 1;
int mode = ap->a_vap->va_mode;
vnode_t **vpp = ap->a_vpp;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
struct cnode *cp;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
CodaFid VFid;
struct vattr attr;
MARK_ENTRY(CODA_CREATE_STATS);
/* All creates are exclusive XXX */
/* I'm assuming the 'mode' argument is the file mode bits XXX */
/* Check for create of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
*vpp = (vnode_t *)0;
MARK_INT_FAIL(CODA_CREATE_STATS);
return(EACCES);
}
error = venus_create(vtomi(dvp), &dcp->c_fid, nm, len, exclusive, mode, va, cred, l, &VFid, &attr);
if (!error) {
/*
* XXX Violation of venus/kernel invariants is a difficult case,
* but venus should not be able to cause a panic.
*/
/* If this is an exclusive create, panic if the file already exists. */
/* Venus should have detected the file and reported EEXIST. */
if ((exclusive == 1) &&
(coda_find(&VFid) != NULL))
panic("cnode existed for newly created file!");
cp = make_coda_node(&VFid, dvp->v_mount, attr.va_type);
*vpp = CTOV(cp);
/* XXX vnodeops doesn't say this argument can be changed. */
/* Update va to reflect the new attributes. */
(*va) = attr;
/* Update the attribute cache and mark it as valid */
if (coda_attr_cache) {
VTOC(*vpp)->c_vattr = attr;
VTOC(*vpp)->c_flags |= C_VATTR;
}
/* Invalidate parent's attr cache (modification time has changed). */
VTOC(dvp)->c_flags &= ~C_VATTR;
/* enter the new vnode in the Name Cache */
coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
CODADEBUG(CODA_CREATE, myprintf(("%s: %s, result %d\n", __func__,
coda_f2s(&VFid), error)); )
} else {
*vpp = (vnode_t *)0;
CODADEBUG(CODA_CREATE, myprintf(("%s: create error %d\n", __func__,
error));)
}
if (!error) {
#ifdef CODA_VERBOSE
if ((cnp->cn_flags & LOCKLEAF) == 0)
/* This should not happen; flags are for lookup only. */
printf("%s: LOCKLEAF not set!\n", __func__);
#endif
}
return(error);
}
int
coda_remove(void *v)
{
/* true args */
struct vop_remove_v3_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *cp = VTOC(dvp);
vnode_t *vp = ap->a_vp;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
struct cnode *tp;
MARK_ENTRY(CODA_REMOVE_STATS);
CODADEBUG(CODA_REMOVE, myprintf(("%s: %s in %s\n", __func__,
nm, coda_f2s(&cp->c_fid)));)
/* Remove the file's entry from the CODA Name Cache */
/* We're being conservative here, it might be that this person
* doesn't really have sufficient access to delete the file
* but we feel zapping the entry won't really hurt anyone -- dcs
*/
/* I'm gonna go out on a limb here. If a file and a hardlink to it
* exist, and one is removed, the link count on the other will be
* off by 1. We could either invalidate the attrs if cached, or
* fix them. I'll try to fix them. DCS 11/8/94
*/
tp = coda_nc_lookup(VTOC(dvp), nm, len, cred);
if (tp) {
if (VALID_VATTR(tp)) { /* If attrs are cached */
if (tp->c_vattr.va_nlink > 1) { /* If it's a hard link */
tp->c_vattr.va_nlink--;
}
}
coda_nc_zapfile(VTOC(dvp), nm, len);
/* No need to flush it if it doesn't exist! */
}
/* Invalidate the parent's attr cache, the modification time has changed */
VTOC(dvp)->c_flags &= ~C_VATTR;
/* Check for remove of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
MARK_INT_FAIL(CODA_REMOVE_STATS);
return(ENOENT);
}
error = venus_remove(vtomi(dvp), &cp->c_fid, nm, len, cred, l);
CODADEBUG(CODA_REMOVE, myprintf(("in remove result %d\n",error)); )
/*
* Unlock and release child (avoiding double if ".").
*/
if (dvp == vp) {
vrele(vp);
} else {
vput(vp);
}
return(error);
}
/*
* dvp is the directory where the link is to go, and is locked.
* vp is the object to be linked to, and is unlocked.
* At exit, we must unlock dvp, and vput dvp.
*/
int
coda_link(void *v)
{
/* true args */
struct vop_link_v2_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
MARK_ENTRY(CODA_LINK_STATS);
if (codadebug & CODADBGMSK(CODA_LINK)) {
myprintf(("%s: vp fid: %s\n", __func__, coda_f2s(&cp->c_fid)));
myprintf(("%s: dvp fid: %s\n", __func__, coda_f2s(&dcp->c_fid)));
}
/* Check for link to/from control object. */
if (IS_CTL_NAME(dvp, nm, len) || IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_LINK_STATS);
return(EACCES);
}
/* If linking . to a name, error out earlier. */
if (vp == dvp) {
#ifdef CODA_VERBOSE
printf("%s coda_link vp==dvp\n", __func__);
#endif
error = EISDIR;
goto exit;
}
/* XXX Why does venus_link need the vnode to be locked? */
if ((error = vn_lock(vp, LK_EXCLUSIVE)) != 0) {
#ifdef CODA_VERBOSE
printf("%s: couldn't lock vnode %p\n", __func__, vp);
#endif
error = EFAULT; /* XXX better value */
goto exit;
}
error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_ADD_LINK, vp,
dvp, 0);
if (error) {
/* Drop the lock taken above before bailing out. */
VOP_UNLOCK(vp);
goto exit;
}
error = venus_link(vtomi(vp), &cp->c_fid, &dcp->c_fid, nm, len, cred, l);
VOP_UNLOCK(vp);
/* Invalidate parent's attr cache (the modification time has changed). */
VTOC(dvp)->c_flags &= ~C_VATTR;
/* Invalidate child's attr cache (XXX why). */
VTOC(vp)->c_flags &= ~C_VATTR;
CODADEBUG(CODA_LINK, myprintf(("in link result %d\n",error)); )
exit:
return(error);
}
int
coda_rename(void *v)
{
/* true args */
struct vop_rename_args *ap = v;
vnode_t *odvp = ap->a_fdvp;
struct cnode *odcp = VTOC(odvp);
struct componentname *fcnp = ap->a_fcnp;
vnode_t *ndvp = ap->a_tdvp;
struct cnode *ndcp = VTOC(ndvp);
struct componentname *tcnp = ap->a_tcnp;
kauth_cred_t cred = fcnp->cn_cred;
struct lwp *l = curlwp;
/* true args */
int error;
const char *fnm = fcnp->cn_nameptr;
int flen = fcnp->cn_namelen;
const char *tnm = tcnp->cn_nameptr;
int tlen = tcnp->cn_namelen;
MARK_ENTRY(CODA_RENAME_STATS);
/* Hmmm. The vnodes are already looked up. Perhaps they are locked?
This could be Bad. XXX */
#ifdef OLD_DIAGNOSTIC
if ((fcnp->cn_cred != tcnp->cn_cred)
|| (fcnp->cn_lwp != tcnp->cn_lwp))
{
panic("%s: component names don't agree", __func__);
}
#endif
/* Check for rename involving control object. */
if (IS_CTL_NAME(odvp, fnm, flen) || IS_CTL_NAME(ndvp, tnm, tlen)) {
MARK_INT_FAIL(CODA_RENAME_STATS);
return(EACCES);
}
/* Problem with moving directories -- need to flush entry for .. */
if (odvp != ndvp) {
struct cnode *ovcp = coda_nc_lookup(VTOC(odvp), fnm, flen, cred);
if (ovcp) {
vnode_t *ovp = CTOV(ovcp);
if ((ovp) &&
(ovp->v_type == VDIR)) /* If it's a directory */
coda_nc_zapfile(VTOC(ovp),"..", 2);
}
}
/* Remove the entries for both source and target files */
coda_nc_zapfile(VTOC(odvp), fnm, flen);
coda_nc_zapfile(VTOC(ndvp), tnm, tlen);
/* Invalidate the parent's attr cache, the modification time has changed */
VTOC(odvp)->c_flags &= ~C_VATTR;
VTOC(ndvp)->c_flags &= ~C_VATTR;
if (flen+1 > CODA_MAXNAMLEN) {
MARK_INT_FAIL(CODA_RENAME_STATS);
error = EINVAL;
goto exit;
}
if (tlen+1 > CODA_MAXNAMLEN) {
MARK_INT_FAIL(CODA_RENAME_STATS);
error = EINVAL;
goto exit;
}
error = venus_rename(vtomi(odvp), &odcp->c_fid, &ndcp->c_fid, fnm, flen, tnm, tlen, cred, l);
exit:
CODADEBUG(CODA_RENAME, myprintf(("in rename result %d\n",error));)
/* XXX - do we need to call cache_purge on the moved vnode? */
cache_purge(ap->a_fvp);
/* It seems to be incumbent on us to drop locks on all four vnodes */
/* From-vnodes are not locked, only ref'd. To-vnodes are locked. */
vrele(ap->a_fvp);
vrele(odvp);
if (ap->a_tvp) {
if (ap->a_tvp == ndvp) {
vrele(ap->a_tvp);
} else {
vput(ap->a_tvp);
}
}
vput(ndvp);
return(error);
}
int
coda_mkdir(void *v)
{
/* true args */
struct vop_mkdir_v3_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
struct componentname *cnp = ap->a_cnp;
struct vattr *va = ap->a_vap;
vnode_t **vpp = ap->a_vpp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
struct cnode *cp;
CodaFid VFid;
struct vattr ova;
MARK_ENTRY(CODA_MKDIR_STATS);
/* Check for mkdir of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
*vpp = (vnode_t *)0;
MARK_INT_FAIL(CODA_MKDIR_STATS);
return(EACCES);
}
if (len+1 > CODA_MAXNAMLEN) {
*vpp = (vnode_t *)0;
MARK_INT_FAIL(CODA_MKDIR_STATS);
return(EACCES);
}
error = venus_mkdir(vtomi(dvp), &dcp->c_fid, nm, len, va, cred, l, &VFid, &ova);
if (!error) {
if (coda_find(&VFid) != NULL)
panic("cnode existed for newly created directory!");
cp = make_coda_node(&VFid, dvp->v_mount, va->va_type);
*vpp = CTOV(cp);
/* enter the new vnode in the Name Cache */
coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
/* as a side effect, enter "." and ".." for the directory */
coda_nc_enter(VTOC(*vpp), ".", 1, cred, VTOC(*vpp));
coda_nc_enter(VTOC(*vpp), "..", 2, cred, VTOC(dvp));
if (coda_attr_cache) {
VTOC(*vpp)->c_vattr = ova; /* update the attr cache */
VTOC(*vpp)->c_flags |= C_VATTR; /* Valid attributes in cnode */
}
/* Invalidate the parent's attr cache, the modification time has changed */
VTOC(dvp)->c_flags &= ~C_VATTR;
CODADEBUG( CODA_MKDIR, myprintf(("%s: %s result %d\n", __func__,
coda_f2s(&VFid), error)); )
} else {
*vpp = (vnode_t *)0;
CODADEBUG(CODA_MKDIR, myprintf(("%s error %d\n", __func__, error));)
}
return(error);
}
int
coda_rmdir(void *v)
{
/* true args */
struct vop_rmdir_v2_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
vnode_t *vp = ap->a_vp;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* true args */
int error;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
struct cnode *cp;
MARK_ENTRY(CODA_RMDIR_STATS);
/* Check for rmdir of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
MARK_INT_FAIL(CODA_RMDIR_STATS);
return(ENOENT);
}
/* Can't remove . in self. */
if (dvp == vp) {
#ifdef CODA_VERBOSE
printf("%s: dvp == vp\n", __func__);
#endif
error = EINVAL;
goto exit;
}
/*
* The caller may not have adequate permissions, and the venus
* operation may fail, but it doesn't hurt from a correctness
* viewpoint to invalidate cache entries.
* XXX Why isn't this done after the venus_rmdir call?
*/
/* Look up child in name cache (by name, from parent). */
cp = coda_nc_lookup(dcp, nm, len, cred);
/* If found, remove all children of the child (., ..). */
if (cp) coda_nc_zapParentfid(&(cp->c_fid), NOT_DOWNCALL);
/* Remove child's own entry. */
coda_nc_zapfile(dcp, nm, len);
/* Invalidate parent's attr cache (the modification time has changed). */
dcp->c_flags &= ~C_VATTR;
error = venus_rmdir(vtomi(dvp), &dcp->c_fid, nm, len, cred, l);
CODADEBUG(CODA_RMDIR, myprintf(("in rmdir result %d\n", error)); )
exit:
/* unlock and release child */
if (dvp == vp) {
vrele(vp);
} else {
vput(vp);
}
return(error);
}
int
coda_symlink(void *v)
{
/* true args */
struct vop_symlink_v3_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
/* a_vpp is used in place below */
struct componentname *cnp = ap->a_cnp;
struct vattr *tva = ap->a_vap;
char *path = ap->a_target;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
u_long saved_cn_flags;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
int plen = strlen(path);
/*
* Here's the strategy for the moment: perform the symlink, then
* do a lookup to grab the resulting vnode. I know this requires
* two communications with Venus for a new symbolic link, but
* that's the way the ball bounces. I don't yet want to change
* the way the Mach symlink works. When Mach support is
* deprecated, we should change symlink so that the common case
* returns the resultant vnode in a vpp argument.
*/
MARK_ENTRY(CODA_SYMLINK_STATS);
/* Check for symlink of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
MARK_INT_FAIL(CODA_SYMLINK_STATS);
error = EACCES;
goto exit;
}
if (plen+1 > CODA_MAXPATHLEN) {
MARK_INT_FAIL(CODA_SYMLINK_STATS);
error = EINVAL;
goto exit;
}
if (len+1 > CODA_MAXNAMLEN) {
MARK_INT_FAIL(CODA_SYMLINK_STATS);
error = EINVAL;
goto exit;
}
error = venus_symlink(vtomi(dvp), &dcp->c_fid, path, plen, nm, len, tva, cred, l);
/* Invalidate the parent's attr cache (modification time has changed). */
dcp->c_flags &= ~C_VATTR;
if (!error) {
/*
* VOP_SYMLINK is not defined to pay attention to cnp->cn_flags;
* these are defined only for VOP_LOOKUP. We desire to reuse
* cnp for a VOP_LOOKUP operation, and must be sure to not pass
* stray flags passed to us. Such stray flags can occur because
* sys_symlink makes a namei call and then reuses the
* componentname structure.
*/
/*
* XXX Arguably we should create our own componentname structure
* and not reuse the one that was passed in.
*/
saved_cn_flags = cnp->cn_flags;
cnp->cn_flags &= ~(MODMASK | OPMASK);
cnp->cn_flags |= LOOKUP;
error = VOP_LOOKUP(dvp, ap->a_vpp, cnp);
cnp->cn_flags = saved_cn_flags;
}
exit:
CODADEBUG(CODA_SYMLINK, myprintf(("in symlink result %d\n",error)); )
return(error);
}
/*
* Read directory entries.
*/
int
coda_readdir(void *v)
{
/* true args */
struct vop_readdir_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
struct uio *uiop = ap->a_uio;
kauth_cred_t cred = ap->a_cred;
int *eofflag = ap->a_eofflag;
/* upcall decl */
/* locals */
size_t initial_resid = uiop->uio_resid;
int error = 0;
int opened_internally = 0;
int ncookies;
char *buf;
struct vnode *cvp;
struct dirent *dirp;
MARK_ENTRY(CODA_READDIR_STATS);
CODADEBUG(CODA_READDIR, myprintf(("%s: (%p, %lu, %lld)\n", __func__,
uiop->uio_iov->iov_base, (unsigned long) uiop->uio_resid,
(long long) uiop->uio_offset)); )
/* Check for readdir of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_READDIR_STATS);
return ENOENT;
}
/* If directory is not already open do an "internal open" on it. */
if (cp->c_ovp == NULL) {
opened_internally = 1;
MARK_INT_GEN(CODA_OPEN_STATS);
error = VOP_OPEN(vp, FREAD, cred);
#ifdef CODA_VERBOSE
printf("%s: Internally Opening %p\n", __func__, vp);
#endif
if (error)
return error;
KASSERT(cp->c_ovp != NULL);
}
cvp = cp->c_ovp;
CODADEBUG(CODA_READDIR, myprintf(("%s: fid = %s, refcnt = %d\n",
__func__, coda_f2s(&cp->c_fid), vrefcnt(cvp))); )
if (ap->a_ncookies) {
ncookies = ap->a_uio->uio_resid / _DIRENT_RECLEN(dirp, 1);
*ap->a_ncookies = 0;
*ap->a_cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
}
buf = kmem_alloc(CODA_DIRBLKSIZ, KM_SLEEP);
dirp = kmem_alloc(sizeof(*dirp), KM_SLEEP);
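/*
 * The loop below reads the container file in CODA_DIRBLKSIZ chunks and
 * converts each struct venus_dirent into a native struct dirent before
 * copying it out with uiomove().  uio_offset tracks offsets in the
 * Venus directory format, so it is saved and restored around uiomove()
 * and then advanced by the Venus record length rather than the native
 * one.
 */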
vn_lock(cvp, LK_EXCLUSIVE | LK_RETRY);
while (error == 0) {
size_t resid = 0;
char *dp, *ep;
if (!ALIGNED_POINTER(uiop->uio_offset, uint32_t)) {
error = EINVAL;
break;
}
error = vn_rdwr(UIO_READ, cvp, buf,
CODA_DIRBLKSIZ, uiop->uio_offset,
UIO_SYSSPACE, IO_NODELOCKED, cred, &resid, curlwp);
if (error || resid == CODA_DIRBLKSIZ)
break;
for (dp = buf, ep = dp + CODA_DIRBLKSIZ - resid; dp < ep; ) {
off_t off;
struct venus_dirent *vd = (struct venus_dirent *)dp;
if (!ALIGNED_POINTER(vd, uint32_t) ||
!ALIGNED_POINTER(vd->d_reclen, uint32_t) ||
vd->d_reclen == 0) {
error = EINVAL;
break;
}
if (dp + vd->d_reclen > ep) {
error = ENAMETOOLONG;
break;
}
if (vd->d_namlen == 0) {
uiop->uio_offset += vd->d_reclen;
dp += vd->d_reclen;
continue;
}
dirp->d_fileno = vd->d_fileno;
dirp->d_type = vd->d_type;
dirp->d_namlen = vd->d_namlen;
dirp->d_reclen = _DIRENT_SIZE(dirp);
strlcpy(dirp->d_name, vd->d_name, dirp->d_namlen + 1);
if (uiop->uio_resid < dirp->d_reclen) {
error = ENAMETOOLONG;
break;
}
off = uiop->uio_offset;
error = uiomove(dirp, dirp->d_reclen, uiop);
uiop->uio_offset = off;
if (error)
break;
uiop->uio_offset += vd->d_reclen;
dp += vd->d_reclen;
if (ap->a_ncookies)
(*ap->a_cookies)[(*ap->a_ncookies)++] =
uiop->uio_offset;
}
}
VOP_UNLOCK(cvp);
kmem_free(dirp, sizeof(*dirp));
kmem_free(buf, CODA_DIRBLKSIZ);
if (eofflag && error == 0)
*eofflag = 1;
if (uiop->uio_resid < initial_resid && error == ENAMETOOLONG)
error = 0;
if (ap->a_ncookies && error) {
free(*ap->a_cookies, M_TEMP);
*ap->a_ncookies = 0;
*ap->a_cookies = NULL;
}
if (error)
MARK_INT_FAIL(CODA_READDIR_STATS);
else
MARK_INT_SAT(CODA_READDIR_STATS);
/* Do an "internal close" if necessary. */
if (opened_internally) {
MARK_INT_GEN(CODA_CLOSE_STATS);
(void)VOP_CLOSE(vp, FREAD, cred);
}
return error;
}
/*
* Convert from file system blocks to device blocks
*/
int
coda_bmap(void *v)
{
/* XXX on the global proc */
/* true args */
struct vop_bmap_args *ap = v;
vnode_t *vp __unused = ap->a_vp; /* file's vnode */
daddr_t bn __unused = ap->a_bn; /* fs block number */
vnode_t **vpp = ap->a_vpp; /* RETURN vp of device */
daddr_t *bnp __unused = ap->a_bnp; /* RETURN device block number */
struct lwp *l __unused = curlwp;
/* upcall decl */
/* locals */
*vpp = (vnode_t *)0;
myprintf(("coda_bmap called!\n"));
return(EINVAL);
}
/*
* I don't think the following two things are used anywhere, so I've
* commented them out
*
* struct buf *async_bufhead;
* int async_daemon_count;
*/
int
coda_strategy(void *v)
{
/* true args */
struct vop_strategy_args *ap = v;
struct buf *bp __unused = ap->a_bp;
struct lwp *l __unused = curlwp;
/* upcall decl */
/* locals */
myprintf(("coda_strategy called! "));
return(EINVAL);
}
int
coda_reclaim(void *v)
{
/* true args */
struct vop_reclaim_v2_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
/* upcall decl */
/* locals */
VOP_UNLOCK(vp);
/*
* Forced unmount/flush will let vnodes with a non-zero use count be destroyed!
*/
ENTRY;
if (IS_UNMOUNTING(cp)) {
#ifdef DEBUG
if (VTOC(vp)->c_ovp) {
if (IS_UNMOUNTING(cp))
printf("%s: c_ovp not void: vp %p, cp %p\n", __func__, vp, cp);
}
#endif
} else {
#ifdef OLD_DIAGNOSTIC
if (vrefcnt(vp) != 0)
print("%s: pushing active %p\n", __func__, vp);
if (VTOC(vp)->c_ovp) {
panic("%s: c_ovp not void", __func__);
}
#endif
}
/* If an array has been allocated to hold the symlink, deallocate it */
if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) {
if (cp->c_symlink == NULL)
panic("%s: null symlink pointer in cnode", __func__);
CODA_FREE(cp->c_symlink, cp->c_symlen);
cp->c_flags &= ~C_SYMLINK;
cp->c_symlen = 0;
}
mutex_enter(vp->v_interlock);
mutex_enter(&cp->c_lock);
SET_VTOC(vp) = NULL;
mutex_exit(&cp->c_lock);
mutex_exit(vp->v_interlock);
mutex_destroy(&cp->c_lock);
kmem_free(cp, sizeof(*cp));
return (0);
}
int
coda_lock(void *v)
{
/* true args */
struct vop_lock_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
/* upcall decl */
/* locals */
ENTRY;
if (coda_lockdebug) {
myprintf(("Attempting lock on %s\n",
coda_f2s(&cp->c_fid)));
}
return genfs_lock(v);
}
int
coda_unlock(void *v)
{
/* true args */
struct vop_unlock_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
/* upcall decl */
/* locals */
ENTRY;
if (coda_lockdebug) {
myprintf(("Attempting unlock on %s\n",
coda_f2s(&cp->c_fid)));
}
return genfs_unlock(v);
}
int
coda_islocked(void *v)
{
/* true args */
ENTRY;
return genfs_islocked(v);
}
int
coda_pathconf(void *v)
{
struct vop_pathconf_args *ap = v;
switch (ap->a_name) {
default:
return EINVAL;
}
/* NOTREACHED */
}
/*
* Given a device and inode, obtain a locked vnode. One reference is
* obtained and passed back to the caller.
*/
int
coda_grab_vnode(vnode_t *uvp, dev_t dev, ino_t ino, vnode_t **vpp)
{
int error;
struct mount *mp;
/* Obtain mount point structure from device. */
if (!(mp = devtomp(dev))) {
myprintf(("%s: devtomp(0x%llx) returns NULL\n", __func__,
(unsigned long long)dev));
return(ENXIO);
}
/*
* Obtain vnode from mount point and inode.
*/
error = VFS_VGET(mp, ino, LK_EXCLUSIVE, vpp);
if (error) {
myprintf(("%s: iget/vget(0x%llx, %llu) returns %p, err %d\n", __func__,
(unsigned long long)dev, (unsigned long long)ino, *vpp, error));
return(ENOENT);
}
/* share the underlying vnode lock with the coda vnode */
vshareilock(*vpp, uvp);
KASSERT(VOP_ISLOCKED(*vpp));
return(0);
}
static void
coda_print_vattr(struct vattr *attr)
{
const char *typestr;
switch (attr->va_type) {
case VNON:
typestr = "VNON";
break;
case VREG:
typestr = "VREG";
break;
case VDIR:
typestr = "VDIR";
break;
case VBLK:
typestr = "VBLK";
break;
case VCHR:
typestr = "VCHR";
break;
case VLNK:
typestr = "VLNK";
break;
case VSOCK:
typestr = "VSCK";
break;
case VFIFO:
typestr = "VFFO";
break;
case VBAD:
typestr = "VBAD";
break;
default:
typestr = "????";
break;
}
myprintf(("attr: type %s mode %d uid %d gid %d fsid %d rdev %d\n",
typestr, (int)attr->va_mode, (int)attr->va_uid,
(int)attr->va_gid, (int)attr->va_fsid, (int)attr->va_rdev));
myprintf((" fileid %d nlink %d size %d blocksize %d bytes %d\n",
(int)attr->va_fileid, (int)attr->va_nlink,
(int)attr->va_size,
(int)attr->va_blocksize,(int)attr->va_bytes));
myprintf((" gen %ld flags %ld vaflags %d\n",
attr->va_gen, attr->va_flags, attr->va_vaflags));
myprintf((" atime sec %d nsec %d\n",
(int)attr->va_atime.tv_sec, (int)attr->va_atime.tv_nsec));
myprintf((" mtime sec %d nsec %d\n",
(int)attr->va_mtime.tv_sec, (int)attr->va_mtime.tv_nsec));
myprintf((" ctime sec %d nsec %d\n",
(int)attr->va_ctime.tv_sec, (int)attr->va_ctime.tv_nsec));
}
/*
* Return a vnode for the given fid.
* If no cnode exists for this fid create one and put it
* in a table hashed by coda_f2i(). If the cnode for
* this fid is already in the table, return it (the ref count is
* incremented by coda_find). The cnode will be flushed from the
* table when coda_inactive calls coda_unsave.
*/
struct cnode *
make_coda_node(CodaFid *fid, struct mount *fvsp, short type)
{
int error __diagused;
struct vnode *vp;
struct cnode *cp;
error = vcache_get(fvsp, fid, sizeof(CodaFid), &vp);
KASSERT(error == 0);
mutex_enter(vp->v_interlock);
cp = VTOC(vp);
KASSERT(cp != NULL);
mutex_enter(&cp->c_lock);
mutex_exit(vp->v_interlock);
if (vp->v_type != type) {
if (vp->v_type == VCHR || vp->v_type == VBLK)
spec_node_destroy(vp);
vp->v_type = type;
if (type == VCHR || type == VBLK)
spec_node_init(vp, NODEV);
uvm_vnp_setsize(vp, 0);
}
mutex_exit(&cp->c_lock);
return cp;
}
/*
* coda_getpages may be called on a vnode which has not been opened,
* e.g. to fault in pages to execute a program. In that case, we must
* open the file to get the container. The vnode may or may not be
* locked, and we must leave it in the same state.
*/
int
coda_getpages(void *v)
{
struct vop_getpages_args /* {
vnode_t *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ *ap = v;
vnode_t *vp = ap->a_vp, *cvp;
struct cnode *cp = VTOC(vp);
struct lwp *l = curlwp;
kauth_cred_t cred = l->l_cred;
int error, cerror;
int waslocked; /* 1 if vnode lock was held on entry */
int didopen = 0; /* 1 if we opened container file */
krw_t op;
/*
* Handle a case that uvm_fault doesn't quite use yet.
* See layer_vnops.c for inspiration.
*/
if (ap->a_flags & PGO_LOCKED) {
return EBUSY;
}
KASSERT(rw_lock_held(vp->v_uobj.vmobjlock));
/* Check for control object. */
if (IS_CTL_VP(vp)) {
#ifdef CODA_VERBOSE
printf("%s: control object %p\n", __func__, vp);
#endif
return(EINVAL);
}
/*
* XXX It's really not ok to be releasing the lock we get,
* because we could be overlapping with another call to
* getpages and drop a lock they are relying on. We need to
* figure out whether getpages ever is called holding the
* lock, and if we should serialize getpages calls by some
* mechanism.
*/
/* XXX VOP_ISLOCKED() may not be used for lock decisions. */
op = rw_lock_op(vp->v_uobj.vmobjlock);
waslocked = VOP_ISLOCKED(vp);
/* Get container file if not already present. */
cvp = cp->c_ovp;
if (cvp == NULL) {
/*
* VOP_OPEN requires a locked vnode. We must avoid
* locking the vnode if it is already locked, and
* leave it in the same state on exit.
*/
if (waslocked == 0) {
rw_exit(vp->v_uobj.vmobjlock);
cerror = vn_lock(vp, LK_EXCLUSIVE);
if (cerror) {
#ifdef CODA_VERBOSE
printf("%s: can't lock vnode %p\n",
__func__, vp);
#endif
return cerror;
}
#ifdef CODA_VERBOSE
printf("%s: locked vnode %p\n", __func__, vp);
#endif
}
/*
* Open file (causes upcall to venus).
* XXX Perhaps we should not fully open the file, but
* simply obtain a container file.
*/
/* XXX Is it ok to do this while holding the mutex? */
cerror = VOP_OPEN(vp, FREAD, cred);
if (cerror) {
#ifdef CODA_VERBOSE
printf("%s: cannot open vnode %p => %d\n", __func__,
vp, cerror);
#endif
if (waslocked == 0)
VOP_UNLOCK(vp);
return cerror;
}
#ifdef CODA_VERBOSE
printf("%s: opened vnode %p\n", __func__, vp);
#endif
cvp = cp->c_ovp;
didopen = 1;
if (waslocked == 0)
rw_enter(vp->v_uobj.vmobjlock, op);
}
KASSERT(cvp != NULL);
/* Munge the arg structure to refer to the container vnode. */
KASSERT(cvp->v_uobj.vmobjlock == vp->v_uobj.vmobjlock);
ap->a_vp = cp->c_ovp;
/* Finally, call getpages on it. */
error = VCALL(ap->a_vp, VOFFSET(vop_getpages), ap);
/* If we opened the vnode, we must close it. */
if (didopen) {
/*
* VOP_CLOSE requires a locked vnode, but we are still
* holding the lock (or riding a caller's lock).
*/
cerror = VOP_CLOSE(vp, FREAD, cred);
#ifdef CODA_VERBOSE
if (cerror != 0)
/* XXX How should we handle this? */
printf("%s: closed vnode %p -> %d\n", __func__,
vp, cerror);
#endif
/* If we obtained a lock, drop it. */
if (waslocked == 0)
VOP_UNLOCK(vp);
}
return error;
}
/*
* The protocol requires v_interlock to be held by the caller.
*/
int
coda_putpages(void *v)
{
struct vop_putpages_args /* {
vnode_t *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ *ap = v;
vnode_t *vp = ap->a_vp, *cvp;
struct cnode *cp = VTOC(vp);
int error;
KASSERT(rw_write_held(vp->v_uobj.vmobjlock));
/* Check for control object. */
if (IS_CTL_VP(vp)) {
rw_exit(vp->v_uobj.vmobjlock);
#ifdef CODA_VERBOSE
printf("%s: control object %p\n", __func__, vp);
#endif
return 0;
}
/*
* If container object is not present, then there are no pages
* to put; just return without error. This happens all the
* time, apparently during discard of a closed vnode (which
* trivially can't have dirty pages).
*/
cvp = cp->c_ovp;
if (cvp == NULL) {
rw_exit(vp->v_uobj.vmobjlock);
return 0;
}
/* Munge the arg structure to refer to the container vnode. */
KASSERT(cvp->v_uobj.vmobjlock == vp->v_uobj.vmobjlock);
ap->a_vp = cvp;
/* Finally, call putpages on it. */
error = VCALL(ap->a_vp, VOFFSET(vop_putpages), ap);
return error;
}
/* $NetBSD: kern_descrip.c,v 1.262 2023/10/04 22:17:09 ad Exp $ */
/*-
* Copyright (c) 2008, 2009, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
*/
/*
* File descriptor management.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.262 2023/10/04 22:17:09 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/pool.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/sysctl.h>
#include <sys/ktrace.h>
/*
* A list (head) of open files, counter, and lock protecting them.
*/
struct filelist filehead __cacheline_aligned;
static u_int nfiles __cacheline_aligned;
kmutex_t filelist_lock __cacheline_aligned;
static pool_cache_t filedesc_cache __read_mostly;
static pool_cache_t file_cache __read_mostly;
static int file_ctor(void *, void *, int);
static void file_dtor(void *, void *);
static void fdfile_ctor(fdfile_t *);
static void fdfile_dtor(fdfile_t *);
static int filedesc_ctor(void *, void *, int);
static void filedesc_dtor(void *, void *);
static int filedescopen(dev_t, int, int, lwp_t *);
static int sysctl_kern_file(SYSCTLFN_PROTO);
static int sysctl_kern_file2(SYSCTLFN_PROTO);
static void fill_file(struct file *, const struct file *);
static void fill_file2(struct kinfo_file *, const file_t *, const fdfile_t *,
int, pid_t);
const struct cdevsw filedesc_cdevsw = {
.d_open = filedescopen,
.d_close = noclose,
.d_read = noread,
.d_write = nowrite,
.d_ioctl = noioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
/* For ease of reading. */
__strong_alias(fd_putvnode,fd_putfile)
__strong_alias(fd_putsock,fd_putfile)
/*
* Initialize the descriptor system.
*/
void
fd_sys_init(void)
{
static struct sysctllog *clog;
mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&filehead);
file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
KASSERT(file_cache != NULL);
filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
NULL);
KASSERT(filedesc_cache != NULL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "file",
SYSCTL_DESCR("System open file table"),
sysctl_kern_file, 0, NULL, 0,
CTL_KERN, KERN_FILE, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "file2",
SYSCTL_DESCR("System open file table"),
sysctl_kern_file2, 0, NULL, 0,
CTL_KERN, KERN_FILE2, CTL_EOL);
}
static bool
fd_isused(filedesc_t *fdp, unsigned fd)
{
u_int off = fd >> NDENTRYSHIFT;
KASSERT(fd < atomic_load_consume(&fdp->fd_dt)->dt_nfiles);
return (fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0;
}
/*
* Verify that the bitmaps match the descriptor table.
*/
static inline void
fd_checkmaps(filedesc_t *fdp)
{
#ifdef DEBUG
fdtab_t *dt;
u_int fd;
KASSERT(fdp->fd_refcnt <= 1 || mutex_owned(&fdp->fd_lock));
dt = fdp->fd_dt;
if (fdp->fd_refcnt == -1) {
/*
* fd_free tears down the table without maintaining its bitmap.
*/
return;
}
for (fd = 0; fd < dt->dt_nfiles; fd++) {
if (fd < NDFDFILE) {
KASSERT(dt->dt_ff[fd] ==
(fdfile_t *)fdp->fd_dfdfile[fd]);
}
if (dt->dt_ff[fd] == NULL) {
KASSERT(!fd_isused(fdp, fd));
} else if (dt->dt_ff[fd]->ff_file != NULL) {
KASSERT(fd_isused(fdp, fd));
}
}
#endif
}
static int
fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
{
int i, off, maxoff;
uint32_t sub;
KASSERT(mutex_owned(&fdp->fd_lock));
fd_checkmaps(fdp);
if (want > bits)
return -1;
off = want >> NDENTRYSHIFT;
i = want & NDENTRYMASK;
if (i) {
sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
if (sub != ~0)
goto found;
off++;
}
maxoff = NDLOSLOTS(bits);
while (off < maxoff) {
if ((sub = bitmap[off]) != ~0)
goto found;
off++;
}
return -1;
found:
return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
}
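/*
 * A brief illustration of the two-level bitmap (assuming the usual
 * NDENTRIES of 32, i.e. NDENTRYSHIFT of 5): descriptor fd corresponds
 * to bit (fd & NDENTRYMASK) of fd_lomap[fd >> NDENTRYSHIFT], so e.g.
 * fd 70 lives in fd_lomap[2], bit 6.  When a fd_lomap word becomes all
 * ones, the matching bit in fd_himap is set, which lets the search in
 * fd_alloc() skip a whole word of descriptors at a time.
 */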
static int
fd_last_set(filedesc_t *fd, int last)
{
int off, i;
fdfile_t **ff = fd->fd_dt->dt_ff;
uint32_t *bitmap = fd->fd_lomap;
KASSERT(mutex_owned(&fd->fd_lock));
fd_checkmaps(fd);
off = (last - 1) >> NDENTRYSHIFT;
while (off >= 0 && !bitmap[off])
off--;
if (off < 0)
return -1;
i = ((off + 1) << NDENTRYSHIFT) - 1;
if (i >= last)
i = last - 1;
/* XXX should use bitmap */
while (i > 0 && (ff[i] == NULL || !ff[i]->ff_allocated))
i--;
return i;
}
static inline void
fd_used(filedesc_t *fdp, unsigned fd)
{
u_int off = fd >> NDENTRYSHIFT;
fdfile_t *ff;
ff = fdp->fd_dt->dt_ff[fd];
KASSERT(mutex_owned(&fdp->fd_lock));
KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) == 0);
KASSERT(ff != NULL);
KASSERT(ff->ff_file == NULL);
KASSERT(!ff->ff_allocated);
ff->ff_allocated = true;
fdp->fd_lomap[off] |= 1U << (fd & NDENTRYMASK);
if (__predict_false(fdp->fd_lomap[off] == ~0)) {
KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
(1U << (off & NDENTRYMASK))) == 0);
fdp->fd_himap[off >> NDENTRYSHIFT] |= 1U << (off & NDENTRYMASK);
}
if ((int)fd > fdp->fd_lastfile) {
fdp->fd_lastfile = fd;
}
fd_checkmaps(fdp);
}
static inline void
fd_unused(filedesc_t *fdp, unsigned fd)
{
u_int off = fd >> NDENTRYSHIFT;
fdfile_t *ff;
ff = fdp->fd_dt->dt_ff[fd];
KASSERT(mutex_owned(&fdp->fd_lock));
KASSERT(ff != NULL);
KASSERT(ff->ff_file == NULL);
KASSERT(ff->ff_allocated);
if (fd < fdp->fd_freefile) {
fdp->fd_freefile = fd;
}
if (fdp->fd_lomap[off] == ~0) {
KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
(1U << (off & NDENTRYMASK))) != 0);
fdp->fd_himap[off >> NDENTRYSHIFT] &=
~(1U << (off & NDENTRYMASK));
}
KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0);
fdp->fd_lomap[off] &= ~(1U << (fd & NDENTRYMASK));
ff->ff_allocated = false;
KASSERT(fd <= fdp->fd_lastfile);
if (fd == fdp->fd_lastfile) {
fdp->fd_lastfile = fd_last_set(fdp, fd);
}
fd_checkmaps(fdp);
}
/*
* Look up the file structure corresponding to a file descriptor
* and return the file, holding a reference on the descriptor.
*/
file_t *
fd_getfile(unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
/*
* Look up the fdfile structure representing this descriptor.
* We are doing this unlocked. See fd_tryexpand().
*/
fdp = curlwp->l_fd;
dt = atomic_load_consume(&fdp->fd_dt);
if (__predict_false(fd >= dt->dt_nfiles)) {
return NULL;
}
ff = dt->dt_ff[fd];
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (__predict_false(ff == NULL)) {
return NULL;
}
/* Now get a reference to the descriptor. */
if (fdp->fd_refcnt == 1) {
/*
* Single threaded: don't need to worry about concurrent
* access (other than earlier calls to kqueue, which may
* hold a reference to the descriptor).
*/
ff->ff_refcnt++;
} else {
/*
* Multi threaded: issue a memory barrier to ensure that we
* acquire the file pointer _after_ adding a reference. If
* no memory barrier, we could fetch a stale pointer.
*
* In particular, we must coordinate the following four
* memory operations:
*
* A. fd_close store ff->ff_file = NULL
* B. fd_close refcnt = atomic_dec_uint_nv(&ff->ff_refcnt)
* C. fd_getfile atomic_inc_uint(&ff->ff_refcnt)
* D. fd_getfile load fp = ff->ff_file
*
* If the order is D;A;B;C:
*
* 1. D: fp = ff->ff_file
* 2. A: ff->ff_file = NULL
* 3. B: refcnt = atomic_dec_uint_nv(&ff->ff_refcnt)
* 4. C: atomic_inc_uint(&ff->ff_refcnt)
*
* then fd_close determines that there are no more
* references and decides to free fp immediately, at
* the same time that fd_getfile ends up with an fp that's
* about to be freed. *boom*
*
* By making B a release operation in fd_close, and by
* making C an acquire operation in fd_getfile, since
* they are atomic operations on the same object, which
* has a total modification order, we guarantee either:
*
* - B happens before C. Then since A is
* sequenced before B in fd_close, and C is
* sequenced before D in fd_getfile, we
* guarantee A happens before D, so fd_getfile
* reads a null fp and safely fails.
*
* - C happens before B. Then fd_getfile may read
* null or nonnull, but either way, fd_close
* will safely wait for references to drain.
*/
atomic_inc_uint(&ff->ff_refcnt);
membar_acquire();
}
/*
* If the file is not open or is being closed then put the
* reference back.
*/
fp = atomic_load_consume(&ff->ff_file);
if (__predict_true(fp != NULL)) {
return fp;
}
fd_putfile(fd);
return NULL;
}
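/*
 * A minimal usage sketch (not from this file): callers pair every
 * successful fd_getfile() with fd_putfile() on the same descriptor,
 * roughly
 *
 *	file_t *fp;
 *
 *	if ((fp = fd_getfile(fd)) == NULL)
 *		return EBADF;
 *	error = (*fp->f_ops->fo_ioctl)(fp, com, data);
 *	fd_putfile(fd);
 *
 * The reference taken is on the descriptor slot (ff_refcnt), not on
 * the file_t itself; fd_close() waits for such references to drain.
 */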
/*
* Release a reference to a file descriptor acquired with fd_getfile().
*/
void
fd_putfile(unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
u_int u, v;
fdp = curlwp->l_fd;
KASSERT(fd < atomic_load_consume(&fdp->fd_dt)->dt_nfiles);
ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
KASSERT(ff != NULL);
KASSERT((ff->ff_refcnt & FR_MASK) > 0);
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (fdp->fd_refcnt == 1) {
/*
* Single threaded: don't need to worry about concurrent
* access (other than earlier calls to kqueue, which may
* hold a reference to the descriptor).
*/
if (__predict_false((ff->ff_refcnt & FR_CLOSING) != 0)) {
fd_close(fd);
return;
}
ff->ff_refcnt--;
return;
}
/*
* Ensure that any use of the file is complete and globally
* visible before dropping the final reference. If no membar,
* the current CPU could still access memory associated with
* the file after it has been freed or recycled by another
* CPU.
*/
membar_release();
/*
* Be optimistic and start out with the assumption that no other
* threads are trying to close the descriptor. If the CAS fails,
* we lost a race and/or it's being closed.
*/
for (u = ff->ff_refcnt & FR_MASK;; u = v) {
v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
if (__predict_true(u == v)) {
return;
}
if (__predict_false((v & FR_CLOSING) != 0)) {
break;
}
}
/* Another thread is waiting to close the file: join it. */
(void)fd_close(fd);
}
/*
* Convenience wrapper around fd_getfile() that returns reference
* to a vnode.
*/
int
fd_getvnode(unsigned fd, file_t **fpp)
{
vnode_t *vp;
file_t *fp;
fp = fd_getfile(fd);
if (__predict_false(fp == NULL)) {
return EBADF;
}
if (__predict_false(fp->f_type != DTYPE_VNODE)) {
fd_putfile(fd);
return EINVAL;
}
vp = fp->f_vnode;
if (__predict_false(vp->v_type == VBAD)) {
/* XXX Is this case really necessary? */
fd_putfile(fd);
return EBADF;
}
*fpp = fp;
return 0;
}
/*
* Convenience wrapper around fd_getfile() that returns reference
* to a socket.
*/
int
fd_getsock1(unsigned fd, struct socket **sop, file_t **fp)
{
*fp = fd_getfile(fd);
if (__predict_false(*fp == NULL)) {
return EBADF;
}
if (__predict_false((*fp)->f_type != DTYPE_SOCKET)) {
fd_putfile(fd);
return ENOTSOCK;
}
*sop = (*fp)->f_socket;
return 0;
}
int
fd_getsock(unsigned fd, struct socket **sop)
{
file_t *fp;
return fd_getsock1(fd, sop, &fp);
}
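/*
 * Usage sketch (illustrative): socket system calls fetch the socket
 * and drop the descriptor reference with fd_putfile() when done,
 * e.g.
 *
 *	struct socket *so;
 *	int error;
 *
 *	if ((error = fd_getsock(fd, &so)) != 0)
 *		return error;
 *	... operate on so ...
 *	fd_putfile(fd);
 */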
/*
* Look up the file structure corresponding to a file descriptor
* and return it with a reference held on the file, not the
* descriptor.
*
* This is heavyweight and only used when accessing descriptors
* from a foreign process. The caller must ensure that `p' does
* not exit or fork across this call.
*
* To release the file (not descriptor) reference, use closef().
*/
file_t *
fd_getfile2(proc_t *p, unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
fdp = p->p_fd;
mutex_enter(&fdp->fd_lock);
dt = fdp->fd_dt;
if (fd >= dt->dt_nfiles) {
mutex_exit(&fdp->fd_lock);
return NULL;
}
if ((ff = dt->dt_ff[fd]) == NULL) {
mutex_exit(&fdp->fd_lock);
return NULL;
}
if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
mutex_exit(&fdp->fd_lock);
return NULL;
}
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
mutex_exit(&fdp->fd_lock);
return fp;
}
/*
* Internal form of close. Must be called with a reference to the
* descriptor, and will drop the reference. When all descriptor
* references are dropped, releases the descriptor slot and a single
* reference to the file structure.
*/
int
fd_close(unsigned fd)
{
struct flock lf;
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
proc_t *p;
lwp_t *l;
u_int refcnt;
l = curlwp;
p = l->l_proc;
fdp = l->l_fd;
ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
mutex_enter(&fdp->fd_lock);
KASSERT((ff->ff_refcnt & FR_MASK) > 0);
fp = atomic_load_consume(&ff->ff_file);
if (__predict_false(fp == NULL)) {
/*
* Another user of the file is already closing, and is
* waiting for other users of the file to drain. Release
* our reference, and wake up the closer.
*/
membar_release();
atomic_dec_uint(&ff->ff_refcnt);
cv_broadcast(&ff->ff_closing);
mutex_exit(&fdp->fd_lock);
/*
* An application error, so pretend that the descriptor
* was already closed. We can't safely wait for it to
* be closed without potentially deadlocking.
*/
return (EBADF);
}
KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
/*
* There may be multiple users of this file within the process.
* Notify existing and new users that the file is closing. This
* will prevent them from adding additional uses to this file
* while we are closing it.
*/
atomic_store_relaxed(&ff->ff_file, NULL);
ff->ff_exclose = false;
/*
* We expect the caller to hold a descriptor reference - drop it.
* The reference count may increase beyond zero at this point due
* to an erroneous descriptor reference by an application, but
* fd_getfile() will notice that the file is being closed and drop
* the reference again.
*/
if (fdp->fd_refcnt == 1) {
/* Single threaded. */
refcnt = --(ff->ff_refcnt);
} else {
/* Multi threaded. */
membar_release();
refcnt = atomic_dec_uint_nv(&ff->ff_refcnt);
membar_acquire();
}
if (__predict_false(refcnt != 0)) {
/*
* Wait for other references to drain. This is typically
* an application error - the descriptor is being closed
* while still in use.
* (Or just a threaded application trying to unblock its
* thread that sleeps in (say) accept()).
*/
atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
/*
* Remove any knotes attached to the file. A knote
* attached to the descriptor can hold references on it.
*/
mutex_exit(&fdp->fd_lock);
if (!SLIST_EMPTY(&ff->ff_knlist)) {
knote_fdclose(fd);
}
/*
* Since the file system code doesn't know which fd
* each request came from (think dup()), we have to
* ask it to return ERESTART for any long-term blocks.
* The re-entry through read/write/etc will detect the
* closed fd and return EBADF.
* Blocked partial writes may return a short length.
*/
(*fp->f_ops->fo_restart)(fp);
mutex_enter(&fdp->fd_lock);
/*
* We need to see the count drop to zero at least once,
* in order to ensure that all pre-existing references
* have been drained. New references past this point are
* of no interest.
* XXX (dsl) this may need to call fo_restart() after a
* timeout to guarantee that all the system calls exit.
*/
while ((ff->ff_refcnt & FR_MASK) != 0) {
cv_wait(&ff->ff_closing, &fdp->fd_lock);
}
atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
} else {
/* If no references, there must be no knotes. */
KASSERT(SLIST_EMPTY(&ff->ff_knlist));
}
/*
* POSIX record locking dictates that any close releases ALL
* locks owned by this process. This is handled by setting
* a flag in the unlock to free ONLY locks obeying POSIX
* semantics, and not to free BSD-style file locks.
* If the descriptor was in a message, POSIX-style locks
* aren't passed with the descriptor.
*/
if (__predict_false((p->p_flag & PK_ADVLOCK) != 0) &&
fp->f_ops->fo_advlock != NULL) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
mutex_exit(&fdp->fd_lock);
(void)(*fp->f_ops->fo_advlock)(fp, p, F_UNLCK, &lf, F_POSIX);
mutex_enter(&fdp->fd_lock);
}
/* Free descriptor slot. */
fd_unused(fdp, fd);
mutex_exit(&fdp->fd_lock);
/* Now drop reference to the file itself. */
return closef(fp);
}
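/*
 * For orientation, the close(2) system call is essentially the
 * following sketch (simplified): take a descriptor reference to make
 * sure the slot is live, then hand it to fd_close(), which consumes
 * the reference.
 *
 *	if (fd_getfile(fd) == NULL)
 *		return EBADF;
 *	return fd_close(fd);
 */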
/*
* Duplicate a file descriptor.
*/
int
fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
{
proc_t *p = curproc;
fdtab_t *dt;
int error;
while ((error = fd_alloc(p, minfd, newp)) != 0) {
if (error != ENOSPC) {
return error;
}
fd_tryexpand(p);
}
dt = atomic_load_consume(&curlwp->l_fd->fd_dt);
dt->dt_ff[*newp]->ff_exclose = exclose;
fd_affix(p, fp, *newp);
return 0;
}
/*
* dup2 operation.
*/
int
fd_dup2(file_t *fp, unsigned newfd, int flags)
{
filedesc_t *fdp = curlwp->l_fd;
fdfile_t *ff;
fdtab_t *dt;
if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE))
return EINVAL;
/*
* Ensure there are enough slots in the descriptor table,
* and allocate an fdfile_t up front in case we need it.
*/
while (newfd >= atomic_load_consume(&fdp->fd_dt)->dt_nfiles) {
fd_tryexpand(curproc);
}
ff = kmem_alloc(sizeof(*ff), KM_SLEEP);
fdfile_ctor(ff);
/*
* If there is already a file open, close it. If the file is
* half open, wait for it to be constructed before closing it.
* XXX Potential for deadlock here?
*/
mutex_enter(&fdp->fd_lock);
while (fd_isused(fdp, newfd)) {
mutex_exit(&fdp->fd_lock);
if (fd_getfile(newfd) != NULL) {
(void)fd_close(newfd);
} else {
/*
* Crummy, but unlikely to happen.
* Can occur if we interrupt another
* thread while it is opening a file.
*/
kpause("dup2", false, 1, NULL);
}
mutex_enter(&fdp->fd_lock);
}
dt = fdp->fd_dt;
if (dt->dt_ff[newfd] == NULL) {
KASSERT(newfd >= NDFDFILE);
dt->dt_ff[newfd] = ff;
ff = NULL;
}
fd_used(fdp, newfd);
mutex_exit(&fdp->fd_lock);
dt->dt_ff[newfd]->ff_exclose = (flags & O_CLOEXEC) != 0;
fp->f_flag |= flags & (FNONBLOCK|FNOSIGPIPE);
/* Slot is now allocated. Insert copy of the file. */
fd_affix(curproc, fp, newfd);
if (ff != NULL) {
cv_destroy(&ff->ff_closing);
kmem_free(ff, sizeof(*ff));
}
return 0;
}
/*
* Drop reference to a file structure.
*/
int
closef(file_t *fp)
{
struct flock lf;
int error;
/*
* Drop reference. If referenced elsewhere it's still open
* and we have nothing more to do.
*/
mutex_enter(&fp->f_lock);
KASSERT(fp->f_count > 0);
if (--fp->f_count > 0) {
mutex_exit(&fp->f_lock);
return 0;
}
KASSERT(fp->f_count == 0);
mutex_exit(&fp->f_lock);
/* We held the last reference - release locks, close and free. */
if (fp->f_ops->fo_advlock == NULL) {
KASSERT((fp->f_flag & FHASLOCK) == 0);
} else if (fp->f_flag & FHASLOCK) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
(void)(*fp->f_ops->fo_advlock)(fp, fp, F_UNLCK, &lf, F_FLOCK);
}
if (fp->f_ops != NULL) {
error = (*fp->f_ops->fo_close)(fp);
} else {
error = 0;
}
KASSERT(fp->f_count == 0);
KASSERT(fp->f_cred != NULL);
pool_cache_put(file_cache, fp);
return error;
}
/*
* Allocate a file descriptor for the process.
*
* Future idea for experimentation: replace all of this with radixtree.
*/
int
fd_alloc(proc_t *p, int want, int *result)
{
filedesc_t *fdp = p->p_fd;
int i, lim, last, error, hi;
u_int off;
fdtab_t *dt;
KASSERT(p == curproc || p == &proc0);
/*
* Search for a free descriptor starting at the higher
* of want or fd_freefile.
*/
mutex_enter(&fdp->fd_lock);
fd_checkmaps(fdp);
dt = fdp->fd_dt;
KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
last = uimin(dt->dt_nfiles, lim);
for (;;) {
if ((i = want) < fdp->fd_freefile)
i = fdp->fd_freefile;
off = i >> NDENTRYSHIFT;
hi = fd_next_zero(fdp, fdp->fd_himap, off,
(last + NDENTRIES - 1) >> NDENTRYSHIFT);
if (hi == -1)
break;
i = fd_next_zero(fdp, &fdp->fd_lomap[hi],
hi > off ? 0 : i & NDENTRYMASK, NDENTRIES);
if (i == -1) {
/*
* Free file descriptor in this block was
* below want, try again with higher want.
*/
want = (hi + 1) << NDENTRYSHIFT;
continue;
}
i += (hi << NDENTRYSHIFT);
if (i >= last) {
break;
}
if (dt->dt_ff[i] == NULL) {
KASSERT(i >= NDFDFILE);
dt->dt_ff[i] = kmem_alloc(sizeof(fdfile_t), KM_SLEEP);
fdfile_ctor(dt->dt_ff[i]);
}
KASSERT(dt->dt_ff[i]->ff_file == NULL);
fd_used(fdp, i);
if (want <= fdp->fd_freefile) {
fdp->fd_freefile = i;
}
*result = i;
KASSERT(i >= NDFDFILE ||
dt->dt_ff[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
fd_checkmaps(fdp);
mutex_exit(&fdp->fd_lock);
return 0;
}
/* No space in current array. Let the caller expand and retry. */
error = (dt->dt_nfiles >= lim) ? EMFILE : ENOSPC;
mutex_exit(&fdp->fd_lock);
return error;
}
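/*
 * Callers are expected to loop over fd_tryexpand() while fd_alloc()
 * returns ENOSPC (see fd_allocfile() and fd_dup()), roughly:
 *
 *	while ((error = fd_alloc(p, 0, &fd)) != 0) {
 *		if (error != ENOSPC)
 *			return error;
 *		fd_tryexpand(p);
 *	}
 */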
/*
* Allocate memory for a descriptor table.
*/
static fdtab_t *
fd_dtab_alloc(int n)
{
fdtab_t *dt;
size_t sz;
KASSERT(n > NDFILE);
sz = sizeof(*dt) + (n - NDFILE) * sizeof(dt->dt_ff[0]);
dt = kmem_alloc(sz, KM_SLEEP);
#ifdef DIAGNOSTIC
memset(dt, 0xff, sz);
#endif
dt->dt_nfiles = n;
dt->dt_link = NULL;
return dt;
}
/*
* Free a descriptor table, and all tables linked for deferred free.
*/
static void
fd_dtab_free(fdtab_t *dt)
{
fdtab_t *next;
size_t sz;
do {
next = dt->dt_link;
KASSERT(dt->dt_nfiles > NDFILE);
sz = sizeof(*dt) +
(dt->dt_nfiles - NDFILE) * sizeof(dt->dt_ff[0]);
#ifdef DIAGNOSTIC
memset(dt, 0xff, sz);
#endif
kmem_free(dt, sz);
dt = next;
} while (dt != NULL);
}
/*
* Allocate descriptor bitmap.
*/
static void
fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
{
uint8_t *ptr;
size_t szlo, szhi;
KASSERT(n > NDENTRIES);
szlo = NDLOSLOTS(n) * sizeof(uint32_t);
szhi = NDHISLOTS(n) * sizeof(uint32_t);
ptr = kmem_alloc(szlo + szhi, KM_SLEEP);
*lo = (uint32_t *)ptr;
*hi = (uint32_t *)(ptr + szlo);
}
/*
* Free descriptor bitmap.
*/
static void
fd_map_free(int n, uint32_t *lo, uint32_t *hi)
{
size_t szlo, szhi;
KASSERT(n > NDENTRIES);
szlo = NDLOSLOTS(n) * sizeof(uint32_t);
szhi = NDHISLOTS(n) * sizeof(uint32_t);
KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo));
kmem_free(lo, szlo + szhi);
}
/*
* Expand a process' descriptor table.
*/
void
fd_tryexpand(proc_t *p)
{
filedesc_t *fdp;
int i, numfiles, oldnfiles;
fdtab_t *newdt, *dt;
uint32_t *newhimap, *newlomap;
KASSERT(p == curproc || p == &proc0);
fdp = p->p_fd;
newhimap = NULL;
newlomap = NULL;
oldnfiles = atomic_load_consume(&fdp->fd_dt)->dt_nfiles;
if (oldnfiles < NDEXTENT)
numfiles = NDEXTENT;
else
numfiles = 2 * oldnfiles;
newdt = fd_dtab_alloc(numfiles);
if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
fd_map_alloc(numfiles, &newlomap, &newhimap);
}
mutex_enter(&fdp->fd_lock);
dt = fdp->fd_dt;
KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
if (dt->dt_nfiles != oldnfiles) {
/* fdp changed; caller must retry */
mutex_exit(&fdp->fd_lock);
fd_dtab_free(newdt);
if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
fd_map_free(numfiles, newlomap, newhimap);
}
return;
}
/* Copy the existing descriptor table and zero the new portion. */
i = sizeof(fdfile_t *) * oldnfiles;
memcpy(newdt->dt_ff, dt->dt_ff, i);
memset((uint8_t *)newdt->dt_ff + i, 0,
numfiles * sizeof(fdfile_t *) - i);
/*
* Link old descriptor array into list to be discarded. We defer
* freeing until the last reference to the descriptor table goes
* away (usually process exit). This allows us to do lockless
* lookups in fd_getfile().
*/
if (oldnfiles > NDFILE) {
if (fdp->fd_refcnt > 1) {
newdt->dt_link = dt;
} else {
fd_dtab_free(dt);
}
}
if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
memcpy(newhimap, fdp->fd_himap, i);
memset((uint8_t *)newhimap + i, 0,
NDHISLOTS(numfiles) * sizeof(uint32_t) - i);
i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
memcpy(newlomap, fdp->fd_lomap, i);
memset((uint8_t *)newlomap + i, 0,
NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);
if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
}
fdp->fd_himap = newhimap;
fdp->fd_lomap = newlomap;
}
/*
* All other modifications must become globally visible before
* the change to fd_dt. See fd_getfile().
*/
atomic_store_release(&fdp->fd_dt, newdt);
KASSERT(newdt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
fd_checkmaps(fdp);
mutex_exit(&fdp->fd_lock);
}
/*
* Create a new open file structure and allocate a file descriptor
* for the current process.
*/
int
fd_allocfile(file_t **resultfp, int *resultfd)
{
proc_t *p = curproc;
kauth_cred_t cred;
file_t *fp;
int error;
while ((error = fd_alloc(p, 0, resultfd)) != 0) {
if (error != ENOSPC) {
return error;
}
fd_tryexpand(p);
}
fp = pool_cache_get(file_cache, PR_WAITOK);
if (fp == NULL) {
fd_abort(p, NULL, *resultfd);
return ENFILE;
}
KASSERT(fp->f_count == 0);
KASSERT(fp->f_msgcount == 0);
KASSERT(fp->f_unpcount == 0);
/* Replace cached credentials if not what we need. */
cred = curlwp->l_cred;
if (__predict_false(cred != fp->f_cred)) {
kauth_cred_free(fp->f_cred);
fp->f_cred = kauth_cred_hold(cred);
}
/*
* Don't allow recycled files to be scanned.
* See uipc_usrreq.c.
*/
if (__predict_false((fp->f_flag & FSCAN) != 0)) {
mutex_enter(&fp->f_lock);
atomic_and_uint(&fp->f_flag, ~FSCAN);
mutex_exit(&fp->f_lock);
}
fp->f_advice = 0;
fp->f_offset = 0;
*resultfp = fp;
return 0;
}
/*
* Successful creation of a new descriptor: make visible to the process.
*/
void
fd_affix(proc_t *p, file_t *fp, unsigned fd)
{
fdfile_t *ff;
filedesc_t *fdp;
fdtab_t *dt;
KASSERT(p == curproc || p == &proc0);
/* Add a reference to the file structure. */
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
/*
* Insert the new file into the descriptor slot.
*/
fdp = p->p_fd;
dt = atomic_load_consume(&fdp->fd_dt);
ff = dt->dt_ff[fd];
KASSERT(ff != NULL);
KASSERT(ff->ff_file == NULL);
KASSERT(ff->ff_allocated);
KASSERT(fd_isused(fdp, fd));
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
/* No need to lock in order to make file initially visible. */
atomic_store_release(&ff->ff_file, fp);
}
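/*
 * Illustrative open-path sketch (simplified): a new descriptor is
 * created with fd_allocfile(), filled in, and then either published
 * with fd_affix() or backed out with fd_abort():
 *
 *	file_t *fp;
 *	int fd, error;
 *
 *	if ((error = fd_allocfile(&fp, &fd)) != 0)
 *		return error;
 *	error = set_up_file(fp);	(set_up_file is a hypothetical helper)
 *	if (error) {
 *		fd_abort(curproc, fp, fd);
 *		return error;
 *	}
 *	fd_affix(curproc, fp, fd);
 *	return 0;
 */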
/*
* Abort creation of a new descriptor: free descriptor slot and file.
*/
void
fd_abort(proc_t *p, file_t *fp, unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
KASSERT(p == curproc || p == &proc0);
fdp = p->p_fd;
ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
ff->ff_exclose = false;
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
mutex_enter(&fdp->fd_lock);
KASSERT(fd_isused(fdp, fd));
fd_unused(fdp, fd);
mutex_exit(&fdp->fd_lock);
if (fp != NULL) {
KASSERT(fp->f_count == 0);
KASSERT(fp->f_cred != NULL);
pool_cache_put(file_cache, fp);
}
}
static int
file_ctor(void *arg, void *obj, int flags)
{
/*
* It's easy to exhaust the open file limit on a system with many
* CPUs due to caching. Allow a bit of leeway to reduce the element
* of surprise.
*/
u_int slop = PCG_NOBJECTS_NORMAL * (ncpu - 1);
file_t *fp = obj;
memset(fp, 0, sizeof(*fp));
mutex_enter(&filelist_lock);
if (__predict_false(nfiles >= slop + maxfiles)) {
mutex_exit(&filelist_lock);
tablefull("file", "increase kern.maxfiles or MAXFILES");
return ENFILE;
}
nfiles++;
LIST_INSERT_HEAD(&filehead, fp, f_list);
mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
fp->f_cred = kauth_cred_hold(curlwp->l_cred);
mutex_exit(&filelist_lock);
return 0;
}
static void
file_dtor(void *arg, void *obj)
{
file_t *fp = obj;
mutex_enter(&filelist_lock);
nfiles--;
LIST_REMOVE(fp, f_list);
mutex_exit(&filelist_lock);
KASSERT(fp->f_count == 0);
kauth_cred_free(fp->f_cred);
mutex_destroy(&fp->f_lock);
}
static void
fdfile_ctor(fdfile_t *ff)
{
memset(ff, 0, sizeof(*ff));
cv_init(&ff->ff_closing, "fdclose");
}
static void
fdfile_dtor(fdfile_t *ff)
{
cv_destroy(&ff->ff_closing);
}
file_t *
fgetdummy(void)
{
file_t *fp;
fp = kmem_zalloc(sizeof(*fp), KM_SLEEP);
mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
return fp;
}
void
fputdummy(file_t *fp)
{
mutex_destroy(&fp->f_lock);
kmem_free(fp, sizeof(*fp));
}
/*
* Create an initial filedesc structure.
*/
filedesc_t *
fd_init(filedesc_t *fdp)
{
#ifdef DIAGNOSTIC
unsigned fd;
#endif
if (__predict_true(fdp == NULL)) {
fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
} else {
KASSERT(fdp == &filedesc0);
filedesc_ctor(NULL, fdp, PR_WAITOK);
}
#ifdef DIAGNOSTIC
KASSERT(fdp->fd_lastfile == -1);
KASSERT(fdp->fd_lastkqfile == -1);
KASSERT(fdp->fd_knhash == NULL);
KASSERT(fdp->fd_freefile == 0);
KASSERT(fdp->fd_exclose == false);
KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
for (fd = 0; fd < NDFDFILE; fd++) {
KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] ==
(fdfile_t *)fdp->fd_dfdfile[fd]);
}
for (fd = NDFDFILE; fd < NDFILE; fd++) {
KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == NULL);
}
KASSERT(fdp->fd_himap == fdp->fd_dhimap);
KASSERT(fdp->fd_lomap == fdp->fd_dlomap);
#endif /* DIAGNOSTIC */
fdp->fd_refcnt = 1;
fd_checkmaps(fdp);
return fdp;
}
/*
* Initialize a file descriptor table.
*/
static int
filedesc_ctor(void *arg, void *obj, int flag)
{
filedesc_t *fdp = obj;
fdfile_t **ffp;
int i;
memset(fdp, 0, sizeof(*fdp));
mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
fdp->fd_lastfile = -1;
fdp->fd_lastkqfile = -1;
fdp->fd_dt = &fdp->fd_dtbuiltin;
fdp->fd_dtbuiltin.dt_nfiles = NDFILE;
fdp->fd_himap = fdp->fd_dhimap;
fdp->fd_lomap = fdp->fd_dlomap;
CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
for (i = 0, ffp = fdp->fd_dt->dt_ff; i < NDFDFILE; i++, ffp++) {
fdfile_ctor(*ffp = (fdfile_t *)fdp->fd_dfdfile[i]);
}
return 0;
}
static void
filedesc_dtor(void *arg, void *obj)
{
filedesc_t *fdp = obj;
int i;
for (i = 0; i < NDFDFILE; i++) {
fdfile_dtor((fdfile_t *)fdp->fd_dfdfile[i]);
}
mutex_destroy(&fdp->fd_lock);
}
/*
* Make p share curproc's filedesc structure.
*/
void
fd_share(struct proc *p)
{
filedesc_t *fdp;
fdp = curlwp->l_fd;
p->p_fd = fdp;
atomic_inc_uint(&fdp->fd_refcnt);
}
/*
* Acquire a hold on a filedesc structure.
*/
void
fd_hold(lwp_t *l)
{
filedesc_t *fdp = l->l_fd;
atomic_inc_uint(&fdp->fd_refcnt);
}
/*
* Copy a filedesc structure.
*/
filedesc_t *
fd_copy(void)
{
filedesc_t *newfdp, *fdp;
fdfile_t *ff, **ffp, **nffp, *ff2;
int i, j, numfiles, lastfile, newlast;
file_t *fp;
fdtab_t *newdt;
fdp = curproc->p_fd;
newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
newfdp->fd_refcnt = 1;
#ifdef DIAGNOSTIC
KASSERT(newfdp->fd_lastfile == -1);
KASSERT(newfdp->fd_lastkqfile == -1);
KASSERT(newfdp->fd_knhash == NULL);
KASSERT(newfdp->fd_freefile == 0);
KASSERT(newfdp->fd_exclose == false);
KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
KASSERT(newfdp->fd_dtbuiltin.dt_nfiles == NDFILE);
for (i = 0; i < NDFDFILE; i++) {
KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] ==
(fdfile_t *)&newfdp->fd_dfdfile[i]);
}
for (i = NDFDFILE; i < NDFILE; i++) {
KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == NULL);
}
#endif /* DIAGNOSTIC */
mutex_enter(&fdp->fd_lock);
fd_checkmaps(fdp);
numfiles = fdp->fd_dt->dt_nfiles;
lastfile = fdp->fd_lastfile;
/*
* If the number of open files fits in the internal arrays
* of the open file structure, use them, otherwise allocate
* additional memory for the number of descriptors currently
* in use.
*/
if (lastfile < NDFILE) {
i = NDFILE;
newdt = newfdp->fd_dt;
KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
} else {
/*
* Compute the smallest multiple of NDEXTENT needed
* for the file descriptors currently in use,
* allowing the table to shrink.
*/
i = numfiles;
while (i >= 2 * NDEXTENT && i > lastfile * 2) {
i /= 2;
}
KASSERT(i > NDFILE);
newdt = fd_dtab_alloc(i);
newfdp->fd_dt = newdt;
memcpy(newdt->dt_ff, newfdp->fd_dtbuiltin.dt_ff,
NDFDFILE * sizeof(fdfile_t **));
memset(newdt->dt_ff + NDFDFILE, 0,
(i - NDFDFILE) * sizeof(fdfile_t **));
}
if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
newfdp->fd_himap = newfdp->fd_dhimap;
newfdp->fd_lomap = newfdp->fd_dlomap;
} else {
fd_map_alloc(i, &newfdp->fd_lomap, &newfdp->fd_himap);
KASSERT(i >= NDENTRIES * NDENTRIES);
memset(newfdp->fd_himap, 0, NDHISLOTS(i)*sizeof(uint32_t));
memset(newfdp->fd_lomap, 0, NDLOSLOTS(i)*sizeof(uint32_t));
}
newfdp->fd_freefile = fdp->fd_freefile;
newfdp->fd_exclose = fdp->fd_exclose;
ffp = fdp->fd_dt->dt_ff;
nffp = newdt->dt_ff;
newlast = -1;
for (i = 0; i <= lastfile; i++, ffp++, nffp++) {
KASSERT(i >= NDFDFILE ||
*nffp == (fdfile_t *)newfdp->fd_dfdfile[i]);
ff = *ffp;
if (ff == NULL || (fp = atomic_load_consume(&ff->ff_file)) == NULL) {
/* Descriptor unused, or descriptor half open. */
KASSERT(!fd_isused(newfdp, i));
continue;
}
if (__predict_false(fp->f_type == DTYPE_KQUEUE)) {
/* kqueue descriptors cannot be copied. */
if (i < newfdp->fd_freefile) {
newfdp->fd_freefile = i;
}
continue;
}
/* It's active: add a reference to the file. */
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
/* Allocate an fdfile_t to represent it. */
if (i >= NDFDFILE) {
ff2 = kmem_alloc(sizeof(*ff2), KM_SLEEP);
fdfile_ctor(ff2);
*nffp = ff2;
} else {
ff2 = newdt->dt_ff[i];
}
ff2->ff_file = fp;
ff2->ff_exclose = ff->ff_exclose;
ff2->ff_allocated = true;
/* Fix up bitmaps. */
j = i >> NDENTRYSHIFT;
KASSERT((newfdp->fd_lomap[j] & (1U << (i & NDENTRYMASK))) == 0);
newfdp->fd_lomap[j] |= 1U << (i & NDENTRYMASK);
if (__predict_false(newfdp->fd_lomap[j] == ~0)) {
KASSERT((newfdp->fd_himap[j >> NDENTRYSHIFT] &
(1U << (j & NDENTRYMASK))) == 0);
newfdp->fd_himap[j >> NDENTRYSHIFT] |=
1U << (j & NDENTRYMASK);
}
newlast = i;
}
KASSERT(newdt->dt_ff[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
newfdp->fd_lastfile = newlast;
fd_checkmaps(newfdp);
mutex_exit(&fdp->fd_lock);
return newfdp;
}
/*
* Release a filedesc structure.
*/
void
fd_free(void)
{
fdfile_t *ff;
file_t *fp;
int fd, nf;
fdtab_t *dt;
lwp_t * const l = curlwp;
filedesc_t * const fdp = l->l_fd;
const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0;
KASSERT(atomic_load_consume(&fdp->fd_dt)->dt_ff[0] ==
(fdfile_t *)fdp->fd_dfdfile[0]);
KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
membar_release();
if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
return;
membar_acquire();
/*
* Close any files that the process holds open.
*/
dt = fdp->fd_dt;
fd_checkmaps(fdp);
#ifdef DEBUG
fdp->fd_refcnt = -1; /* see fd_checkmaps */
#endif
for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) {
ff = dt->dt_ff[fd];
KASSERT(fd >= NDFDFILE ||
ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (ff == NULL)
continue;
if ((fp = atomic_load_consume(&ff->ff_file)) != NULL) {
/*
* Must use fd_close() here if there is
* a reference from kqueue or we might have posix
* advisory locks.
*/
if (__predict_true(ff->ff_refcnt == 0) && (noadvlock || fp->f_type != DTYPE_VNODE)) {
ff->ff_file = NULL;
ff->ff_exclose = false;
ff->ff_allocated = false;
closef(fp);
} else {
ff->ff_refcnt++;
fd_close(fd);
}
}
KASSERT(ff->ff_refcnt == 0);
KASSERT(ff->ff_file == NULL);
KASSERT(!ff->ff_exclose);
KASSERT(!ff->ff_allocated);
if (fd >= NDFDFILE) {
cv_destroy(&ff->ff_closing);
kmem_free(ff, sizeof(*ff));
dt->dt_ff[fd] = NULL;
}
}
/*
* Clean out the descriptor table for the next user and return
* to the cache.
*/
if (__predict_false(dt != &fdp->fd_dtbuiltin)) {
fd_dtab_free(fdp->fd_dt);
/* Otherwise, done above. */
memset(&fdp->fd_dtbuiltin.dt_ff[NDFDFILE], 0,
(NDFILE - NDFDFILE) * sizeof(fdp->fd_dtbuiltin.dt_ff[0]));
fdp->fd_dt = &fdp->fd_dtbuiltin;
}
if (__predict_false(NDHISLOTS(nf) > NDHISLOTS(NDFILE))) {
KASSERT(fdp->fd_himap != fdp->fd_dhimap);
KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
fd_map_free(nf, fdp->fd_lomap, fdp->fd_himap);
}
if (__predict_false(fdp->fd_knhash != NULL)) {
hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
fdp->fd_knhash = NULL;
fdp->fd_knhashmask = 0;
} else {
KASSERT(fdp->fd_knhashmask == 0);
}
fdp->fd_dt = &fdp->fd_dtbuiltin;
fdp->fd_lastkqfile = -1;
fdp->fd_lastfile = -1;
fdp->fd_freefile = 0;
fdp->fd_exclose = false;
memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
offsetof(filedesc_t, fd_startzero));
fdp->fd_himap = fdp->fd_dhimap;
fdp->fd_lomap = fdp->fd_dlomap;
KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
#ifdef DEBUG
fdp->fd_refcnt = 0; /* see fd_checkmaps */
#endif
fd_checkmaps(fdp);
pool_cache_put(filedesc_cache, fdp);
}
/*
* File Descriptor pseudo-device driver (/dev/fd/).
*
* Opening minor device N dup()s the file (if any) connected to file
* descriptor N belonging to the calling process. Note that this driver
* consists of only the ``open()'' routine, because all subsequent
* references to this file will be direct to the other driver.
*/
static int
filedescopen(dev_t dev, int mode, int type, lwp_t *l)
{
/*
* XXX Kludge: set dupfd to contain the value of the file
* descriptor being sought for duplication. The error
* return ensures that the vnode for this device will be released
* by vn_open. Open will detect this special error and take the
* actions in fd_dupopen below. Other callers of vn_open or VOP_OPEN
* will simply report the error.
*/
l->l_dupfd = minor(dev); /* XXX */
return EDUPFD;
}
/*
* Duplicate the specified descriptor to a free descriptor.
*
* old is the original fd.
* moveit is true if we should move rather than duplicate.
* flags are the open flags (converted from O_* to F*).
* newp returns the new fd on success.
*
* These two cases are produced by the EDUPFD and EMOVEFD magic
* errnos, but in the interest of removing that regrettable interface,
* vn_open has been changed to intercept them. Now vn_open returns
* either a vnode or a filehandle, and the filehandle is accompanied
* by a boolean that says whether we should dup (moveit == false) or
* move (moveit == true) the fd.
*
* The dup case is used by /dev/stderr, /proc/self/fd, and such. The
* move case is used by cloner devices that allocate a fd of their
* own (a layering violation that should go away eventually) that
* then needs to be put in the place open() expects it.
*/
int
fd_dupopen(int old, bool moveit, int flags, int *newp)
{
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
int error;
if ((fp = fd_getfile(old)) == NULL) {
return EBADF;
}
fdp = curlwp->l_fd;
dt = atomic_load_consume(&fdp->fd_dt);
ff = dt->dt_ff[old];
/*
* There are two cases of interest here.
*
* 1. moveit == false (used to be the EDUPFD magic errno):
* simply dup (old) to file descriptor (new) and return.
*
* 2. moveit == true (used to be the EMOVEFD magic errno):
* steal away the file structure from (old) and store it in
* (new). (old) is effectively closed by this operation.
*/
if (moveit == false) {
/*
* Check that the mode the file is being opened for is a
* subset of the mode of the existing descriptor.
*/
if (((flags & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
error = EACCES;
goto out;
}
/* Copy it. */
error = fd_dup(fp, 0, newp, ff->ff_exclose);
} else {
/* Copy it. */
error = fd_dup(fp, 0, newp, ff->ff_exclose);
if (error != 0) {
goto out;
}
/* Steal away the file pointer from 'old'. */
(void)fd_close(old);
return 0;
}
out:
fd_putfile(old);
return error;
}
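/*
 * Concrete example of the dup case (illustrative, simplified): an
 * open of /dev/stderr reaches filedescopen() with minor number 2, the
 * open path intercepts the EDUPFD indication coming back through
 * vn_open(), and ends up calling fd_dupopen(2, false, flags, &newfd),
 * which duplicates descriptor 2 into a fresh slot via fd_dup().
 */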
/*
* Close open files on exec.
*/
void
fd_closeexec(void)
{
proc_t *p;
filedesc_t *fdp;
fdfile_t *ff;
lwp_t *l;
fdtab_t *dt;
int fd;
l = curlwp;
p = l->l_proc;
fdp = p->p_fd;
if (fdp->fd_refcnt > 1) {
fdp = fd_copy();
fd_free();
p->p_fd = fdp;
l->l_fd = fdp;
}
if (!fdp->fd_exclose) {
return;
}
fdp->fd_exclose = false;
dt = atomic_load_consume(&fdp->fd_dt);
for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
if ((ff = dt->dt_ff[fd]) == NULL) {
KASSERT(fd >= NDFDFILE);
continue;
}
KASSERT(fd >= NDFDFILE ||
ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (ff->ff_file == NULL)
continue;
if (ff->ff_exclose) {
/*
* We need a reference to close the file.
* No other threads can see the fdfile_t at
* this point, so don't bother locking.
*/
KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
ff->ff_refcnt++;
fd_close(fd);
}
}
}
/*
* Set the descriptor owner. If the owner is a process, 'pgid' is set
* to a positive value, the process ID. If the owner is a process
* group, 'pgid' is set to -pg_id.
*/
int
fsetown(pid_t *pgid, u_long cmd, const void *data)
{
pid_t id = *(const pid_t *)data;
int error;
if (id == INT_MIN)
return EINVAL;
switch (cmd) {
case TIOCSPGRP:
if (id < 0)
return EINVAL;
id = -id;
break;
default:
break;
}
if (id > 0) {
mutex_enter(&proc_lock);
error = proc_find(id) ? 0 : ESRCH;
mutex_exit(&proc_lock);
} else if (id < 0) {
error = pgid_in_session(curproc, -id);
} else {
error = 0;
}
if (!error) {
*pgid = id;
}
return error;
}
void
fd_set_exclose(struct lwp *l, int fd, bool exclose)
{
filedesc_t *fdp = l->l_fd;
fdfile_t *ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
ff->ff_exclose = exclose;
	if (exclose)
		fdp->fd_exclose = true;
}
/*
 * Return descriptor owner information.  If the value is positive,
 * it is a process ID.  If it is negative, it is a process group ID
 * and the sign must be removed before use.
*/
int
fgetown(pid_t pgid, u_long cmd, void *data)
{
switch (cmd) {
case TIOCGPGRP:
*(int *)data = -pgid;
break;
default:
*(int *)data = pgid;
break;
}
return 0;
}
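/*
 * Example (sketch): a driver that supports descriptor ownership would
 * typically delegate the FIOSETOWN/FIOGETOWN and TIOCSPGRP/TIOCGPGRP
 * ioctls to fsetown()/fgetown(), keeping the signed pgid in its softc.
 * The names "sc" and "sc_pgid" below are hypothetical.
 *
 *	case FIOSETOWN:
 *	case TIOCSPGRP:
 *		return fsetown(&sc->sc_pgid, cmd, data);
 *	case FIOGETOWN:
 *	case TIOCGPGRP:
 *		return fgetown(sc->sc_pgid, cmd, data);
 */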
/*
* Send signal to descriptor owner, either process or process group.
*/
void
fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
{
ksiginfo_t ksi;
KASSERT(!cpu_intr_p());
if (pgid == 0) {
return;
}
KSI_INIT(&ksi);
ksi.ksi_signo = signo;
ksi.ksi_code = code;
ksi.ksi_band = band;
mutex_enter(&proc_lock);
if (pgid > 0) {
struct proc *p1;
p1 = proc_find(pgid);
if (p1 != NULL) {
kpsignal(p1, &ksi, fdescdata);
}
} else {
struct pgrp *pgrp;
KASSERT(pgid < 0);
pgrp = pgrp_find(-pgid);
if (pgrp != NULL) {
kpgsignal(pgrp, &ksi, fdescdata, 0);
}
}
mutex_exit(&proc_lock);
}
int
fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
void *data)
{
fdfile_t *ff;
filedesc_t *fdp;
fp->f_flag = flag & FMASK;
fdp = curproc->p_fd;
	ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
	KASSERT(ff != NULL);
ff->ff_exclose = (flag & O_CLOEXEC) != 0;
fp->f_type = DTYPE_MISC;
fp->f_ops = fops;
fp->f_data = data;
curlwp->l_dupfd = fd;
fd_affix(curproc, fp, fd);
return EMOVEFD;
}
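/*
 * Example (sketch): a hypothetical cloning device open routine would
 * normally allocate a file and descriptor with fd_allocfile() and then
 * hand them to fd_clone(); the EMOVEFD return value and l_dupfd tell
 * the open path to move that descriptor into the slot open(2) was
 * about to use.  "mydevopen", "mydev_fileops" and "mydev_create" are
 * hypothetical names.
 *
 *	int
 *	mydevopen(dev_t dev, int flags, int mode, struct lwp *l)
 *	{
 *		file_t *fp;
 *		int error, fd;
 *
 *		if ((error = fd_allocfile(&fp, &fd)) != 0)
 *			return error;
 *		return fd_clone(fp, fd, flags, &mydev_fileops,
 *		    mydev_create(dev));
 *	}
 */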
int
fnullop_fcntl(file_t *fp, u_int cmd, void *data)
{
if (cmd == F_SETFL)
return 0;
return EOPNOTSUPP;
}
int
fnullop_poll(file_t *fp, int which)
{
return 0;
}
int
fnullop_kqfilter(file_t *fp, struct knote *kn)
{
return EOPNOTSUPP;
}
void
fnullop_restart(file_t *fp)
{
}
int
fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
kauth_cred_t cred, int flags)
{
return EOPNOTSUPP;
}
int
fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
kauth_cred_t cred, int flags)
{
return EOPNOTSUPP;
}
int
fbadop_ioctl(file_t *fp, u_long com, void *data)
{
return EOPNOTSUPP;
}
int
fbadop_stat(file_t *fp, struct stat *sb)
{
return EOPNOTSUPP;
}
int
fbadop_close(file_t *fp)
{
return EOPNOTSUPP;
}
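/*
 * Example (sketch): the fnullop_ and fbadop_ routines above exist so a
 * fileops table only has to implement the operations it cares about.
 * A hypothetical table (the exact member set varies between releases;
 * "mydev_ioctl" and "mydev_close" are made-up handlers):
 *
 *	static const struct fileops mydev_fileops = {
 *		.fo_name = "mydev",
 *		.fo_read = fbadop_read,
 *		.fo_write = fbadop_write,
 *		.fo_ioctl = mydev_ioctl,
 *		.fo_fcntl = fnullop_fcntl,
 *		.fo_poll = fnullop_poll,
 *		.fo_stat = fbadop_stat,
 *		.fo_close = mydev_close,
 *		.fo_kqfilter = fnullop_kqfilter,
 *		.fo_restart = fnullop_restart,
 *	};
 */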
/*
* sysctl routines pertaining to file descriptors
*/
/* Initialized in sysctl_init() for now... */
extern kmutex_t sysctl_file_marker_lock;
static u_int sysctl_file_marker = 1;
/*
* Expects to be called with proc_lock and sysctl_file_marker_lock locked.
*/
static void
sysctl_file_marker_reset(void)
{
struct proc *p;
PROCLIST_FOREACH(p, &allproc) {
struct filedesc *fd = p->p_fd;
fdtab_t *dt;
u_int i;
mutex_enter(&fd->fd_lock);
dt = fd->fd_dt;
for (i = 0; i < dt->dt_nfiles; i++) {
struct file *fp;
fdfile_t *ff;
if ((ff = dt->dt_ff[i]) == NULL) {
continue;
}
if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
continue;
}
fp->f_marker = 0;
}
mutex_exit(&fd->fd_lock);
}
}
/*
* sysctl helper routine for kern.file pseudo-subtree.
*/
static int
sysctl_kern_file(SYSCTLFN_ARGS)
{
const bool allowaddr = get_expose_address(curproc);
struct filelist flist;
int error;
size_t buflen;
struct file *fp, fbuf;
char *start, *where;
struct proc *p;
start = where = oldp;
buflen = *oldlenp;
if (where == NULL) {
/*
* overestimate by 10 files
*/
*oldlenp = sizeof(filehead) + (nfiles + 10) *
sizeof(struct file);
return 0;
}
/*
* first sysctl_copyout filehead
*/
if (buflen < sizeof(filehead)) {
*oldlenp = 0;
return 0;
}
sysctl_unlock();
if (allowaddr) {
memcpy(&flist, &filehead, sizeof(flist));
} else {
memset(&flist, 0, sizeof(flist));
}
error = sysctl_copyout(l, &flist, where, sizeof(flist));
if (error) {
sysctl_relock();
return error;
}
buflen -= sizeof(flist);
where += sizeof(flist);
/*
* followed by an array of file structures
*/
mutex_enter(&sysctl_file_marker_lock);
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
struct filedesc *fd;
fdtab_t *dt;
u_int i;
if (p->p_stat == SIDL) {
/* skip embryonic processes */
continue;
}
mutex_enter(p->p_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
NULL, NULL);
mutex_exit(p->p_lock);
if (error != 0) {
/*
* Don't leak kauth retval if we're silently
* skipping this entry.
*/
error = 0;
continue;
}
/*
* Grab a hold on the process.
*/
if (!rw_tryenter(&p->p_reflock, RW_READER)) {
continue;
}
mutex_exit(&proc_lock);
fd = p->p_fd;
mutex_enter(&fd->fd_lock);
dt = fd->fd_dt;
for (i = 0; i < dt->dt_nfiles; i++) {
fdfile_t *ff;
if ((ff = dt->dt_ff[i]) == NULL) {
continue;
}
if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
continue;
}
mutex_enter(&fp->f_lock);
if ((fp->f_count == 0) ||
(fp->f_marker == sysctl_file_marker)) {
mutex_exit(&fp->f_lock);
continue;
}
/* Check that we have enough space. */
if (buflen < sizeof(struct file)) {
*oldlenp = where - start;
mutex_exit(&fp->f_lock);
error = ENOMEM;
break;
}
fill_file(&fbuf, fp);
mutex_exit(&fp->f_lock);
error = sysctl_copyout(l, &fbuf, where, sizeof(fbuf));
if (error) {
break;
}
buflen -= sizeof(struct file);
where += sizeof(struct file);
fp->f_marker = sysctl_file_marker;
}
mutex_exit(&fd->fd_lock);
/*
* Release reference to process.
*/
mutex_enter(&proc_lock);
rw_exit(&p->p_reflock);
if (error)
break;
}
sysctl_file_marker++;
/* Reset all markers if wrapped. */
if (sysctl_file_marker == 0) {
sysctl_file_marker_reset();
sysctl_file_marker++;
}
mutex_exit(&proc_lock);
mutex_exit(&sysctl_file_marker_lock);
*oldlenp = where - start;
sysctl_relock();
return error;
}
/*
* sysctl helper function for kern.file2
*/
static int
sysctl_kern_file2(SYSCTLFN_ARGS)
{
struct proc *p;
struct file *fp;
struct filedesc *fd;
struct kinfo_file kf;
char *dp;
u_int i, op;
size_t len, needed, elem_size, out_size;
int error, arg, elem_count;
fdfile_t *ff;
fdtab_t *dt;
if (namelen == 1 && name[0] == CTL_QUERY)
return sysctl_query(SYSCTLFN_CALL(rnode));
if (namelen != 4)
return EINVAL;
error = 0;
dp = oldp;
len = (oldp != NULL) ? *oldlenp : 0;
op = name[0];
arg = name[1];
elem_size = name[2];
elem_count = name[3];
out_size = MIN(sizeof(kf), elem_size);
needed = 0;
if (elem_size < 1 || elem_count < 0)
return EINVAL;
switch (op) {
case KERN_FILE_BYFILE:
case KERN_FILE_BYPID:
/*
* We're traversing the process list in both cases; the BYFILE
* case does additional work of keeping track of files already
* looked at.
*/
/* doesn't use arg so it must be zero */
if ((op == KERN_FILE_BYFILE) && (arg != 0))
return EINVAL;
if ((op == KERN_FILE_BYPID) && (arg < -1))
/* -1 means all processes */
return EINVAL;
sysctl_unlock();
if (op == KERN_FILE_BYFILE)
mutex_enter(&sysctl_file_marker_lock);
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
if (p->p_stat == SIDL) {
/* skip embryonic processes */
continue;
}
if (arg > 0 && p->p_pid != arg) {
/* pick only the one we want */
/* XXX want 0 to mean "kernel files" */
continue;
}
mutex_enter(p->p_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
NULL, NULL);
mutex_exit(p->p_lock);
if (error != 0) {
/*
* Don't leak kauth retval if we're silently
* skipping this entry.
*/
error = 0;
continue;
}
/*
* Grab a hold on the process.
*/
if (!rw_tryenter(&p->p_reflock, RW_READER)) {
continue;
}
mutex_exit(&proc_lock);
fd = p->p_fd;
mutex_enter(&fd->fd_lock);
dt = fd->fd_dt;
for (i = 0; i < dt->dt_nfiles; i++) {
if ((ff = dt->dt_ff[i]) == NULL) {
continue;
}
if ((fp = atomic_load_consume(&ff->ff_file)) ==
NULL) {
continue;
}
if ((op == KERN_FILE_BYFILE) &&
(fp->f_marker == sysctl_file_marker)) {
continue;
}
if (len >= elem_size && elem_count > 0) {
mutex_enter(&fp->f_lock);
fill_file2(&kf, fp, ff, i, p->p_pid);
mutex_exit(&fp->f_lock);
mutex_exit(&fd->fd_lock);
error = sysctl_copyout(l,
&kf, dp, out_size);
mutex_enter(&fd->fd_lock);
if (error)
break;
dp += elem_size;
len -= elem_size;
}
if (op == KERN_FILE_BYFILE)
fp->f_marker = sysctl_file_marker;
needed += elem_size;
if (elem_count > 0 && elem_count != INT_MAX)
elem_count--;
}
mutex_exit(&fd->fd_lock);
/*
* Release reference to process.
*/
mutex_enter(&proc_lock);
rw_exit(&p->p_reflock);
}
if (op == KERN_FILE_BYFILE) {
sysctl_file_marker++;
/* Reset all markers if wrapped. */
if (sysctl_file_marker == 0) {
sysctl_file_marker_reset();
sysctl_file_marker++;
}
}
mutex_exit(&proc_lock);
if (op == KERN_FILE_BYFILE)
mutex_exit(&sysctl_file_marker_lock);
sysctl_relock();
break;
default:
return EINVAL;
}
if (oldp == NULL)
needed += KERN_FILESLOP * elem_size;
*oldlenp = needed;
return error;
}
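/*
 * Example (userland sketch): kern.file2 is queried with a six-level
 * MIB whose last four components are the op, arg, elem_size and
 * elem_count decoded above; a NULL oldp probe returns the required
 * size (plus KERN_FILESLOP slop).  Assumes <sys/sysctl.h> and a
 * target pid; error handling omitted.
 *
 *	int mib[6] = { CTL_KERN, KERN_FILE2, KERN_FILE_BYPID, pid,
 *	    sizeof(struct kinfo_file), INT_MAX };
 *	size_t len = 0;
 *	struct kinfo_file *kf;
 *
 *	sysctl(mib, 6, NULL, &len, NULL, 0);
 *	kf = malloc(len);
 *	sysctl(mib, 6, kf, &len, NULL, 0);
 */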
static void
fill_file(struct file *fp, const struct file *fpsrc)
{
const bool allowaddr = get_expose_address(curproc);
memset(fp, 0, sizeof(*fp));
fp->f_offset = fpsrc->f_offset;
COND_SET_PTR(fp->f_cred, fpsrc->f_cred, allowaddr);
COND_SET_CPTR(fp->f_ops, fpsrc->f_ops, allowaddr);
COND_SET_STRUCT(fp->f_undata, fpsrc->f_undata, allowaddr);
COND_SET_STRUCT(fp->f_list, fpsrc->f_list, allowaddr);
fp->f_flag = fpsrc->f_flag;
fp->f_marker = fpsrc->f_marker;
fp->f_type = fpsrc->f_type;
fp->f_advice = fpsrc->f_advice;
fp->f_count = fpsrc->f_count;
fp->f_msgcount = fpsrc->f_msgcount;
fp->f_unpcount = fpsrc->f_unpcount;
COND_SET_STRUCT(fp->f_unplist, fpsrc->f_unplist, allowaddr);
}
static void
fill_file2(struct kinfo_file *kp, const file_t *fp, const fdfile_t *ff,
int i, pid_t pid)
{
const bool allowaddr = get_expose_address(curproc);
memset(kp, 0, sizeof(*kp));
COND_SET_VALUE(kp->ki_fileaddr, PTRTOUINT64(fp), allowaddr);
kp->ki_flag = fp->f_flag;
kp->ki_iflags = 0;
kp->ki_ftype = fp->f_type;
kp->ki_count = fp->f_count;
kp->ki_msgcount = fp->f_msgcount;
COND_SET_VALUE(kp->ki_fucred, PTRTOUINT64(fp->f_cred), allowaddr);
kp->ki_fuid = kauth_cred_geteuid(fp->f_cred);
kp->ki_fgid = kauth_cred_getegid(fp->f_cred);
COND_SET_VALUE(kp->ki_fops, PTRTOUINT64(fp->f_ops), allowaddr);
kp->ki_foffset = fp->f_offset;
COND_SET_VALUE(kp->ki_fdata, PTRTOUINT64(fp->f_data), allowaddr);
/* vnode information to glue this file to something */
if (fp->f_type == DTYPE_VNODE) {
struct vnode *vp = fp->f_vnode;
COND_SET_VALUE(kp->ki_vun, PTRTOUINT64(vp->v_un.vu_socket),
allowaddr);
kp->ki_vsize = vp->v_size;
kp->ki_vtype = vp->v_type;
kp->ki_vtag = vp->v_tag;
COND_SET_VALUE(kp->ki_vdata, PTRTOUINT64(vp->v_data),
allowaddr);
}
/* process information when retrieved via KERN_FILE_BYPID */
if (ff != NULL) {
kp->ki_pid = pid;
kp->ki_fd = i;
kp->ki_ofileflags = ff->ff_exclose;
kp->ki_usecount = ff->ff_refcnt;
}
}
/* $NetBSD: scsi_base.c,v 1.93 2019/05/03 16:06:56 mlelstv Exp $ */
/*-
* Copyright (c) 1998, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scsi_base.c,v 1.93 2019/05/03 16:06:56 mlelstv Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/errno.h>
#include <sys/device.h>
#include <sys/proc.h>
#include <dev/scsipi/scsipi_all.h>
#include <dev/scsipi/scsi_all.h>
#include <dev/scsipi/scsi_disk.h>
#include <dev/scsipi/scsiconf.h>
#include <dev/scsipi/scsipi_base.h>
static void scsi_print_xfer_mode(struct scsipi_periph *);
/*
* Do a scsi operation, asking a device to run as SCSI-II if it can.
*/
int
scsi_change_def(struct scsipi_periph *periph, int flags)
{
struct scsi_changedef cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_CHANGE_DEFINITION;
cmd.how = SC_SCSI_2;
return (scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0,
SCSIPIRETRIES, 100000, NULL, flags));
}
/*
* ask the scsi driver to perform a command for us.
* tell it where to read/write the data, and how
* long the data is supposed to be. If we have a buf
* to associate with the transfer, we need that too.
*/
void
scsi_scsipi_cmd(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
SC_DEBUG(periph, SCSIPI_DB2, ("scsi_scsipi_cmd\n"));
/*
* Set the LUN in the CDB if we have an older device. We also
* set it for more modern SCSI-2 devices "just in case".
*/
if (periph->periph_version <= 2)
xs->cmd->bytes[0] |=
((periph->periph_lun << SCSI_CMD_LUN_SHIFT) &
SCSI_CMD_LUN_MASK);
}
/*
* Utility routines often used in SCSI stuff
*/
/*
* Print out the periph's address info.
*/
void
scsi_print_addr(struct scsipi_periph *periph)
{
struct scsipi_channel *chan = periph->periph_channel;
struct scsipi_adapter *adapt = chan->chan_adapter;
printf("%s(%s:%d:%d:%d): ", periph->periph_dev != NULL ?
device_xname(periph->periph_dev) : "probe",
device_xname(adapt->adapt_dev),
chan->chan_channel, periph->periph_target,
periph->periph_lun);
}
/*
* Kill off all pending xfers for a periph.
*
* Must be called with channel lock held
*/
void
scsi_kill_pending(struct scsipi_periph *periph)
{
struct scsipi_xfer *xs;
TAILQ_FOREACH(xs, &periph->periph_xferq, device_q) {
callout_stop(&xs->xs_callout);
scsi_print_addr(periph);
printf("killed ");
scsipi_print_cdb(xs->cmd);
xs->error = XS_DRIVER_STUFFUP;
scsipi_done(xs);
}
}
/*
* scsi_print_xfer_mode:
*
* Print a parallel SCSI periph's capabilities.
*/
static void
scsi_print_xfer_mode(struct scsipi_periph *periph)
{
struct scsipi_channel *chan = periph->periph_channel;
struct scsipi_adapter *adapt = chan->chan_adapter;
int period, freq, speed, mbs;
if (periph->periph_dev)
aprint_normal_dev(periph->periph_dev, "");
else
aprint_normal("probe(%s:%d:%d:%d): ",
device_xname(adapt->adapt_dev),
chan->chan_channel, periph->periph_target,
periph->periph_lun);
if (periph->periph_mode & (PERIPH_CAP_SYNC | PERIPH_CAP_DT)) {
period = scsipi_sync_factor_to_period(periph->periph_period);
aprint_normal("sync (%d.%02dns offset %d)",
period / 100, period % 100, periph->periph_offset);
} else
aprint_normal("async");
if (periph->periph_mode & PERIPH_CAP_WIDE32)
aprint_normal(", 32-bit");
else if (periph->periph_mode & (PERIPH_CAP_WIDE16 | PERIPH_CAP_DT))
aprint_normal(", 16-bit");
else
aprint_normal(", 8-bit");
if (periph->periph_mode & (PERIPH_CAP_SYNC | PERIPH_CAP_DT)) {
freq = scsipi_sync_factor_to_freq(periph->periph_period);
speed = freq;
if (periph->periph_mode & PERIPH_CAP_WIDE32)
speed *= 4;
else if (periph->periph_mode &
(PERIPH_CAP_WIDE16 | PERIPH_CAP_DT))
speed *= 2;
mbs = speed / 1000;
if (mbs > 0) {
aprint_normal(" (%d.%03dMB/s)", mbs,
speed % 1000);
} else
aprint_normal(" (%dKB/s)", speed % 1000);
}
aprint_normal(" transfers");
if (periph->periph_mode & PERIPH_CAP_TQING)
aprint_normal(", tagged queueing");
aprint_normal("\n");
}
/*
* scsi_async_event_xfer_mode:
*
* Update the xfer mode for all parallel SCSI periphs sharing the
* specified I_T Nexus.
*/
void
scsi_async_event_xfer_mode(struct scsipi_channel *chan, void *arg)
{
struct scsipi_xfer_mode *xm = arg;
struct scsipi_periph *periph;
int lun, announce, mode, period, offset;
for (lun = 0; lun < chan->chan_nluns; lun++) {
periph = scsipi_lookup_periph_locked(chan, xm->xm_target, lun);
if (periph == NULL)
continue;
announce = 0;
/*
* Clamp the xfer mode down to this periph's capabilities.
*/
mode = xm->xm_mode & periph->periph_cap;
if (mode & PERIPH_CAP_SYNC) {
period = xm->xm_period;
offset = xm->xm_offset;
} else {
period = 0;
offset = 0;
}
/*
* If we do not have a valid xfer mode yet, or the parameters
* are different, announce them.
*/
if ((periph->periph_flags & PERIPH_MODE_VALID) == 0 ||
periph->periph_mode != mode ||
periph->periph_period != period ||
periph->periph_offset != offset)
announce = 1;
periph->periph_mode = mode;
periph->periph_period = period;
periph->periph_offset = offset;
periph->periph_flags |= PERIPH_MODE_VALID;
if (announce)
scsi_print_xfer_mode(periph);
}
}
/*
* scsipi_async_event_xfer_mode:
*
* Update the xfer mode for all SAS/FC periphs sharing the
* specified I_T Nexus.
*/
void
scsi_fc_sas_async_event_xfer_mode(struct scsipi_channel *chan, void *arg)
{
struct scsipi_xfer_mode *xm = arg;
struct scsipi_periph *periph;
int lun, announce, mode;
for (lun = 0; lun < chan->chan_nluns; lun++) {
periph = scsipi_lookup_periph_locked(chan, xm->xm_target, lun);
if (periph == NULL)
continue;
announce = 0;
/*
* Clamp the xfer mode down to this periph's capabilities.
*/
mode = xm->xm_mode & periph->periph_cap;
/*
* If we do not have a valid xfer mode yet, or the parameters
* are different, announce them.
*/
if ((periph->periph_flags & PERIPH_MODE_VALID) == 0 ||
periph->periph_mode != mode)
announce = 1;
periph->periph_mode = mode;
periph->periph_flags |= PERIPH_MODE_VALID;
if (announce &&
(periph->periph_mode & PERIPH_CAP_TQING) != 0) {
aprint_normal_dev(periph->periph_dev,
"tagged queueing\n");
}
}
}
/* $NetBSD: subr_prof.c,v 1.50 2021/08/14 17:51:20 ryo Exp $ */
/*-
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_prof.c 8.4 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_prof.c,v 1.50 2021/08/14 17:51:20 ryo Exp $");
#ifdef _KERNEL_OPT
#include "opt_gprof.h"
#include "opt_multiprocessor.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/cpu.h>
#ifdef GPROF
#include <sys/malloc.h>
#include <sys/gmon.h>
#include <sys/xcall.h>
MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer");
static int sysctl_kern_profiling(SYSCTLFN_ARGS);
#ifdef MULTIPROCESSOR
void _gmonparam_merge(struct gmonparam *, struct gmonparam *);
#endif
/*
* Froms is actually a bunch of unsigned shorts indexing tos
*/
struct gmonparam _gmonparam = { .state = GMON_PROF_OFF };
/* Actual start of the kernel text segment. */
extern char kernel_text[];
extern char etext[];
void
kmstartup(void)
{
char *cp;
struct gmonparam *p = &_gmonparam;
unsigned long size;
/*
* Round lowpc and highpc to multiples of the density we're using
* so the rest of the scaling (here and in gprof) stays in ints.
*/
p->lowpc = rounddown(((u_long)kernel_text),
HISTFRACTION * sizeof(HISTCOUNTER));
p->highpc = roundup((u_long)etext,
HISTFRACTION * sizeof(HISTCOUNTER));
p->textsize = p->highpc - p->lowpc;
printf("Profiling kernel, textsize=%ld [%lx..%lx]\n",
p->textsize, p->lowpc, p->highpc);
p->kcountsize = p->textsize / HISTFRACTION;
p->hashfraction = HASHFRACTION;
p->fromssize = p->textsize / HASHFRACTION;
p->tolimit = p->textsize * ARCDENSITY / 100;
if (p->tolimit < MINARCS)
p->tolimit = MINARCS;
else if (p->tolimit > MAXARCS)
p->tolimit = MAXARCS;
p->tossize = p->tolimit * sizeof(struct tostruct);
size = p->kcountsize + p->fromssize + p->tossize;
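	/*
	 * Each buffer allocated below lays the three arrays out back to
	 * back: tos first, then kcount, then froms.  In the
	 * MULTIPROCESSOR case every per-CPU buffer is additionally
	 * prefixed by its own struct gmonparam:
	 *
	 *	[gmonparam][tos: tossize][kcount: kcountsize][froms: fromssize]
	 */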
#ifdef MULTIPROCESSOR
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
p = malloc(sizeof(struct gmonparam) + size, M_GPROF,
M_NOWAIT | M_ZERO);
if (p == NULL) {
printf("No memory for profiling on %s\n",
cpu_name(ci));
/* cannot profile on this cpu */
continue;
}
memcpy(p, &_gmonparam, sizeof(_gmonparam));
ci->ci_gmon = p;
/*
* To allow profiling to be controlled only by the global
* _gmonparam.state, set the default value for each CPU to
* GMON_PROF_ON. If _gmonparam.state is not ON, mcount will
* not be executed.
		 * This is for compatibility with the kgmon(8) kmem interface.
*/
p->state = GMON_PROF_ON;
cp = (char *)(p + 1);
p->tos = (struct tostruct *)cp;
p->kcount = (u_short *)(cp + p->tossize);
p->froms = (u_short *)(cp + p->tossize + p->kcountsize);
}
sysctl_createv(NULL, 0, NULL, NULL,
0, CTLTYPE_NODE, "percpu",
SYSCTL_DESCR("per cpu profiling information"),
NULL, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, CTL_EOL);
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
sysctl_createv(NULL, 0, NULL, NULL,
0, CTLTYPE_NODE, cpu_name(ci),
NULL,
NULL, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci), CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_INT, "state",
SYSCTL_DESCR("Profiling state"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_STATE, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_STRUCT, "count",
SYSCTL_DESCR("Array of statistical program counters"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_COUNT, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_STRUCT, "froms",
SYSCTL_DESCR("Array indexed by program counter of "
"call-from points"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_FROMS, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_STRUCT, "tos",
SYSCTL_DESCR("Array of structures describing "
"destination of calls and their counts"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_TOS, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_STRUCT, "gmonparam",
SYSCTL_DESCR("Structure giving the sizes of the above "
"arrays"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_GMONPARAM, CTL_EOL);
}
/*
	 * For minimal compatibility with the kgmon(8) kmem interface,
	 * _gmonparam and cpu0's ci_gmon share buffers.
*/
p = curcpu()->ci_gmon;
if (p != NULL) {
_gmonparam.tos = p->tos;
_gmonparam.kcount = p->kcount;
_gmonparam.froms = p->froms;
}
#else /* MULTIPROCESSOR */
cp = malloc(size, M_GPROF, M_NOWAIT | M_ZERO);
	if (cp == NULL) {
printf("No memory for profiling.\n");
return;
}
p->tos = (struct tostruct *)cp;
cp += p->tossize;
p->kcount = (u_short *)cp;
cp += p->kcountsize;
p->froms = (u_short *)cp;
#endif /* MULTIPROCESSOR */
}
#ifdef MULTIPROCESSOR
static void
prof_set_state_xc(void *arg1, void *arg2 __unused)
{
int state = PTRTOUINT64(arg1);
struct gmonparam *gp = curcpu()->ci_gmon;
if (gp != NULL)
gp->state = state;
}
#endif /* MULTIPROCESSOR */
/*
 * sysctl helper routine for the kern.profiling subtree: returns kernel
 * profiling information, and enables/disables kernel profiling and
 * hands out copies of the profiling data.
 */
static int
sysctl_kern_profiling(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
struct gmonparam *gp;
int error;
#ifdef MULTIPROCESSOR
CPU_INFO_ITERATOR cii;
struct cpu_info *ci, *target_ci;
uint64_t where;
int state;
bool prof_on, do_merge;
target_ci = (struct cpu_info *)rnode->sysctl_data;
do_merge = (oldp != NULL) && (target_ci == NULL) &&
((node.sysctl_num == GPROF_COUNT) ||
(node.sysctl_num == GPROF_FROMS) ||
(node.sysctl_num == GPROF_TOS));
if (do_merge) {
/* kern.profiling.{count,froms,tos} */
unsigned long size;
char *cp;
/* allocate temporary gmonparam, and merge results of all CPU */
size = _gmonparam.kcountsize + _gmonparam.fromssize +
_gmonparam.tossize;
gp = malloc(sizeof(struct gmonparam) + size, M_GPROF,
M_NOWAIT | M_ZERO);
if (gp == NULL)
return ENOMEM;
memcpy(gp, &_gmonparam, sizeof(_gmonparam));
cp = (char *)(gp + 1);
gp->tos = (struct tostruct *)cp;
gp->kcount = (u_short *)(cp + gp->tossize);
gp->froms = (u_short *)(cp + gp->tossize + gp->kcountsize);
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
_gmonparam_merge(gp, ci->ci_gmon);
}
} else if (target_ci != NULL) {
/* kern.profiling.percpu.* */
gp = target_ci->ci_gmon;
} else {
/* kern.profiling.{state,gmonparam} */
gp = &_gmonparam;
}
#else /* MULTIPROCESSOR */
gp = &_gmonparam;
#endif
switch (node.sysctl_num) {
case GPROF_STATE:
#ifdef MULTIPROCESSOR
/*
* if _gmonparam.state is OFF, the state of each CPU is
* considered to be OFF, even if it is actually ON.
*/
if (_gmonparam.state == GMON_PROF_OFF ||
gp->state == GMON_PROF_OFF)
state = GMON_PROF_OFF;
else
state = GMON_PROF_ON;
node.sysctl_data = &state;
#else
node.sysctl_data = &gp->state;
#endif
break;
case GPROF_COUNT:
node.sysctl_data = gp->kcount;
node.sysctl_size = gp->kcountsize;
break;
case GPROF_FROMS:
node.sysctl_data = gp->froms;
node.sysctl_size = gp->fromssize;
break;
case GPROF_TOS:
node.sysctl_data = gp->tos;
node.sysctl_size = gp->tossize;
break;
case GPROF_GMONPARAM:
node.sysctl_data = gp;
node.sysctl_size = sizeof(*gp);
break;
default:
return (EOPNOTSUPP);
}
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
goto done;
#ifdef MULTIPROCESSOR
switch (node.sysctl_num) {
case GPROF_STATE:
if (target_ci != NULL) {
where = xc_unicast(0, prof_set_state_xc,
UINT64TOPTR(state), NULL, target_ci);
xc_wait(where);
			/* If any CPU is being profiled, enable the profiling clock. */
prof_on = false;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
if (ci->ci_gmon->state != GMON_PROF_OFF) {
prof_on = true;
break;
}
}
mutex_spin_enter(&proc0.p_stmutex);
if (prof_on)
startprofclock(&proc0);
else
stopprofclock(&proc0);
mutex_spin_exit(&proc0.p_stmutex);
if (prof_on) {
_gmonparam.state = GMON_PROF_ON;
} else {
_gmonparam.state = GMON_PROF_OFF;
/*
				 * When _gmonparam.state and all per-CPU gmon
				 * states are OFF, set every CPU state back to
				 * ON so that profiling of all CPUs can again
				 * be controlled by _gmonparam.state alone.
*/
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
ci->ci_gmon->state = GMON_PROF_ON;
}
}
} else {
_gmonparam.state = state;
where = xc_broadcast(0, prof_set_state_xc,
UINT64TOPTR(state), NULL);
xc_wait(where);
mutex_spin_enter(&proc0.p_stmutex);
if (state == GMON_PROF_OFF)
stopprofclock(&proc0);
else
startprofclock(&proc0);
mutex_spin_exit(&proc0.p_stmutex);
}
break;
case GPROF_COUNT:
/*
* if 'kern.profiling.{count,froms,tos}' is written, the same
* data will be written to 'kern.profiling.percpu.cpuN.xxx'
*/
if (target_ci == NULL) {
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
memmove(ci->ci_gmon->kcount, gp->kcount,
newlen);
}
}
break;
case GPROF_FROMS:
if (target_ci == NULL) {
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
memmove(ci->ci_gmon->froms, gp->froms, newlen);
}
}
break;
case GPROF_TOS:
if (target_ci == NULL) {
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
memmove(ci->ci_gmon->tos, gp->tos, newlen);
}
}
break;
}
#else
if (node.sysctl_num == GPROF_STATE) {
mutex_spin_enter(&proc0.p_stmutex);
if (gp->state == GMON_PROF_OFF)
stopprofclock(&proc0);
else
startprofclock(&proc0);
mutex_spin_exit(&proc0.p_stmutex);
}
#endif
done:
#ifdef MULTIPROCESSOR
if (do_merge)
free(gp, M_GPROF);
#endif
return error;
}
SYSCTL_SETUP(sysctl_kern_gprof_setup, "sysctl kern.profiling subtree setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "profiling",
SYSCTL_DESCR("Profiling information (available)"),
NULL, 0, NULL, 0,
CTL_KERN, KERN_PROF, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "state",
SYSCTL_DESCR("Profiling state"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_STATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT, "count",
SYSCTL_DESCR("Array of statistical program counters"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_COUNT, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT, "froms",
SYSCTL_DESCR("Array indexed by program counter of "
"call-from points"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_FROMS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT, "tos",
SYSCTL_DESCR("Array of structures describing "
"destination of calls and their counts"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_TOS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "gmonparam",
SYSCTL_DESCR("Structure giving the sizes of the above "
"arrays"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_GMONPARAM, CTL_EOL);
}
#endif /* GPROF */
/*
* Profiling system call.
*
* The scale factor is a fixed point number with 16 bits of fraction, so that
* 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling.
*/
/* ARGSUSED */
int
sys_profil(struct lwp *l, const struct sys_profil_args *uap, register_t *retval)
{
/* {
syscallarg(char *) samples;
syscallarg(size_t) size;
syscallarg(u_long) offset;
syscallarg(u_int) scale;
} */
struct proc *p = l->l_proc;
struct uprof *upp;
if (SCARG(uap, scale) > (1 << 16))
return (EINVAL);
if (SCARG(uap, scale) == 0) {
mutex_spin_enter(&p->p_stmutex);
stopprofclock(p);
mutex_spin_exit(&p->p_stmutex);
return (0);
}
upp = &p->p_stats->p_prof;
/* Block profile interrupts while changing state. */
mutex_spin_enter(&p->p_stmutex);
upp->pr_off = SCARG(uap, offset);
upp->pr_scale = SCARG(uap, scale);
upp->pr_base = SCARG(uap, samples);
upp->pr_size = SCARG(uap, size);
startprofclock(p);
mutex_spin_exit(&p->p_stmutex);
return (0);
}
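/*
 * Example (userland sketch): a process turns statistical profiling on
 * by handing the kernel a sample buffer, the address its text starts
 * at and a scale, and turns it off again with a zero scale.
 * "text_start" and BUFLEN are hypothetical.
 *
 *	u_short buf[BUFLEN];
 *
 *	profil((char *)buf, sizeof(buf), text_start, 0x8000);
 *	...
 *	profil(NULL, 0, 0, 0);
 */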
/*
* Scale is a fixed-point number with the binary point 16 bits
* into the value, and is <= 1.0. pc is at most 32 bits, so the
* intermediate result is at most 48 bits.
*/
#define PC_TO_INDEX(pc, prof) \
((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
(u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
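/*
 * Worked example: with pr_scale = 0x8000 (i.e. 0.5), a pc lying 0x1000
 * bytes past pr_off maps to ((0x1000 * 0x8000) >> 16) & ~1 = 0x800, an
 * even byte offset into pr_base; each 16-bit counter therefore covers
 * four bytes of text at that scale.
 */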
/*
* Collect user-level profiling statistics; called on a profiling tick,
* when a process is running in user-mode. This routine may be called
* from an interrupt context. We schedule an AST that will vector us
* to trap() with a context in which copyin and copyout will work.
* Trap will then call addupc_task().
*
* XXX We could use ufetch/ustore here if the profile buffers were
* wired.
*
* Note that we may (rarely) not get around to the AST soon enough, and
* lose profile ticks when the next tick overwrites this one, but in this
* case the system is overloaded and the profile is probably already
* inaccurate.
*/
void
addupc_intr(struct lwp *l, u_long pc)
{
struct uprof *prof;
struct proc *p;
u_int i;
p = l->l_proc;
KASSERT(mutex_owned(&p->p_stmutex));
prof = &p->p_stats->p_prof;
if (pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
return; /* out of range; ignore */
mutex_spin_exit(&p->p_stmutex);
/* XXXSMP */
prof->pr_addr = pc;
prof->pr_ticks++;
cpu_need_proftick(l);
mutex_spin_enter(&p->p_stmutex);
}
/*
* Much like before, but we can afford to take faults here. If the
* update fails, we simply turn off profiling.
*/
void
addupc_task(struct lwp *l, u_long pc, u_int ticks)
{
struct uprof *prof;
struct proc *p;
void *addr;
int error;
u_int i;
u_short v;
p = l->l_proc;
if (ticks == 0)
return;
mutex_spin_enter(&p->p_stmutex);
prof = &p->p_stats->p_prof;
/* Testing P_PROFIL may be unnecessary, but is certainly safe. */
if ((p->p_stflag & PST_PROFIL) == 0 || pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
mutex_spin_exit(&p->p_stmutex);
return;
}
addr = prof->pr_base + i;
mutex_spin_exit(&p->p_stmutex);
if ((error = copyin(addr, (void *)&v, sizeof(v))) == 0) {
v += ticks;
error = copyout((void *)&v, addr, sizeof(v));
}
if (error != 0) {
mutex_spin_enter(&p->p_stmutex);
stopprofclock(p);
mutex_spin_exit(&p->p_stmutex);
}
}
/* $NetBSD: ip6_mroute.c,v 1.132 2020/06/12 11:04:45 roy Exp $ */
/* $KAME: ip6_mroute.c,v 1.49 2001/07/25 09:21:18 jinmei Exp $ */
/*
* Copyright (C) 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp */
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
*/
/*
* Copyright (c) 1989 Stephen Deering
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
*/
/*
* IP multicast forwarding procedures
*
* Written by David Waitzman, BBN Labs, August 1988.
* Modified by Steve Deering, Stanford, February 1989.
* Modified by Mark J. Steiglitz, Stanford, May, 1991
* Modified by Van Jacobson, LBL, January 1993
* Modified by Ajit Thyagarajan, PARC, August 1993
* Modified by Bill Fenner, PARC, April 1994
*
* MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip6_mroute.c,v 1.132 2020/06/12 11:04:45 roy Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_mrouting.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/ioctl.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/route.h>
#include <net/raw_cb.h>
#include <net/net_stats.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/icmp6.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/scope6_var.h>
#include <netinet6/pim6.h>
#include <netinet6/pim6_var.h>
#include <netinet6/nd6.h>
static int ip6_mdq(struct mbuf *, struct ifnet *, struct mf6c *);
static void phyint_send(struct ip6_hdr *, struct mif6 *, struct mbuf *);
static int set_pim6(int *);
static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in6 *);
static int register_send(struct ip6_hdr *, struct mif6 *, struct mbuf *);
/*
* Globals. All but ip6_mrouter, ip6_mrtproto and mrt6stat could be static,
* except for netstat or debugging purposes.
*/
struct socket *ip6_mrouter = NULL;
int ip6_mrouter_ver = 0;
int ip6_mrtproto = IPPROTO_PIM; /* for netstat only */
struct mrt6stat mrt6stat;
#define NO_RTE_FOUND 0x1
#define RTE_FOUND 0x2
struct mf6c *mf6ctable[MF6CTBLSIZ];
u_char n6expire[MF6CTBLSIZ];
struct mif6 mif6table[MAXMIFS];
#ifdef MRT6DEBUG
u_int mrt6debug = 0; /* debug level */
#define DEBUG_MFC 0x02
#define DEBUG_FORWARD 0x04
#define DEBUG_EXPIRE 0x08
#define DEBUG_XMIT 0x10
#define DEBUG_REG 0x20
#define DEBUG_PIM 0x40
#define __mrt6debugused /* empty */
#else
#define __mrt6debugused __unused
#endif
static void expire_upcalls(void *);
#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */
#define UPCALL_EXPIRE 6 /* number of timeouts */
#ifdef INET
#ifdef MROUTING
extern struct socket *ip_mrouter;
#endif
#endif
/*
* 'Interfaces' associated with decapsulator (so we can tell
* packets that went through it from ones that get reflected
* by a broken gateway). These interfaces are never linked into
* the system ifnet list & no routes point to them. I.e., packets
* can't be sent this way. They only exist as a placeholder for
* multicast source verification.
*/
struct ifnet multicast_register_if6;
#define ENCAP_HOPS 64
/*
* Private variables.
*/
static mifi_t nummifs = 0;
static mifi_t reg_mif_num = (mifi_t)-1;
static percpu_t *pim6stat_percpu;
#define PIM6_STATINC(x) _NET_STATINC(pim6stat_percpu, x)
static int pim6;
/*
* Hash function for a source, group entry
*/
#define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \
(a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \
(g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \
(g).s6_addr32[2] ^ (g).s6_addr32[3])
/*
 * Find a route for a given origin IPv6 address and multicast group address.
 * A quality-of-service parameter may be added in the future.
*/
#define MF6CFIND(o, g, rt) do { \
struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \
rt = NULL; \
mrt6stat.mrt6s_mfc_lookups++; \
while (_rt) { \
if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \
IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \
(_rt->mf6c_stall == NULL)) { \
rt = _rt; \
break; \
} \
_rt = _rt->mf6c_next; \
} \
if (rt == NULL) { \
mrt6stat.mrt6s_mfc_misses++; \
} \
} while (/*CONSTCOND*/ 0)
/*
* Macros to compute elapsed time efficiently
* Borrowed from Van Jacobson's scheduling code
*/
#define TV_DELTA(a, b, delta) do { \
int xxs; \
\
delta = (a).tv_usec - (b).tv_usec; \
if ((xxs = (a).tv_sec - (b).tv_sec)) { \
switch (xxs) { \
case 2: \
delta += 1000000; \
/* FALLTHROUGH */ \
case 1: \
delta += 1000000; \
break; \
default: \
delta += (1000000 * xxs); \
} \
} \
} while (/*CONSTCOND*/ 0)
#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
(a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
#ifdef UPCALL_TIMING
#define UPCALL_MAX 50
u_long upcall_data[UPCALL_MAX + 1];
static void collate();
#endif /* UPCALL_TIMING */
static int get_sg_cnt(struct sioc_sg_req6 *);
static int get_mif6_cnt(struct sioc_mif_req6 *);
static int ip6_mrouter_init(struct socket *, int, int);
static int add_m6if(struct mif6ctl *);
static int del_m6if(mifi_t *);
static int add_m6fc(struct mf6cctl *);
static int del_m6fc(struct mf6cctl *);
static void sysctl_net_inet6_pim6_setup(struct sysctllog **);
static callout_t expire_upcalls_ch;
void
pim6_init(void)
{
sysctl_net_inet6_pim6_setup(NULL);
pim6stat_percpu = percpu_alloc(sizeof(uint64_t) * PIM6_NSTATS);
}
/*
* Handle MRT setsockopt commands to modify the multicast routing tables.
*/
int
ip6_mrouter_set(struct socket *so, struct sockopt *sopt)
{
int error, optval;
struct mif6ctl mifc;
struct mf6cctl mfcc;
mifi_t mifi;
if (sopt->sopt_name != MRT6_INIT && so != ip6_mrouter)
return (EACCES);
error = 0;
switch (sopt->sopt_name) {
#ifdef MRT6_OINIT
case MRT6_OINIT:
#endif
case MRT6_INIT:
error = sockopt_getint(sopt, &optval);
if (error)
break;
return (ip6_mrouter_init(so, optval, sopt->sopt_name));
case MRT6_DONE:
return (ip6_mrouter_done());
case MRT6_ADD_MIF:
error = sockopt_get(sopt, &mifc, sizeof(mifc));
if (error)
break;
return (add_m6if(&mifc));
case MRT6_DEL_MIF:
error = sockopt_get(sopt, &mifi, sizeof(mifi));
if (error)
break;
return (del_m6if(&mifi));
case MRT6_ADD_MFC:
error = sockopt_get(sopt, &mfcc, sizeof(mfcc));
if (error)
break;
return (add_m6fc(&mfcc));
case MRT6_DEL_MFC:
error = sockopt_get(sopt, &mfcc, sizeof(mfcc));
if (error)
break;
return (del_m6fc(&mfcc));
case MRT6_PIM:
error = sockopt_getint(sopt, &optval);
if (error)
break;
return (set_pim6(&optval));
default:
error = EOPNOTSUPP;
}
return (error);
}
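/*
 * Example (userland sketch): an IPv6 multicast routing daemon drives
 * this interface with setsockopt(2) on a raw ICMPv6 socket, roughly as
 * follows ("s", "mifc" and "mfcc" are hypothetical, and their fields
 * must be filled in first):
 *
 *	int s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
 *	int v = 1;
 *
 *	setsockopt(s, IPPROTO_IPV6, MRT6_INIT, &v, sizeof(v));
 *	setsockopt(s, IPPROTO_IPV6, MRT6_ADD_MIF, &mifc, sizeof(mifc));
 *	setsockopt(s, IPPROTO_IPV6, MRT6_ADD_MFC, &mfcc, sizeof(mfcc));
 *	...
 *	setsockopt(s, IPPROTO_IPV6, MRT6_DONE, NULL, 0);
 */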
/*
* Handle MRT getsockopt commands
*/
int
ip6_mrouter_get(struct socket *so, struct sockopt *sopt)
{
int error;
if (so != ip6_mrouter)
return EACCES;
error = 0;
switch (sopt->sopt_name) {
case MRT6_PIM:
error = sockopt_set(sopt, &pim6, sizeof(pim6));
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
/*
* Handle ioctl commands to obtain information from the cache
*/
int
mrt6_ioctl(u_long cmd, void *data)
{
switch (cmd) {
case SIOCGETSGCNT_IN6:
return (get_sg_cnt((struct sioc_sg_req6 *)data));
case SIOCGETMIFCNT_IN6:
return (get_mif6_cnt((struct sioc_mif_req6 *)data));
default:
return (EINVAL);
}
}
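/*
 * Example (userland sketch): the per-flow counters can be read back
 * over the same socket with ioctl(2) after filling in the source and
 * group ("s", "source_sin6" and "group_sin6" are hypothetical):
 *
 *	struct sioc_sg_req6 req;
 *
 *	memset(&req, 0, sizeof(req));
 *	req.src = source_sin6;
 *	req.grp = group_sin6;
 *	ioctl(s, SIOCGETSGCNT_IN6, &req);
 */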
/*
 * Return the packet, byte, and rpf-failure counts for the source/group provided
*/
static int
get_sg_cnt(struct sioc_sg_req6 *req)
{
struct mf6c *rt;
int s;
s = splsoftnet();
MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt);
splx(s);
if (rt != NULL) {
req->pktcnt = rt->mf6c_pkt_cnt;
req->bytecnt = rt->mf6c_byte_cnt;
req->wrong_if = rt->mf6c_wrong_if;
} else
return (ESRCH);
#if 0
req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
#endif
return 0;
}
/*
 * Return the input and output packet and byte counts for the mif provided
*/
static int
get_mif6_cnt(struct sioc_mif_req6 *req)
{
mifi_t mifi = req->mifi;
if (mifi >= nummifs)
return EINVAL;
req->icount = mif6table[mifi].m6_pkt_in;
req->ocount = mif6table[mifi].m6_pkt_out;
req->ibytes = mif6table[mifi].m6_bytes_in;
req->obytes = mif6table[mifi].m6_bytes_out;
return 0;
}
static int
set_pim6(int *i)
{
if ((*i != 1) && (*i != 0))
return EINVAL;
pim6 = *i;
return 0;
}
/*
* Enable multicast routing
*/
static int
ip6_mrouter_init(struct socket *so, int v, int cmd)
{
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG,
"ip6_mrouter_init: so_type = %d, pr_protocol = %d\n",
so->so_type, so->so_proto->pr_protocol);
#endif
if (so->so_type != SOCK_RAW ||
so->so_proto->pr_protocol != IPPROTO_ICMPV6)
return EOPNOTSUPP;
if (v != 1)
return ENOPROTOOPT;
if (ip6_mrouter != NULL)
return EADDRINUSE;
ip6_mrouter = so;
ip6_mrouter_ver = cmd;
memset((void *)mf6ctable, 0, sizeof(mf6ctable));
memset((void *)n6expire, 0, sizeof(n6expire));
	pim6 = 0;	/* used for stubbing out/in pim stuff */
callout_init(&expire_upcalls_ch, CALLOUT_MPSAFE);
callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
expire_upcalls, NULL);
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG, "ip6_mrouter_init\n");
#endif
return 0;
}
/*
* Disable multicast routing
*/
int
ip6_mrouter_done(void)
{
mifi_t mifi;
int i;
struct ifnet *ifp;
struct sockaddr_in6 sin6;
struct mf6c *rt;
struct rtdetq *rte;
int s;
s = splsoftnet();
/*
* For each phyint in use, disable promiscuous reception of all IPv6
* multicasts.
*/
#ifdef INET
#ifdef MROUTING
/*
	 * If an IPv4 multicast routing daemon is still running, we keep
	 * the interfaces receiving all multicast packets.
* XXX: there may be an interface in which the IPv4 multicast
* daemon is not interested...
*/
if (!ip_mrouter)
#endif
#endif
{
		for (mifi = 0; mifi < nummifs; mifi++) {
			if (mif6table[mifi].m6_ifp &&
			    !(mif6table[mifi].m6_flags & MIFF_REGISTER)) {
ifp = mif6table[mifi].m6_ifp;
sockaddr_in6_init(&sin6, &in6addr_any, 0, 0, 0);
if_mcast_op(ifp, SIOCDELMULTI,
sin6tocsa(&sin6));
}
}
}
memset((void *)mif6table, 0, sizeof(mif6table));
nummifs = 0;
pim6 = 0; /* used to stub out/in pim specific code */
callout_stop(&expire_upcalls_ch);
/*
* Free all multicast forwarding cache entries.
*/
for (i = 0; i < MF6CTBLSIZ; i++) {
rt = mf6ctable[i];
while (rt) {
struct mf6c *frt;
for (rte = rt->mf6c_stall; rte != NULL; ) {
struct rtdetq *n = rte->next;
m_freem(rte->m);
free(rte, M_MRTABLE);
rte = n;
}
frt = rt;
rt = rt->mf6c_next;
free(frt, M_MRTABLE);
}
}
memset((void *)mf6ctable, 0, sizeof(mf6ctable));
/*
* Reset register interface
*/
	if (reg_mif_num != (mifi_t)-1) {
		if_detach(&multicast_register_if6);
reg_mif_num = (mifi_t)-1;
}
ip6_mrouter = NULL;
ip6_mrouter_ver = 0;
splx(s);
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG, "ip6_mrouter_done\n");
#endif
return 0;
}
void
ip6_mrouter_detach(struct ifnet *ifp)
{
struct rtdetq *rte;
struct mf6c *mfc;
mifi_t mifi;
int i;
if (ip6_mrouter == NULL)
return;
/*
* Delete a mif which points to ifp.
*/
for (mifi = 0; mifi < nummifs; mifi++)
if (mif6table[mifi].m6_ifp == ifp)
del_m6if(&mifi);
/*
* Clear rte->ifp of cache entries received on ifp.
*/
for (i = 0; i < MF6CTBLSIZ; i++) {
if (n6expire[i] == 0)
continue;
for (mfc = mf6ctable[i]; mfc != NULL; mfc = mfc->mf6c_next) {
for (rte = mfc->mf6c_stall; rte != NULL; rte = rte->next) {
if (rte->ifp == ifp)
rte->ifp = NULL;
}
}
}
}
/*
* Add a mif to the mif table
*/
static int
add_m6if(struct mif6ctl *mifcp)
{
struct mif6 *mifp;
struct ifnet *ifp;
struct sockaddr_in6 sin6;
int error, s;
if (mifcp->mif6c_mifi >= MAXMIFS)
return EINVAL;
mifp = mif6table + mifcp->mif6c_mifi;
if (mifp->m6_ifp)
return EADDRINUSE; /* XXX: is it appropriate? */
if (!mifcp->mif6c_pifi || (ifp = if_byindex(mifcp->mif6c_pifi)) == NULL)
return ENXIO;
if (mifcp->mif6c_flags & MIFF_REGISTER) {
ifp = &multicast_register_if6;
		if (reg_mif_num == (mifi_t)-1) {
			strlcpy(ifp->if_xname, "register_mif",
			    sizeof(ifp->if_xname));
ifp->if_flags |= IFF_LOOPBACK;
ifp->if_index = mifcp->mif6c_mifi;
reg_mif_num = mifcp->mif6c_mifi;
if_attach(ifp);
}
} else {
/* Make sure the interface supports multicast */
if ((ifp->if_flags & IFF_MULTICAST) == 0)
return EOPNOTSUPP;
s = splsoftnet();
/*
* Enable promiscuous reception of all IPv6 multicasts
* from the interface.
*/
sockaddr_in6_init(&sin6, &in6addr_any, 0, 0, 0);
error = if_mcast_op(ifp, SIOCADDMULTI, sin6tosa(&sin6));
splx(s);
if (error)
return error;
}
s = splsoftnet();
mifp->m6_flags = mifcp->mif6c_flags;
mifp->m6_ifp = ifp;
/* initialize per mif pkt counters */
mifp->m6_pkt_in = 0;
mifp->m6_pkt_out = 0;
mifp->m6_bytes_in = 0;
mifp->m6_bytes_out = 0;
splx(s);
/* Adjust nummifs up if the mifi is higher than nummifs */
	if (nummifs <= mifcp->mif6c_mifi)
		nummifs = mifcp->mif6c_mifi + 1;
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG,
"add_mif #%d, phyint %s\n",
mifcp->mif6c_mifi, ifp->if_xname);
#endif
return 0;
}
/*
* Delete a mif from the mif table
*/
static int
del_m6if(mifi_t *mifip)
{
struct mif6 *mifp = mif6table + *mifip;
mifi_t mifi;
struct ifnet *ifp;
struct sockaddr_in6 sin6;
int s;
if (*mifip >= nummifs)
return EINVAL;
if (mifp->m6_ifp == NULL)
return EINVAL;
s = splsoftnet();
if (!(mifp->m6_flags & MIFF_REGISTER)) {
/*
		 * XXX: what if an IPv4 multicast daemon is still
		 * using the interface?
*/
ifp = mifp->m6_ifp;
sockaddr_in6_init(&sin6, &in6addr_any, 0, 0, 0);
if_mcast_op(ifp, SIOCDELMULTI, sin6tosa(&sin6));
} else {
if (reg_mif_num != (mifi_t)-1) {
if_detach(&multicast_register_if6);
reg_mif_num = (mifi_t)-1;
}
}
memset((void *)mifp, 0, sizeof (*mifp));
/* Adjust nummifs down */
for (mifi = nummifs; mifi > 0; mifi--)
if (mif6table[mifi - 1].m6_ifp)
break;
nummifs = mifi;
splx(s);
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG, "del_m6if %d, nummifs %d\n", *mifip, nummifs);
#endif
return 0;
}
/*
* Add an mfc entry
*/
static int
add_m6fc(struct mf6cctl *mfccp)
{
struct mf6c *rt;
u_long hash;
struct rtdetq *rte;
u_short nstl;
int s;
char ip6bufo[INET6_ADDRSTRLEN], ip6bufm[INET6_ADDRSTRLEN];
MF6CFIND(mfccp->mf6cc_origin.sin6_addr,
mfccp->mf6cc_mcastgrp.sin6_addr, rt);
/* If an entry already exists, just update the fields */
if (rt) {
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_MFC)
log(LOG_DEBUG,"add_m6fc update o %s g %s p %x\n",
IN6_PRINT(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent);
#endif
s = splsoftnet();
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
splx(s);
return 0;
}
/*
* Find the entry for which the upcall was made and update
*/
s = splsoftnet();
hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr,
mfccp->mf6cc_mcastgrp.sin6_addr);
	for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) {
		if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr,
		    &mfccp->mf6cc_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr,
&mfccp->mf6cc_mcastgrp.sin6_addr) &&
(rt->mf6c_stall != NULL)) {
if (nstl++)
log(LOG_ERR,
"add_m6fc: %s o %s g %s p %x dbx %p\n",
"multiple kernel entries",
IN6_PRINT(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent, rt->mf6c_stall);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_MFC)
log(LOG_DEBUG,
"add_m6fc o %s g %s p %x dbg %p\n",
IN6_PRINT(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent, rt->mf6c_stall);
#endif
rt->mf6c_origin = mfccp->mf6cc_origin;
rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp;
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
/* initialize pkt counters per src-grp */
rt->mf6c_pkt_cnt = 0;
rt->mf6c_byte_cnt = 0;
rt->mf6c_wrong_if = 0;
rt->mf6c_expire = 0; /* Don't clean this guy up */
n6expire[hash]--;
/* free packets Qed at the end of this entry */
for (rte = rt->mf6c_stall; rte != NULL; ) {
struct rtdetq *n = rte->next;
				if (rte->ifp) {
					ip6_mdq(rte->m, rte->ifp, rt);
}
m_freem(rte->m);
#ifdef UPCALL_TIMING
collate(&(rte->t));
#endif
free(rte, M_MRTABLE);
rte = n;
}
rt->mf6c_stall = NULL;
}
}
/*
* It is possible that an entry is being inserted without an upcall
*/
if (nstl == 0) {
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_MFC)
log(LOG_DEBUG,
"add_mfc no upcall h %ld o %s g %s p %x\n",
hash,
IN6_PRINT(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent);
#endif
		for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) {
			if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr,
			    &mfccp->mf6cc_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr,
&mfccp->mf6cc_mcastgrp.sin6_addr)) {
rt->mf6c_origin = mfccp->mf6cc_origin;
rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp;
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
/* initialize pkt counters per src-grp */
rt->mf6c_pkt_cnt = 0;
rt->mf6c_byte_cnt = 0;
rt->mf6c_wrong_if = 0;
				if (rt->mf6c_expire)
					n6expire[hash]--;
rt->mf6c_expire = 0;
}
}
if (rt == NULL) {
/* no upcall, so make a new entry */
rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
if (rt == NULL) {
splx(s);
return ENOBUFS;
}
/* insert new entry at head of hash chain */
rt->mf6c_origin = mfccp->mf6cc_origin;
rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp;
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
/* initialize pkt counters per src-grp */
rt->mf6c_pkt_cnt = 0;
rt->mf6c_byte_cnt = 0;
rt->mf6c_wrong_if = 0;
rt->mf6c_expire = 0;
rt->mf6c_stall = NULL;
/* link into table */
rt->mf6c_next = mf6ctable[hash];
mf6ctable[hash] = rt;
}
}
splx(s);
return 0;
}
#ifdef UPCALL_TIMING
/*
* collect delay statistics on the upcalls
*/
static void
collate(struct timeval *t)
{
u_long d;
struct timeval tp;
u_long delta;
GET_TIME(tp);
if (TV_LT(*t, tp))
{
TV_DELTA(tp, *t, delta);
d = delta >> 10;
if (d > UPCALL_MAX)
d = UPCALL_MAX;
++upcall_data[d];
}
}
#endif /* UPCALL_TIMING */
/*
* Delete an mfc entry
*/
static int
del_m6fc(struct mf6cctl *mfccp)
{
struct sockaddr_in6 origin;
struct sockaddr_in6 mcastgrp;
struct mf6c *rt;
struct mf6c **nptr;
u_long hash;
int s;
origin = mfccp->mf6cc_origin;
mcastgrp = mfccp->mf6cc_mcastgrp;
hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_MFC) {
char ip6bufo[INET6_ADDRSTRLEN], ip6bufm[INET6_ADDRSTRLEN];
log(LOG_DEBUG,"del_m6fc orig %s mcastgrp %s\n",
IN6_PRINT(ip6bufo, &origin.sin6_addr),
IN6_PRINT(ip6bufm, &mcastgrp.sin6_addr));
}
#endif
s = splsoftnet();
nptr = &mf6ctable[hash];
	while ((rt = *nptr) != NULL) {
		if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr,
		    &rt->mf6c_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr,
&rt->mf6c_mcastgrp.sin6_addr) &&
rt->mf6c_stall == NULL)
break;
nptr = &rt->mf6c_next;
}
if (rt == NULL) {
splx(s);
return EADDRNOTAVAIL;
}
*nptr = rt->mf6c_next;
free(rt, M_MRTABLE);
splx(s);
return 0;
}
static int
socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in6 *src)
{
if (s) {
if (sbappendaddr(&s->so_rcv, sin6tosa(src), mm, NULL) != 0) {
sorwakeup(s);
return 0;
}
soroverflow(s);
}
m_freem(mm);
return -1;
}
/*
* IPv6 multicast forwarding function. This function assumes that the packet
* pointed to by "ip6" has arrived on (or is about to be sent to) the interface
* pointed to by "ifp", and the packet is to be relayed to other networks
* that have members of the packet's destination IPv6 multicast group.
*
* The packet is returned unscathed to the caller, unless it is
* erroneous, in which case a non-zero return value tells the caller to
* discard it.
*/
int
ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m)
{
struct mf6c *rt;
struct mif6 *mifp;
struct mbuf *mm;
int s;
mifi_t mifi;
struct sockaddr_in6 sin6;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_FORWARD)
log(LOG_DEBUG, "ip6_mforward: src %s, dst %s, ifindex %d\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst),
ifp->if_index);
#endif
/*
* Don't forward a packet with Hop limit of zero or one,
* or a packet destined to a local-only group.
*/
if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_NODELOCAL(&ip6->ip6_dst) ||
IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst))
return 0;
ip6->ip6_hlim--;
/*
* Source address check: do not forward packets with unspecified
* source. It was discussed in July 2000, on ipngwg mailing list.
* This is rather more serious than unicast cases, because some
* MLD packets can be sent with the unspecified source address
* (although such packets must normally set the hop limit field to 1).
*/
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
IP6_STATINC(IP6_STAT_CANTFORWARD);
if (ip6_log_time + ip6_log_interval < time_uptime) {
ip6_log_time = time_uptime;
log(LOG_DEBUG,
"cannot forward "
"from %s to %s nxt %d received on %s\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst),
ip6->ip6_nxt,
m->m_pkthdr.rcvif_index ?
if_name(m_get_rcvif_NOMPSAFE(m)) : "?");
}
return 0;
}
/*
* Determine forwarding mifs from the forwarding cache table
*/
s = splsoftnet();
MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt);
/* Entry exists, so forward if necessary */
if (rt) {
splx(s);
return ip6_mdq(m, ifp, rt);
} else {
/*
* If we don't have a route for packet's origin, make a copy
* of the packet and send message to routing daemon.
*/
struct mbuf *mb0;
struct rtdetq *rte;
u_long hash;
#ifdef UPCALL_TIMING
struct timeval tp;
GET_TIME(tp);
#endif
mrt6stat.mrt6s_no_route++;
#ifdef MRT6DEBUG
if (mrt6debug & (DEBUG_FORWARD | DEBUG_MFC))
log(LOG_DEBUG, "ip6_mforward: no rte s %s g %s\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst));
#endif
/*
* Allocate mbufs early so that we don't do extra work if we
* are just going to fail anyway.
*/
rte = malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT);
if (rte == NULL) {
splx(s);
return ENOBUFS;
}
mb0 = m_copypacket(m, M_DONTWAIT);
/*
* Pullup packet header if needed before storing it,
* as other references may modify it in the meantime.
*/
if (mb0 && M_UNWRITABLE(mb0, sizeof(struct ip6_hdr)))
mb0 = m_pullup(mb0, sizeof(struct ip6_hdr));
if (mb0 == NULL) {
free(rte, M_MRTABLE);
splx(s);
return ENOBUFS;
}
/* is there an upcall waiting for this packet? */
hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst);
for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) {
if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
&rt->mf6c_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
&rt->mf6c_mcastgrp.sin6_addr) &&
(rt->mf6c_stall != NULL))
break;
}
if (rt == NULL) {
struct mrt6msg *im;
struct omrt6msg *oim;
/* no upcall, so make a new entry */
rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
if (rt == NULL) {
free(rte, M_MRTABLE);
m_freem(mb0);
splx(s);
return ENOBUFS;
}
/*
* Make a copy of the header to send to the user
* level process
*/
mm = m_copym(mb0, 0, sizeof(struct ip6_hdr), M_DONTWAIT);
if (mm == NULL) {
free(rte, M_MRTABLE);
m_freem(mb0);
free(rt, M_MRTABLE);
splx(s);
return ENOBUFS;
}
/*
* Send message to routing daemon
*/
sockaddr_in6_init(&sin6, &ip6->ip6_src, 0, 0, 0);
im = NULL;
oim = NULL;
switch (ip6_mrouter_ver) {
case MRT6_OINIT:
oim = mtod(mm, struct omrt6msg *);
oim->im6_msgtype = MRT6MSG_NOCACHE;
oim->im6_mbz = 0;
break;
case MRT6_INIT:
im = mtod(mm, struct mrt6msg *);
im->im6_msgtype = MRT6MSG_NOCACHE;
im->im6_mbz = 0;
break;
default:
free(rte, M_MRTABLE);
m_freem(mb0);
free(rt, M_MRTABLE);
splx(s);
return EINVAL;
}
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_FORWARD)
log(LOG_DEBUG,
"getting the iif info in the kernel\n");
#endif
for (mifp = mif6table, mifi = 0;
mifi < nummifs && mifp->m6_ifp != ifp;
mifp++, mifi++)
;
switch (ip6_mrouter_ver) {
case MRT6_OINIT:
oim->im6_mif = mifi;
break;
case MRT6_INIT:
im->im6_mif = mifi;
break;
}
if (socket_send(ip6_mrouter, mm, &sin6) < 0) {
log(LOG_WARNING, "ip6_mforward: ip6_mrouter "
"socket queue full\n");
mrt6stat.mrt6s_upq_sockfull++;
free(rte, M_MRTABLE);
m_freem(mb0);
free(rt, M_MRTABLE);
splx(s);
return ENOBUFS;
}
mrt6stat.mrt6s_upcalls++;
/* insert new entry at head of hash chain */
memset(rt, 0, sizeof(*rt));
sockaddr_in6_init(&rt->mf6c_origin, &ip6->ip6_src,
0, 0, 0);
sockaddr_in6_init(&rt->mf6c_mcastgrp, &ip6->ip6_dst,
0, 0, 0);
rt->mf6c_expire = UPCALL_EXPIRE;
n6expire[hash]++;
rt->mf6c_parent = MF6C_INCOMPLETE_PARENT;
/* link into table */
rt->mf6c_next = mf6ctable[hash];
mf6ctable[hash] = rt;
/* Add this entry to the end of the queue */
rt->mf6c_stall = rte;
} else {
/* determine if q has overflowed */
struct rtdetq **p;
int npkts = 0;
for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next) {
if (++npkts > MAX_UPQ6) {
mrt6stat.mrt6s_upq_ovflw++;
free(rte, M_MRTABLE);
m_freem(mb0);
splx(s);
return 0;
}
}
/* Add this entry to the end of the queue */
*p = rte;
}
rte->next = NULL;
rte->m = mb0;
rte->ifp = ifp;
#ifdef UPCALL_TIMING
rte->t = tp;
#endif
splx(s);
return 0;
}
}
/*
* Clean up cache entries if upcalls are not serviced
* Call from the Slow Timeout mechanism, every 0.25 seconds.
*/
static void
expire_upcalls(void *unused)
{
struct rtdetq *rte;
struct mf6c *mfc, **nptr;
int i;
/* XXX NOMPSAFE still need softnet_lock */
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
for (i = 0; i < MF6CTBLSIZ; i++) {
if (n6expire[i] == 0)
continue;
nptr = &mf6ctable[i];
while ((mfc = *nptr) != NULL) {
rte = mfc->mf6c_stall;
/*
* Skip real cache entries
* Make sure it wasn't marked to not expire (shouldn't happen)
* If it expires now
*/
if (rte != NULL &&
mfc->mf6c_expire != 0 &&
--mfc->mf6c_expire == 0) {
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_EXPIRE) {
char ip6bufo[INET6_ADDRSTRLEN];
char ip6bufm[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"expire_upcalls: expiring (%s %s)\n",
IN6_PRINT(ip6bufo,
&mfc->mf6c_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfc->mf6c_mcastgrp.sin6_addr));
}
#endif
/*
* drop all the packets
* free the mbuf with the pkt, if, timing info
*/
do {
struct rtdetq *n = rte->next;
m_freem(rte->m);
free(rte, M_MRTABLE);
rte = n;
} while (rte != NULL);
mrt6stat.mrt6s_cache_cleanups++;
n6expire[i]--;
*nptr = mfc->mf6c_next;
free(mfc, M_MRTABLE);
} else {
nptr = &mfc->mf6c_next;
}
}
}
callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
expire_upcalls, NULL);
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
}
/*
* Macro to send packet on mif. Since RSVP packets don't get counted on
* input, they shouldn't get counted on output, so statistics keeping is
* separate.
*/
#define MC6_SEND(ip6, mifp, m) do { \
if ((mifp)->m6_flags & MIFF_REGISTER) \
register_send((ip6), (mifp), (m)); \
else \
phyint_send((ip6), (mifp), (m)); \
} while (/*CONSTCOND*/ 0)
/*
* Packet forwarding routine once entry in the cache is made
*/
static int
ip6_mdq(struct mbuf *m, struct ifnet *ifp, struct mf6c *rt)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
mifi_t mifi, iif;
struct mif6 *mifp;
int plen = m->m_pkthdr.len;
struct in6_addr src0, dst0; /* copies for local work */
u_int32_t iszone, idzone, oszone, odzone;
int error = 0;
/*
* Don't forward if it didn't arrive from the parent mif
* for its origin.
*/
mifi = rt->mf6c_parent;
if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) {
/* came in the wrong interface */
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_FORWARD)
log(LOG_DEBUG,
"wrong if: ifid %d mifi %d mififid %x\n",
ifp->if_index, mifi,
mif6table[mifi].m6_ifp ?
mif6table[mifi].m6_ifp->if_index : -1);
#endif
mrt6stat.mrt6s_wrong_if++;
rt->mf6c_wrong_if++;
/*
* If we are doing PIM processing, and we are forwarding
* packets on this interface, send a message to the
* routing daemon.
*/
/* have to make sure this is a valid mif */
if (mifi < nummifs && mif6table[mifi].m6_ifp) {
if (pim6 && (m->m_flags & M_LOOP) == 0) {
/*
* Check the M_LOOP flag to avoid an
* unnecessary PIM assert.
* XXX: M_LOOP is an ad-hoc hack...
*/
struct sockaddr_in6 sin6;
struct mbuf *mm;
struct mrt6msg *im;
struct omrt6msg *oim;
mm = m_copym(m, 0, sizeof(struct ip6_hdr), M_DONTWAIT);
if (mm && M_UNWRITABLE(mm, sizeof(struct ip6_hdr)))
mm = m_pullup(mm, sizeof(struct ip6_hdr));
if (mm == NULL)
return ENOBUFS;
oim = NULL;
im = NULL;
switch (ip6_mrouter_ver) {
case MRT6_OINIT:
oim = mtod(mm, struct omrt6msg *);
oim->im6_msgtype = MRT6MSG_WRONGMIF;
oim->im6_mbz = 0;
break;
case MRT6_INIT:
im = mtod(mm, struct mrt6msg *);
im->im6_msgtype = MRT6MSG_WRONGMIF;
im->im6_mbz = 0;
break;
default:
m_freem(mm);
return EINVAL;
}
for (mifp = mif6table, iif = 0;
iif < nummifs && mifp &&
mifp->m6_ifp != ifp;
mifp++, iif++)
;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_len = sizeof(sin6);
sin6.sin6_family = AF_INET6;
switch (ip6_mrouter_ver) {
case MRT6_OINIT:
oim->im6_mif = iif;
sin6.sin6_addr = oim->im6_src;
break;
case MRT6_INIT:
im->im6_mif = iif;
sin6.sin6_addr = im->im6_src;
break;
}
mrt6stat.mrt6s_upcalls++;
if (socket_send(ip6_mrouter, mm, &sin6) < 0) {
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_WARNING, "mdq, ip6_mrouter socket queue full\n");
#endif
++mrt6stat.mrt6s_upq_sockfull;
return ENOBUFS;
}
}
}
return 0;
}
/* If I sourced this packet, it counts as output, else it was input. */
if (m->m_pkthdr.rcvif_index == 0) {
/* XXX: is rcvif really NULL when output?? */
mif6table[mifi].m6_pkt_out++;
mif6table[mifi].m6_bytes_out += plen;
} else {
mif6table[mifi].m6_pkt_in++;
mif6table[mifi].m6_bytes_in += plen;
}
rt->mf6c_pkt_cnt++;
rt->mf6c_byte_cnt += plen;
/*
* For each mif, forward a copy of the packet if there are group
* members downstream on the interface.
*/
src0 = ip6->ip6_src;
dst0 = ip6->ip6_dst;
if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 ||
(error = in6_setscope(&dst0, ifp, &idzone)) != 0) {
IP6_STATINC(IP6_STAT_BADSCOPE);
return error;
}
for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) {
if (IF_ISSET(mifi, &rt->mf6c_ifset)) {
if (mif6table[mifi].m6_ifp == NULL)
continue;
/*
* check if the outgoing packet is going to break
* a scope boundary.
* XXX: For packets through PIM register tunnel
* interface, we believe the routing daemon.
*/
if ((mif6table[rt->mf6c_parent].m6_flags &
MIFF_REGISTER) == 0 &&
(mif6table[mifi].m6_flags & MIFF_REGISTER) == 0) {
if (in6_setscope(&src0, mif6table[mifi].m6_ifp,
&oszone) ||
in6_setscope(&dst0, mif6table[mifi].m6_ifp,
&odzone) ||
iszone != oszone || idzone != odzone) {
IP6_STATINC(IP6_STAT_BADSCOPE);
continue;
}
}
mifp->m6_pkt_out++;
mifp->m6_bytes_out += plen;
MC6_SEND(ip6, mifp, m);
}
}
return 0;
}
static void
phyint_send(struct ip6_hdr *ip6, struct mif6 *mifp, struct mbuf *m)
{
struct mbuf *mb_copy;
struct ifnet *ifp = mifp->m6_ifp;
int error __mrt6debugused = 0;
int s;
static struct route ro;
bool ingroup;
struct sockaddr_in6 dst6;
s = splsoftnet();
/*
* Make a new reference to the packet; make sure that
* the IPv6 header is actually copied, not just referenced,
* so that ip6_output() only scribbles on the copy.
*/
mb_copy = m_copypacket(m, M_DONTWAIT);
if (mb_copy && M_UNWRITABLE(mb_copy, sizeof(struct ip6_hdr)))
mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr));
if (mb_copy == NULL) {
splx(s);
return;
}
/* set MCAST flag to the outgoing packet */
mb_copy->m_flags |= M_MCAST;
/*
* If we sourced the packet, call ip6_output since we may divide
* the packet into fragments when the packet is too big for the
* outgoing interface.
* Otherwise, we can simply send the packet to the interface
* sending queue.
*/
if (m->m_pkthdr.rcvif_index == 0) {
struct ip6_moptions im6o;
im6o.im6o_multicast_if_index = if_get_index(ifp);
/* XXX: ip6_output will override ip6->ip6_hlim */
im6o.im6o_multicast_hlim = ip6->ip6_hlim;
im6o.im6o_multicast_loop = 1;
error = ip6_output(mb_copy, NULL, &ro, IPV6_FORWARDING,
&im6o, NULL, NULL);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_XMIT)
log(LOG_DEBUG, "phyint_send on mif %td err %d\n",
mifp - mif6table, error);
#endif
splx(s);
return;
}
/*
* If we belong to the destination multicast group
* on the outgoing interface, loop back a copy.
*/
/*
* Does not have to check source info, as it's already covered by
* ip6_input
*/
sockaddr_in6_init(&dst6, &ip6->ip6_dst, 0, 0, 0);
ingroup = in6_multi_group(&ip6->ip6_dst, ifp);
if (ingroup) {
ip6_mloopback(ifp, m,
satocsin6(rtcache_getdst(&ro)));
}
/*
* Put the packet into the sending queue of the outgoing interface
* if it would fit in the MTU of the interface.
*/
if (mb_copy->m_pkthdr.len <= ifp->if_mtu || ifp->if_mtu < IPV6_MMTU) {
error = ip6_if_output(ifp, ifp, mb_copy, &dst6, NULL);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_XMIT)
log(LOG_DEBUG, "phyint_send on mif %td err %d\n",
mifp - mif6table, error);
#endif
} else {
/*
* pMTU discovery is intentionally disabled by default, since
* various routers may notify a pMTU for multicast traffic, which
* can be abused as a denial-of-service attack on a router.
*/
if (ip6_mcast_pmtu) {
icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0,
ifp->if_mtu);
} else {
/* simply discard the packet */
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_XMIT) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"phyint_send: packet too big on %s o %s g %s"
" size %d(discarded)\n",
if_name(ifp),
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst),
mb_copy->m_pkthdr.len);
}
#endif
m_freem(mb_copy);
}
}
splx(s);
}
static int
register_send(struct ip6_hdr *ip6, struct mif6 *mif, struct mbuf *m)
{
struct mbuf *mm;
int i, len = m->m_pkthdr.len;
struct sockaddr_in6 sin6;
struct mrt6msg *im6;
#ifdef MRT6DEBUG
if (mrt6debug) {
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG, "** IPv6 register_send **\n src %s dst %s\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst));
}
#endif
PIM6_STATINC(PIM6_STAT_SND_REGISTERS);
/* Make a copy of the packet to send to the user level process */
MGETHDR(mm, M_DONTWAIT, MT_HEADER);
if (mm == NULL)
return ENOBUFS;
mm->m_data += max_linkhdr;
mm->m_len = sizeof(struct ip6_hdr);
if ((mm->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) {
m_freem(mm);
return ENOBUFS;
}
i = MHLEN - M_LEADINGSPACE(mm);
if (i > len)
i = len;
mm = m_pullup(mm, i);
if (mm == NULL)
return ENOBUFS;
mm->m_pkthdr.len = len + sizeof(struct ip6_hdr);
/*
* Send message to routing daemon
*/
sockaddr_in6_init(&sin6, &ip6->ip6_src, 0, 0, 0);
im6 = mtod(mm, struct mrt6msg *);
im6->im6_msgtype = MRT6MSG_WHOLEPKT;
im6->im6_mbz = 0;
im6->im6_mif = mif - mif6table;
/* iif info is not given for register encapsulation */
mrt6stat.mrt6s_upcalls++;
if (socket_send(ip6_mrouter, mm, &sin6) < 0) {
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_WARNING,
"register_send: ip6_mrouter socket queue full\n");
#endif
++mrt6stat.mrt6s_upq_sockfull;
return ENOBUFS;
}
return 0;
}
/*
* PIM sparse mode hook. Receives the pim control messages, and passes them up
* to the listening socket, using rip6_input.
*
* The only message processed is the REGISTER pim message; the pim header
* is stripped off, and the inner packet is passed to register_mforward.
*/
int
pim6_input(struct mbuf **mp, int *offp, int proto)
{
struct pim *pim;
struct ip6_hdr *ip6 __mrt6debugused;
int pimlen;
struct mbuf *m = *mp;
int minlen;
int off = *offp;
PIM6_STATINC(PIM6_STAT_RCV_TOTAL);
ip6 = mtod(m, struct ip6_hdr *);
pimlen = m->m_pkthdr.len - off;
/*
* Validate lengths
*/
if (pimlen < PIM_MINLEN) {
PIM6_STATINC(PIM6_STAT_RCV_TOOSHORT);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,"pim6_input: PIM packet too short\n");
#endif
m_freem(m);
return IPPROTO_DONE;
}
/*
* If the packet is at least as big as a REGISTER, go ahead
* and grab the PIM REGISTER header size, to avoid another
* possible m_pullup() later.
*
* PIM_MINLEN == pimhdr + u_int32 == 8
* PIM6_REG_MINLEN == pimhdr + reghdr + eip6hdr == 4 + 4 + 40
*/
minlen = (pimlen >= PIM6_REG_MINLEN) ? PIM6_REG_MINLEN : PIM_MINLEN;
/*
* Make sure that the IP6 and PIM headers are in contiguous memory, and
* possibly the PIM REGISTER header
*/
IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen);
if (pim == NULL) {
PIM6_STATINC(PIM6_STAT_RCV_TOOSHORT);
return IPPROTO_DONE;
}
/* PIM version check */
if (pim->pim_ver != PIM_VERSION) {
PIM6_STATINC(PIM6_STAT_RCV_BADVERSION);
#ifdef MRT6DEBUG
log(LOG_ERR,
"pim6_input: incorrect version %d, expecting %d\n",
pim->pim_ver, PIM_VERSION);
#endif
m_freem(m);
return IPPROTO_DONE;
}
#define PIM6_CHECKSUM
#ifdef PIM6_CHECKSUM
{
int cksumlen;
/*
* Validate checksum.
* If PIM REGISTER, exclude the data packet
*/
if (pim->pim_type == PIM_REGISTER)
cksumlen = PIM_MINLEN;
else
cksumlen = pimlen;
if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) {
PIM6_STATINC(PIM6_STAT_RCV_BADSUM);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,
"pim6_input: invalid checksum\n");
#endif
m_freem(m);
return IPPROTO_DONE;
}
}
#endif /* PIM6_CHECKSUM */
if (pim->pim_type == PIM_REGISTER) {
/*
* since this is a REGISTER, we'll make a copy of the register
* headers ip6+pim+u_int32_t+encap_ip6, to be passed up to the
* routing daemon.
*/
static const struct sockaddr_in6 dst = {
.sin6_len = sizeof(dst),
.sin6_family = AF_INET6,
};
struct mbuf *mcp;
struct ip6_hdr *eip6;
u_int32_t *reghdr;
PIM6_STATINC(PIM6_STAT_RCV_REGISTERS);
if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) {
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,
"pim6_input: register mif not set: %d\n",
reg_mif_num);
#endif
m_freem(m);
return IPPROTO_DONE;
}
reghdr = (u_int32_t *)(pim + 1);
if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
goto pim6_input_to_daemon;
/*
* Validate length
*/
if (pimlen < PIM6_REG_MINLEN) {
#ifdef MRT6DEBUG
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_ERR,
"pim6_input: register packet size too "
"small %d from %s\n",
pimlen, IN6_PRINT(ip6buf, &ip6->ip6_src));
#endif
PIM6_STATINC(PIM6_STAT_RCV_TOOSHORT);
PIM6_STATINC(PIM6_STAT_RCV_BADREGISTERS);
m_freem(m);
return IPPROTO_DONE;
}
eip6 = (struct ip6_hdr *)(reghdr + 1);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"pim6_input[register], eip6: %s -> %s, "
"eip6 plen %d\n",
IN6_PRINT(ip6bufs, &eip6->ip6_src),
IN6_PRINT(ip6bufd, &eip6->ip6_dst),
ntohs(eip6->ip6_plen));
}
#endif
/* verify the version number of the inner packet */
if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
PIM6_STATINC(PIM6_STAT_RCV_BADREGISTERS);
#ifdef MRT6DEBUG
log(LOG_DEBUG, "pim6_input: invalid IP version (%d) "
"of the inner packet\n",
(eip6->ip6_vfc & IPV6_VERSION));
#endif
m_freem(m);
return IPPROTO_DONE;
}
/* verify the inner packet is destined to a mcast group */
if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) {
PIM6_STATINC(PIM6_STAT_RCV_BADREGISTERS);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM) {
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"pim6_input: inner packet of register "
"is not multicast %s\n",
IN6_PRINT(ip6buf, &eip6->ip6_dst));
}
#endif
m_freem(m);
return IPPROTO_DONE;
}
/*
* make a copy of the whole header to pass to the daemon later.
*/
mcp = m_copym(m, 0, off + PIM6_REG_MINLEN, M_DONTWAIT);
if (mcp == NULL) {
#ifdef MRT6DEBUG
log(LOG_ERR,
"pim6_input: pim register: "
"could not copy register head\n");
#endif
m_freem(m);
return IPPROTO_DONE;
}
/*
* forward the inner ip6 packet; point m_data at the inner ip6.
*/
m_adj(m, off + PIM_MINLEN);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"pim6_input: forwarding decapsulated register: "
"src %s, dst %s, mif %d\n",
IN6_PRINT(ip6bufs, &eip6->ip6_src),
IN6_PRINT(ip6bufd, &eip6->ip6_dst),
reg_mif_num);
}
#endif
looutput(mif6table[reg_mif_num].m6_ifp, m, sin6tocsa(&dst),
NULL);
/* prepare the register head to send to the mrouting daemon */
m = mcp;
}
/*
* Pass the PIM message up to the daemon; if it is a register message
* pass the 'head' only up to the daemon. This includes the
* encapsulator ip6 header, pim header, register header and the
* encapsulated ip6 header.
*/
pim6_input_to_daemon:
/*
* Currently, rip6_input() is always called holding softnet_lock
* by ipintr()(!NET_MPSAFE) or PR_INPUT_WRAP()(NET_MPSAFE).
*/
KASSERT(mutex_owned(softnet_lock));
rip6_input(&m, offp, proto);
return IPPROTO_DONE;
}
static int
sysctl_net_inet6_pim6_stats(SYSCTLFN_ARGS)
{
return (NETSTAT_SYSCTL(pim6stat_percpu, PIM6_NSTATS));
}
static void
sysctl_net_inet6_pim6_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "pim6",
SYSCTL_DESCR("PIMv6 settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_PIM, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("PIMv6 statistics"),
sysctl_net_inet6_pim6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_PIM, PIM6CTL_STATS,
CTL_EOL);
}
/* $NetBSD: prop_dictionary_util.c,v 1.9 2022/08/03 21:13:46 riastradh Exp $ */
/*-
* Copyright (c) 2006, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Utility routines to make it more convenient to work with values
* stored in dictionaries.
*
* Note: There is no special magic going on here. We use the standard
* proplib(3) APIs to do all of this work. Any application could do
* exactly what we're doing here.
*/
#include "prop_object_impl.h" /* only to hide kernel vs. not-kernel */
#include <prop/proplib.h>
bool
prop_dictionary_get_dict(prop_dictionary_t dict, const char *key,
prop_dictionary_t *dp)
{
prop_object_t o;
o = prop_dictionary_get(dict, key);
if (prop_object_type(o) != PROP_TYPE_DICTIONARY)
return false;
*dp = o;
return true;
}
bool
prop_dictionary_get_bool(prop_dictionary_t dict, const char *key, bool *valp)
{
prop_bool_t b;
b = prop_dictionary_get(dict, key);
if (prop_object_type(b) != PROP_TYPE_BOOL)
return (false);
*valp = prop_bool_true(b);
return (true);
}
bool
prop_dictionary_set_bool(prop_dictionary_t dict, const char *key, bool val)
{
return prop_dictionary_set_and_rel(dict, key, prop_bool_create(val));
}
#define TEMPLATE(name, typ) \
bool \
prop_dictionary_get_ ## name (prop_dictionary_t dict, \
const char *key, \
typ *valp) \
{ \
return prop_number_ ## name ## _value( \
prop_dictionary_get(dict, key), valp); \
}
TEMPLATE(schar, signed char)
TEMPLATE(short, short)
TEMPLATE(int, int)
TEMPLATE(long, long)
TEMPLATE(longlong, long long)
TEMPLATE(intptr, intptr_t)
TEMPLATE(int8, int8_t)
TEMPLATE(int16, int16_t)
TEMPLATE(int32, int32_t)
TEMPLATE(int64, int64_t)
TEMPLATE(uchar, unsigned char)
TEMPLATE(ushort, unsigned short)
TEMPLATE(uint, unsigned int)
TEMPLATE(ulong, unsigned long)
TEMPLATE(ulonglong, unsigned long long)
TEMPLATE(uintptr, uintptr_t)
TEMPLATE(uint8, uint8_t)
TEMPLATE(uint16, uint16_t)
TEMPLATE(uint32, uint32_t)
TEMPLATE(uint64, uint64_t)
#undef TEMPLATE
static bool
prop_dictionary_set_signed_number(prop_dictionary_t dict, const char *key,
intmax_t val)
{
return prop_dictionary_set_and_rel(dict, key,
prop_number_create_signed(val));
}
static bool
prop_dictionary_set_unsigned_number(prop_dictionary_t dict, const char *key,
uintmax_t val)
{
	/*LINTED: for conversion from 'long long' to 'long'*/
return prop_dictionary_set_and_rel(dict, key,
prop_number_create_unsigned(val));
}
#define TEMPLATE(name, which, typ) \
bool \
prop_dictionary_set_ ## name (prop_dictionary_t dict, \
const char *key, \
typ val) \
{ \
/*LINTED: for conversion from long long to 'long'*/ \
return prop_dictionary_set_ ## which ## _number(dict, key, val);\
}
#define STEMPLATE(name, typ) TEMPLATE(name, signed, typ)
#define UTEMPLATE(name, typ) TEMPLATE(name, unsigned, typ)
STEMPLATE(schar, signed char)
STEMPLATE(short, short)
STEMPLATE(int, int)
STEMPLATE(long, long)
STEMPLATE(longlong, long long)
STEMPLATE(intptr, intptr_t)
STEMPLATE(int8, int8_t)
STEMPLATE(int16, int16_t)
STEMPLATE(int32, int32_t)
STEMPLATE(int64, int64_t)
UTEMPLATE(uchar, unsigned char)
UTEMPLATE(ushort, unsigned short)
UTEMPLATE(uint, unsigned int)
UTEMPLATE(ulong, unsigned long)
UTEMPLATE(ulonglong, unsigned long long)
UTEMPLATE(uintptr, uintptr_t)
UTEMPLATE(uint8, uint8_t)
UTEMPLATE(uint16, uint16_t)
UTEMPLATE(uint32, uint32_t)
UTEMPLATE(uint64, uint64_t)
#undef STEMPLATE
#undef UTEMPLATE
#undef TEMPLATE
bool
prop_dictionary_get_string(prop_dictionary_t dict, const char *key,
const char **cpp)
{
prop_string_t str;
const char *cp;
str = prop_dictionary_get(dict, key);
if (prop_object_type(str) != PROP_TYPE_STRING)
return (false);
cp = prop_string_value(str);
if (cp == NULL)
return (false);
*cpp = cp;
return (true);
}
bool
prop_dictionary_set_string(prop_dictionary_t dict, const char *key,
const char *cp)
{
	return prop_dictionary_set_and_rel(dict, key,
	    prop_string_create_copy(cp));
}
bool
prop_dictionary_set_string_nocopy(prop_dictionary_t dict,
const char *key,
const char *cp)
{
	return prop_dictionary_set_and_rel(dict, key,
	    prop_string_create_nocopy(cp));
}
bool
prop_dictionary_get_data(prop_dictionary_t dict, const char *key,
const void **vp, size_t *sizep)
{
prop_data_t data;
const void *v;
data = prop_dictionary_get(dict, key);
if (prop_object_type(data) != PROP_TYPE_DATA)
return (false);
v = prop_data_value(data);
if (v == NULL)
return (false);
*vp = v;
if (sizep != NULL)
*sizep = prop_data_size(data);
return (true);
}
bool
prop_dictionary_set_data(prop_dictionary_t dict, const char *key,
const void *v, size_t size)
{
return prop_dictionary_set_and_rel(dict, key,
prop_data_create_copy(v, size));
}
bool
prop_dictionary_set_data_nocopy(prop_dictionary_t dict, const char *key,
const void *v, size_t size)
{
return prop_dictionary_set_and_rel(dict, key,
prop_data_create_nocopy(v, size));
}
_PROP_DEPRECATED(prop_dictionary_get_cstring,
"this program uses prop_dictionary_get_cstring(), "
"which is deprecated; use prop_dictionary_get_string() and copy instead.")
bool
prop_dictionary_get_cstring(prop_dictionary_t dict,
const char *key,
char **cpp)
{
prop_string_t str;
char *cp;
size_t len;
bool rv;
str = prop_dictionary_get(dict, key);
if (prop_object_type(str) != PROP_TYPE_STRING)
return (false);
len = prop_string_size(str);
cp = _PROP_MALLOC(len + 1, M_TEMP);
if (cp == NULL)
return (false);
rv = prop_string_copy_value(str, cp, len + 1);
if (rv)
*cpp = cp;
else
_PROP_FREE(cp, M_TEMP);
return (rv);
}
_PROP_DEPRECATED(prop_string_get_cstring_nocopy,
"this program uses prop_string_get_cstring_nocopy(), "
"which is deprecated; use prop_dictionary_get_string() instead.")
bool
prop_dictionary_get_cstring_nocopy(prop_dictionary_t dict,
const char *key,
const char **cpp)
{
return prop_dictionary_get_string(dict, key, cpp);
}
_PROP_DEPRECATED(prop_dictionary_set_cstring,
"this program uses prop_dictionary_set_cstring(), "
"which is deprecated; use prop_dictionary_set_string() instead.")
bool
prop_dictionary_set_cstring(prop_dictionary_t dict,
const char *key,
const char *cp)
{
return prop_dictionary_set_string(dict, key, cp);
}
_PROP_DEPRECATED(prop_dictionary_set_cstring_nocopy,
"this program uses prop_dictionary_set_cstring_nocopy(), "
"which is deprecated; use prop_dictionary_set_string_nocopy() instead.")
bool
prop_dictionary_set_cstring_nocopy(prop_dictionary_t dict,
const char *key,
const char *cp)
{
return prop_dictionary_set_string_nocopy(dict, key, cp);
}
bool
prop_dictionary_set_and_rel(prop_dictionary_t dict, const char *key,
prop_object_t po)
{
bool rv;
if (po == NULL)
return false;
rv = prop_dictionary_set(dict, key, po);
prop_object_release(po);
return rv;
}
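/*
 * Example (illustrative sketch, not compiled here): a typical round trip
 * through the helpers above.  "frobnicate" and the key names are
 * hypothetical; the getters simply fail if a key is absent or of the
 * wrong type.
 */
#if 0
	prop_dictionary_t dict = prop_dictionary_create();
	bool enable;
	uint32_t unit;

	prop_dictionary_set_bool(dict, "enable", true);
	prop_dictionary_set_uint32(dict, "unit", 3);

	if (prop_dictionary_get_bool(dict, "enable", &enable) &&
	    prop_dictionary_get_uint32(dict, "unit", &unit))
		frobnicate(enable, unit);	/* hypothetical consumer */

	prop_object_release(dict);
#endif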
/* $NetBSD: mbuf.h,v 1.239 2024/01/22 21:15:02 jdolecek Exp $ */
/*
* Copyright (c) 1996, 1997, 1999, 2001, 2007 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center and Matt Thomas of 3am Software Foundry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)mbuf.h 8.5 (Berkeley) 2/19/95
*/
#ifndef _SYS_MBUF_H_
#define _SYS_MBUF_H_
#ifdef _KERNEL_OPT
#include "opt_mbuftrace.h"
#endif
#ifndef M_WAITOK
#include <sys/malloc.h>
#endif
#include <sys/pool.h>
#include <sys/queue.h>
#if defined(_KERNEL)
#include <sys/percpu_types.h>
#include <sys/socket.h> /* for AF_UNSPEC */
#include <sys/psref.h>
#endif /* defined(_KERNEL) */
/* For offsetof() */
#if defined(_KERNEL) || defined(_STANDALONE)
#include <sys/systm.h>
#else
#include <stddef.h>
#endif
#include <uvm/uvm_param.h> /* for MIN_PAGE_SIZE */
#include <net/if.h>
/*
* Mbufs are of a single size, MSIZE (machine/param.h), which
* includes overhead. An mbuf may add a single "mbuf cluster" of size
* MCLBYTES (also in machine/param.h), which has no additional overhead
* and is used instead of the internal data area; this is done when
* at least MINCLSIZE of data must be stored.
*/
/* Packet tags structure */
struct m_tag {
SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */
uint16_t m_tag_id; /* Tag ID */
uint16_t m_tag_len; /* Length of data */
};
/* mbuf ownership structure */
struct mowner {
char mo_name[16]; /* owner name (fxp0) */
char mo_descr[16]; /* owner description (input) */
LIST_ENTRY(mowner) mo_link; /* */
struct percpu *mo_counters;
};
#define MOWNER_INIT(x, y) { .mo_name = x, .mo_descr = y }
enum mowner_counter_index {
MOWNER_COUNTER_CLAIMS, /* # of small mbuf claimed */
MOWNER_COUNTER_RELEASES, /* # of small mbuf released */
MOWNER_COUNTER_CLUSTER_CLAIMS, /* # of cluster mbuf claimed */
MOWNER_COUNTER_CLUSTER_RELEASES,/* # of cluster mbuf released */
MOWNER_COUNTER_EXT_CLAIMS, /* # of M_EXT mbuf claimed */
MOWNER_COUNTER_EXT_RELEASES, /* # of M_EXT mbuf released */
MOWNER_COUNTER_NCOUNTERS,
};
#if defined(_KERNEL)
struct mowner_counter {
u_long mc_counter[MOWNER_COUNTER_NCOUNTERS];
};
#endif
/* userland-exported version of struct mowner */
struct mowner_user {
char mo_name[16]; /* owner name (fxp0) */
char mo_descr[16]; /* owner description (input) */
LIST_ENTRY(mowner) mo_link; /* unused padding; for compatibility */
u_long mo_counter[MOWNER_COUNTER_NCOUNTERS]; /* counters */
};
/*
* Macros for type conversion
* mtod(m,t) - convert mbuf pointer to data pointer of correct type
*/
#define mtod(m, t) ((t)((m)->m_data))
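/*
 * Example (illustrative sketch, not compiled here): once a header is known
 * to be stored contiguously in the first mbuf, mtod() casts the data
 * pointer to the desired header type.  "example_is_v6" is a hypothetical
 * helper; m_pullup() is used to make the header contiguous if needed.
 */
#if 0
static int
example_is_v6(struct mbuf **mp)
{
	struct ip6_hdr *ip6;

	if ((*mp)->m_len < sizeof(struct ip6_hdr) &&
	    (*mp = m_pullup(*mp, sizeof(struct ip6_hdr))) == NULL)
		return 0;	/* chain was freed by m_pullup() */
	ip6 = mtod(*mp, struct ip6_hdr *);
	return (ip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION;
}
#endif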
/* header at beginning of each mbuf */
struct m_hdr {
struct mbuf *mh_next; /* next buffer in chain */
struct mbuf *mh_nextpkt; /* next chain in queue/record */
char *mh_data; /* location of data */
struct mowner *mh_owner; /* mbuf owner */
int mh_len; /* amount of data in this mbuf */
int mh_flags; /* flags; see below */
paddr_t mh_paddr; /* physical address of mbuf */
short mh_type; /* type of data in this mbuf */
};
/*
* record/packet header in first mbuf of chain; valid if M_PKTHDR set
*
* A note about csum_data:
*
* o For the out-bound direction, the low 16 bits indicate the offset, from
*   the start of the L4 header, where the final L4 checksum value is to be
*   stored, and the high 16 bits hold the length of the L3 header (i.e. the
*   start of the data to be checksummed).
*
* o For the in-bound direction, it is only valid if the M_CSUM_DATA flag is
* set. In this case, an L4 checksum has been calculated by hardware and
* is stored in csum_data, but it is up to software to perform final
* verification.
*
* Note for in-bound TCP/UDP checksums: we expect the csum_data to NOT
* be bit-wise inverted (the final step in the calculation of an IP
* checksum) -- this is so we can accumulate the checksum for fragmented
* packets during reassembly.
*
* Size ILP32: 40
* LP64: 56
*/
struct pkthdr {
union {
void *ctx; /* for M_GETCTX/M_SETCTX */
if_index_t index; /* rcv interface index */
} _rcvif;
#define rcvif_index _rcvif.index
SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */
int len; /* total packet length */
int csum_flags; /* checksum flags */
uint32_t csum_data; /* checksum data */
u_int segsz; /* segment size */
uint16_t ether_vtag; /* ethernet 802.1p+q vlan tag */
uint16_t pkthdr_flags; /* flags for pkthdr, see below */
#define PKTHDR_FLAG_IPSEC_SKIP_PFIL 0x0001 /* skip pfil_run_hooks() after ipsec decrypt */
/*
* The following three fields are an open-coded struct altq_pktattr,
* so that the struct pkthdr fields can be rearranged flexibly.
*/
int pattr_af; /* ALTQ: address family */
void *pattr_class; /* ALTQ: sched class set by classifier */
void *pattr_hdr; /* ALTQ: saved header position in mbuf */
};
/* Checksumming flags (csum_flags). */
#define M_CSUM_TCPv4 0x00000001 /* TCP header/payload */
#define M_CSUM_UDPv4 0x00000002 /* UDP header/payload */
#define M_CSUM_TCP_UDP_BAD 0x00000004 /* TCP/UDP checksum bad */
#define M_CSUM_DATA 0x00000008 /* consult csum_data */
#define M_CSUM_TCPv6 0x00000010 /* IPv6 TCP header/payload */
#define M_CSUM_UDPv6 0x00000020 /* IPv6 UDP header/payload */
#define M_CSUM_IPv4 0x00000040 /* IPv4 header */
#define M_CSUM_IPv4_BAD 0x00000080 /* IPv4 header checksum bad */
#define M_CSUM_TSOv4 0x00000100 /* TCPv4 segmentation offload */
#define M_CSUM_TSOv6 0x00000200 /* TCPv6 segmentation offload */
/* Checksum-assist quirks: keep separate from jump-table bits. */
#define M_CSUM_BLANK 0x40000000 /* csum is missing */
#define M_CSUM_NO_PSEUDOHDR 0x80000000 /* Rx csum_data does not include
* the UDP/TCP pseudo-hdr, and
* is not yet 1s-complemented.
*/
#define M_CSUM_BITS \
"\20\1TCPv4\2UDPv4\3TCP_UDP_BAD\4DATA\5TCPv6\6UDPv6\7IPv4\10IPv4_BAD" \
"\11TSOv4\12TSOv6\37BLANK\40NO_PSEUDOHDR"
/*
* Macros for manipulating csum_data on outgoing packets. These are
* used to pass information down from the L4/L3 to the L2.
*
* _IPHL: Length of the IPv{4/6} header, plus the options; in other
* words the offset of the UDP/TCP header in the packet.
* _OFFSET: Offset of the checksum field in the UDP/TCP header.
*/
#define M_CSUM_DATA_IPv4_IPHL(x) ((x) >> 16)
#define M_CSUM_DATA_IPv4_OFFSET(x) ((x) & 0xffff)
#define M_CSUM_DATA_IPv6_IPHL(x) ((x) >> 16)
#define M_CSUM_DATA_IPv6_OFFSET(x) ((x) & 0xffff)
#define M_CSUM_DATA_IPv6_SET(x, v) (x) = ((x) & 0xffff) | ((v) << 16)
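/*
 * Example (illustrative sketch, not compiled here): the L4 layer records
 * the offset of the checksum field and the L3 layer later folds in the
 * IPv4 header length, so a driver can recover both halves with the macros
 * above.  "hlen", "iphl" and "off" are hypothetical locals; values assume
 * a plain IPv4 header without options.
 */
#if 0
	/* L4 output (e.g. TCP): request offload, record checksum offset. */
	m->m_pkthdr.csum_flags |= M_CSUM_TCPv4;
	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);	/* 16 */

	/* L3 output (IPv4), once the header length "hlen" is known. */
	m->m_pkthdr.csum_data |= hlen << 16;				/* 20 << 16 */

	/* L2 driver: recover both halves. */
	iphl = M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data);
	off  = M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data);
#endif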
/*
* Max # of pages we can attach to m_ext. This is carefully chosen
* to be able to handle SOSEND_LOAN_CHUNK with our minimum sized page.
*/
#ifdef MIN_PAGE_SIZE
#define M_EXT_MAXPAGES ((65536 / MIN_PAGE_SIZE) + 1)
#endif
/*
* Description of external storage mapped into mbuf, valid if M_EXT set.
*/
struct _m_ext_storage {
unsigned int ext_refcnt;
char *ext_buf; /* start of buffer */
void (*ext_free) /* free routine if not the usual */
(struct mbuf *, void *, size_t, void *);
void *ext_arg; /* argument for ext_free */
size_t ext_size; /* size of buffer, for ext_free */
union {
/* M_EXT_CLUSTER: physical address */
paddr_t extun_paddr;
#ifdef M_EXT_MAXPAGES
/* M_EXT_PAGES: pages */
struct vm_page *extun_pgs[M_EXT_MAXPAGES];
#endif
} ext_un;
#define ext_paddr ext_un.extun_paddr
#define ext_pgs ext_un.extun_pgs
};
struct _m_ext {
struct mbuf *ext_ref;
struct _m_ext_storage ext_storage;
};
#define M_PADDR_INVALID POOL_PADDR_INVALID
/*
* Definition of "struct mbuf".
* Don't change this without understanding how MHLEN/MLEN are defined.
*/
#define MBUF_DEFINE(name, mhlen, mlen) \
struct name { \
struct m_hdr m_hdr; \
union { \
struct { \
struct pkthdr MH_pkthdr; \
union { \
struct _m_ext MH_ext; \
char MH_databuf[(mhlen)]; \
} MH_dat; \
} MH; \
char M_databuf[(mlen)]; \
} M_dat; \
}
#define m_next m_hdr.mh_next
#define m_len m_hdr.mh_len
#define m_data m_hdr.mh_data
#define m_owner m_hdr.mh_owner
#define m_type m_hdr.mh_type
#define m_flags m_hdr.mh_flags
#define m_nextpkt m_hdr.mh_nextpkt
#define m_paddr m_hdr.mh_paddr
#define m_pkthdr M_dat.MH.MH_pkthdr
#define m_ext_storage M_dat.MH.MH_dat.MH_ext.ext_storage
#define m_ext_ref M_dat.MH.MH_dat.MH_ext.ext_ref
#define m_ext m_ext_ref->m_ext_storage
#define m_pktdat M_dat.MH.MH_dat.MH_databuf
#define m_dat M_dat.M_databuf
/*
* Dummy mbuf structure to calculate the right values for MLEN/MHLEN, taking
* into account inter-structure padding.
*/
MBUF_DEFINE(_mbuf_dummy, 1, 1);
/* normal data len */
#define MLEN ((int)(MSIZE - offsetof(struct _mbuf_dummy, m_dat)))
/* data len w/pkthdr */
#define MHLEN ((int)(MSIZE - offsetof(struct _mbuf_dummy, m_pktdat)))
#define MINCLSIZE (MHLEN+MLEN+1) /* smallest amount to put in cluster */
/*
* The *real* struct mbuf
*/
MBUF_DEFINE(mbuf, MHLEN, MLEN);
/* mbuf flags */
#define M_EXT 0x00000001 /* has associated external storage */
#define M_PKTHDR 0x00000002 /* start of record */
#define M_EOR 0x00000004 /* end of record */
#define M_PROTO1 0x00000008 /* protocol-specific */
/* mbuf pkthdr flags, also in m_flags */
#define M_AUTHIPHDR 0x00000010 /* authenticated (IPsec) */
#define M_DECRYPTED 0x00000020 /* decrypted (IPsec) */
#define M_LOOP 0x00000040 /* received on loopback */
#define M_BCAST 0x00000100 /* send/received as L2 broadcast */
#define M_MCAST 0x00000200 /* send/received as L2 multicast */
#define M_CANFASTFWD 0x00000400 /* packet can be fast-forwarded */
#define M_ANYCAST6 0x00000800 /* received as IPv6 anycast */
#define M_LINK0 0x00001000 /* link layer specific flag */
#define M_LINK1 0x00002000 /* link layer specific flag */
#define M_LINK2 0x00004000 /* link layer specific flag */
#define M_LINK3 0x00008000 /* link layer specific flag */
#define M_LINK4 0x00010000 /* link layer specific flag */
#define M_LINK5 0x00020000 /* link layer specific flag */
#define M_LINK6 0x00040000 /* link layer specific flag */
#define M_LINK7 0x00080000 /* link layer specific flag */
#define M_VLANTAG 0x00100000 /* ether_vtag is valid */
/* additional flags for M_EXT mbufs */
#define M_EXT_FLAGS 0xff000000
#define M_EXT_CLUSTER 0x01000000 /* ext is a cluster */
#define M_EXT_PAGES 0x02000000 /* ext_pgs is valid */
#define M_EXT_ROMAP 0x04000000 /* ext mapping is r-o at MMU */
#define M_EXT_RW 0x08000000 /* ext storage is writable */
/* for source-level compatibility */
#define M_NOTIFICATION M_PROTO1
#define M_FLAGS_BITS \
"\20\1EXT\2PKTHDR\3EOR\4PROTO1\5AUTHIPHDR\6DECRYPTED\7LOOP\10NONE" \
"\11BCAST\12MCAST\13CANFASTFWD\14ANYCAST6\15LINK0\16LINK1\17LINK2\20LINK3" \
"\21LINK4\22LINK5\23LINK6\24LINK7" \
"\25VLANTAG" \
"\31EXT_CLUSTER\32EXT_PAGES\33EXT_ROMAP\34EXT_RW"
/* flags copied when copying m_pkthdr */
#define M_COPYFLAGS (M_PKTHDR|M_EOR|M_BCAST|M_MCAST|M_CANFASTFWD| \
M_ANYCAST6|M_LINK0|M_LINK1|M_LINK2|M_AUTHIPHDR|M_DECRYPTED|M_LOOP| \
M_VLANTAG)
/* flag copied when shallow-copying external storage */
#define M_EXTCOPYFLAGS (M_EXT|M_EXT_FLAGS)
/* mbuf types */
#define MT_FREE 0 /* should be on free list */
#define MT_DATA 1 /* dynamic (data) allocation */
#define MT_HEADER 2 /* packet header */
#define MT_SONAME 3 /* socket name */
#define MT_SOOPTS 4 /* socket options */
#define MT_FTABLE 5 /* fragment reassembly header */
#define MT_CONTROL 6 /* extra-data protocol message */
#define MT_OOBDATA 7 /* expedited data */
#ifdef MBUFTYPES
const char * const mbuftypes[] = {
"mbfree",
"mbdata",
"mbheader",
"mbsoname",
"mbsopts",
"mbftable",
"mbcontrol",
"mboobdata",
};
#else
extern const char * const mbuftypes[];
#endif
/* flags to m_get/MGET */
#define M_DONTWAIT M_NOWAIT
#define M_WAIT M_WAITOK
#ifdef MBUFTRACE
/* Mbuf allocation tracing. */
void mowner_init_owner(struct mowner *, const char *, const char *);
void mowner_init(struct mbuf *, int);
void mowner_ref(struct mbuf *, int);
void m_claim(struct mbuf *, struct mowner *);
void mowner_revoke(struct mbuf *, bool, int);
void mowner_attach(struct mowner *);
void mowner_detach(struct mowner *);
void m_claimm(struct mbuf *, struct mowner *);
#else
#define mowner_init_owner(mo, n, d) __nothing
#define mowner_init(m, type) __nothing
#define mowner_ref(m, flags) __nothing
#define mowner_revoke(m, all, flags) __nothing
#define m_claim(m, mowner) __nothing
#define mowner_attach(mo) __nothing
#define mowner_detach(mo) __nothing
#define m_claimm(m, mo) __nothing
#endif
#define MCLAIM(m, mo) m_claim((m), (mo))
#define MOWNER_ATTACH(mo) mowner_attach(mo)
#define MOWNER_DETACH(mo) mowner_detach(mo)
/*
* mbuf allocation/deallocation macros:
*
* MGET(struct mbuf *m, int how, int type)
* allocates an mbuf and initializes it to contain internal data.
*
* MGETHDR(struct mbuf *m, int how, int type)
* allocates an mbuf and initializes it to contain a packet header
* and internal data.
*
* If 'how' is M_WAIT, these macros (and the corresponding functions)
* are guaranteed to return successfully.
*/
#define MGET(m, how, type) m = m_get((how), (type))
#define MGETHDR(m, how, type) m = m_gethdr((how), (type))
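/*
 * Example (illustrative sketch, not compiled here): allocating a
 * packet-header mbuf and positioning a small header in it.  "struct
 * example_hdr" is a hypothetical structure; m_align() places it at the
 * end of the internal data area, longword aligned.
 */
#if 0
	struct mbuf *m;

	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;		/* M_DONTWAIT may fail */
	m_align(m, sizeof(struct example_hdr));
	m->m_len = m->m_pkthdr.len = sizeof(struct example_hdr);
#endif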
#if defined(_KERNEL)
#define MCLINITREFERENCE(m) \
do { \
KASSERT(((m)->m_flags & M_EXT) == 0); \
(m)->m_ext_ref = (m); \
(m)->m_ext.ext_refcnt = 1; \
} while (/* CONSTCOND */ 0)
/*
* Macros for mbuf external storage.
*
* MCLGET allocates and adds an mbuf cluster to a normal mbuf;
* the flag M_EXT is set upon success.
*
* MEXTMALLOC allocates external storage and adds it to
* a normal mbuf; the flag M_EXT is set upon success.
*
* MEXTADD adds pre-allocated external storage to
* a normal mbuf; the flag M_EXT is set upon success.
*/
#define MCLGET(m, how) m_clget((m), (how))
#define MEXTMALLOC(m, size, how) \
do { \
(m)->m_ext_storage.ext_buf = malloc((size), 0, (how)); \
if ((m)->m_ext_storage.ext_buf != NULL) { \
MCLINITREFERENCE(m); \
(m)->m_data = (m)->m_ext.ext_buf; \
(m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) | \
M_EXT|M_EXT_RW; \
(m)->m_ext.ext_size = (size); \
(m)->m_ext.ext_free = NULL; \
(m)->m_ext.ext_arg = NULL; \
mowner_ref((m), M_EXT); \
} \
} while (/* CONSTCOND */ 0)
#define MEXTADD(m, buf, size, type, free, arg) \
do { \
MCLINITREFERENCE(m); \
(m)->m_data = (m)->m_ext.ext_buf = (char *)(buf); \
(m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) | M_EXT; \
(m)->m_ext.ext_size = (size); \
(m)->m_ext.ext_free = (free); \
(m)->m_ext.ext_arg = (arg); \
mowner_ref((m), M_EXT); \
} while (/* CONSTCOND */ 0)
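/*
 * Example (illustrative sketch, not compiled here): attaching a cluster
 * when the payload will not fit in the internal data area.  "len" and "m"
 * are hypothetical locals; callers must test M_EXT afterwards because
 * MCLGET reports failure only through that flag.
 */
#if 0
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (len > MHLEN) {
		MCLGET(m, M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_freem(m);
			return ENOBUFS;
		}
	}
	m->m_len = m->m_pkthdr.len = len;
#endif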
#define M_BUFADDR(m) \
(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
((m)->m_flags & M_PKTHDR) ? (m)->m_pktdat : (m)->m_dat)
#define M_BUFSIZE(m) \
(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \
((m)->m_flags & M_PKTHDR) ? MHLEN : MLEN)
#define MRESETDATA(m) (m)->m_data = M_BUFADDR(m)
/*
* Compute the offset of the beginning of the data buffer of a non-ext
* mbuf.
*/
#define M_BUFOFFSET(m) \
(((m)->m_flags & M_PKTHDR) ? \
offsetof(struct mbuf, m_pktdat) : offsetof(struct mbuf, m_dat))
/*
* Determine if an mbuf's data area is read-only. This is true
* if external storage is read-only mapped, or not marked as R/W,
* or referenced by more than one mbuf.
*/
#define M_READONLY(m) \
(((m)->m_flags & M_EXT) != 0 && \
(((m)->m_flags & (M_EXT_ROMAP|M_EXT_RW)) != M_EXT_RW || \
(m)->m_ext.ext_refcnt > 1))
#define M_UNWRITABLE(__m, __len) \
((__m)->m_len < (__len) || M_READONLY((__m)))
/*
* Determine if an mbuf's data area is read-only at the MMU.
*/
#define M_ROMAP(m) \
(((m)->m_flags & (M_EXT|M_EXT_ROMAP)) == (M_EXT|M_EXT_ROMAP))
/*
* Compute the amount of space available before the current start of
* data in an mbuf.
*/
#define M_LEADINGSPACE(m) \
(M_READONLY((m)) ? 0 : ((m)->m_data - M_BUFADDR(m)))
/*
* Compute the amount of space available
* after the end of data in an mbuf.
*/
#define _M_TRAILINGSPACE(m) \
((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf + (m)->m_ext.ext_size - \
((m)->m_data + (m)->m_len) : \
&(m)->m_dat[MLEN] - ((m)->m_data + (m)->m_len))
#define M_TRAILINGSPACE(m) \
(M_READONLY((m)) ? 0 : _M_TRAILINGSPACE((m)))
/*
* Arrange to prepend space of size plen to mbuf m.
* If a new mbuf must be allocated, how specifies whether to wait.
* If how is M_DONTWAIT and allocation fails, the original mbuf chain
* is freed and m is set to NULL.
*/
#define M_PREPEND(m, plen, how) \
do { \
if (M_LEADINGSPACE(m) >= (plen)) { \
(m)->m_data -= (plen); \
(m)->m_len += (plen); \
} else \
(m) = m_prepend((m), (plen), (how)); \
if ((m) && (m)->m_flags & M_PKTHDR) \
(m)->m_pkthdr.len += (plen); \
} while (/* CONSTCOND */ 0)
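/*
 * Example (illustrative sketch, not compiled here): prepending a
 * link-layer header the way an output path typically does.  "eh" is a
 * hypothetical local; on allocation failure M_PREPEND has already freed
 * the chain and set m to NULL.
 */
#if 0
	struct ether_header *eh;

	M_PREPEND(m, sizeof(struct ether_header), M_DONTWAIT);
	if (m == NULL)
		return ENOBUFS;
	eh = mtod(m, struct ether_header *);
#endif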
/* change mbuf to new type */
#define MCHTYPE(m, t) \
do { \
KASSERT((t) != MT_FREE); \
mbstat_type_add((m)->m_type, -1); \
mbstat_type_add(t, 1); \
(m)->m_type = t; \
} while (/* CONSTCOND */ 0)
#ifdef DIAGNOSTIC
#define M_VERIFY_PACKET(m) m_verify_packet(m)
#else
#define M_VERIFY_PACKET(m) __nothing
#endif
/* The "copy all" special length. */
#define M_COPYALL -1
/*
* Allow drivers and/or protocols to store private context information.
*/
#define M_GETCTX(m, t) ((t)(m)->m_pkthdr._rcvif.ctx)
#define M_SETCTX(m, c) ((void)((m)->m_pkthdr._rcvif.ctx = (void *)(c)))
#define M_CLEARCTX(m) M_SETCTX((m), NULL)
/*
* M_REGION_GET ensures that the "len"-sized region of type "typ" starting
* from "off" within "m" is located in a single mbuf, contiguously.
*
* The pointer to the region will be returned to pointer variable "val".
*/
#define M_REGION_GET(val, typ, m, off, len) \
do { \
struct mbuf *_t; \
int _tmp; \
if ((m)->m_len >= (off) + (len)) \
(val) = (typ)(mtod((m), char *) + (off)); \
else { \
_t = m_pulldown((m), (off), (len), &_tmp); \
if (_t) { \
if (_t->m_len < _tmp + (len)) \
panic("m_pulldown malfunction"); \
(val) = (typ)(mtod(_t, char *) + _tmp); \
} else { \
(val) = (typ)NULL; \
(m) = NULL; \
} \
} \
} while (/*CONSTCOND*/ 0)
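/*
 * Example (illustrative sketch, not compiled here): obtaining a contiguous
 * view of a TCP header located "off" bytes into the chain ("m" and "off"
 * are hypothetical locals).  On failure the macro leaves both "th" and "m"
 * NULL; m_pulldown() has already disposed of the chain in that case.
 */
#if 0
	struct tcphdr *th;

	M_REGION_GET(th, struct tcphdr *, m, off, sizeof(struct tcphdr));
	if (th == NULL)
		return;		/* mbuf chain is gone */
#endif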
#endif /* defined(_KERNEL) */
/*
* Simple mbuf queueing system
*
* this is basically a SIMPLEQ adapted to mbuf use (i.e. using
* m_nextpkt instead of field.sqe_next).
*
* m_next is ignored, so queueing chains of mbufs is possible
*/
#define MBUFQ_HEAD(name) \
struct name { \
struct mbuf *mq_first; \
struct mbuf **mq_last; \
}
#define MBUFQ_INIT(q) do { \
(q)->mq_first = NULL; \
(q)->mq_last = &(q)->mq_first; \
} while (/*CONSTCOND*/0)
#define MBUFQ_ENQUEUE(q, m) do { \
(m)->m_nextpkt = NULL; \
*(q)->mq_last = (m); \
(q)->mq_last = &(m)->m_nextpkt; \
} while (/*CONSTCOND*/0)
#define MBUFQ_PREPEND(q, m) do { \
if (((m)->m_nextpkt = (q)->mq_first) == NULL) \
(q)->mq_last = &(m)->m_nextpkt; \
(q)->mq_first = (m); \
} while (/*CONSTCOND*/0)
#define MBUFQ_DEQUEUE(q, m) do { \
if (((m) = (q)->mq_first) != NULL) { \
if (((q)->mq_first = (m)->m_nextpkt) == NULL) \
(q)->mq_last = &(q)->mq_first; \
else \
(m)->m_nextpkt = NULL; \
} \
} while (/*CONSTCOND*/0)
#define MBUFQ_DRAIN(q) do { \
struct mbuf *__m0; \
while ((__m0 = (q)->mq_first) != NULL) { \
(q)->mq_first = __m0->m_nextpkt; \
m_freem(__m0); \
} \
(q)->mq_last = &(q)->mq_first; \
} while (/*CONSTCOND*/0)
#define MBUFQ_FIRST(q) ((q)->mq_first)
#define MBUFQ_NEXT(m) ((m)->m_nextpkt)
#define MBUFQ_LAST(q) (*(q)->mq_last)
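/*
 * Example (illustrative sketch, not compiled here): a driver-style packet
 * queue built from the macros above.  "sc_rxq", "m0" and
 * "example_process" are hypothetical; queued packets keep their m_next
 * chains intact since only m_nextpkt is used for linkage.
 */
#if 0
	MBUFQ_HEAD(example_rxq) sc_rxq;
	struct mbuf *m;

	MBUFQ_INIT(&sc_rxq);
	MBUFQ_ENQUEUE(&sc_rxq, m0);		/* producer side */
	MBUFQ_DEQUEUE(&sc_rxq, m);		/* consumer side */
	if (m != NULL)
		example_process(m);
	MBUFQ_DRAIN(&sc_rxq);			/* teardown: frees leftovers */
#endif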
/*
* Mbuf statistics.
* For statistics related to mbuf and cluster allocations, see also the
* pool headers (mb_cache and mcl_cache).
*/
struct mbstat {
u_long _m_spare; /* formerly m_mbufs */
u_long _m_spare1; /* formerly m_clusters */
u_long _m_spare2; /* spare field */
u_long _m_spare3; /* formerly m_clfree - free clusters */
u_long m_drops; /* times failed to find space */
u_long m_wait; /* times waited for space */
u_long m_drain; /* times drained protocols for space */
u_short m_mtypes[256]; /* type specific mbuf allocations */
};
struct mbstat_cpu {
u_int m_mtypes[256]; /* type specific mbuf allocations */
};
/*
* Mbuf sysctl variables.
*/
#define MBUF_MSIZE 1 /* int: mbuf base size */
#define MBUF_MCLBYTES 2 /* int: mbuf cluster size */
#define MBUF_NMBCLUSTERS 3 /* int: limit on the # of clusters */
#define MBUF_MBLOWAT 4 /* int: mbuf low water mark */
#define MBUF_MCLLOWAT 5 /* int: mbuf cluster low water mark */
#define MBUF_STATS 6 /* struct: mbstat */
#define MBUF_MOWNERS 7 /* struct: m_owner[] */
#define MBUF_NMBCLUSTERS_LIMIT 8 /* int: limit of nmbclusters */
#ifdef _KERNEL
extern struct mbstat mbstat;
extern int nmbclusters; /* limit on the # of clusters */
extern int mblowat; /* mbuf low water mark */
extern int mcllowat; /* mbuf cluster low water mark */
extern int max_linkhdr; /* largest link-level header */
extern int max_protohdr; /* largest protocol header */
extern int max_hdr; /* largest link+protocol header */
extern int max_datalen; /* MHLEN - max_hdr */
extern const int msize; /* mbuf base size */
extern const int mclbytes; /* mbuf cluster size */
extern pool_cache_t mb_cache;
#ifdef MBUFTRACE
LIST_HEAD(mownerhead, mowner);
extern struct mownerhead mowners;
extern struct mowner unknown_mowners[];
extern struct mowner revoked_mowner;
#endif
MALLOC_DECLARE(M_MBUF);
MALLOC_DECLARE(M_SONAME);
struct mbuf *m_copym(struct mbuf *, int, int, int);
struct mbuf *m_copypacket(struct mbuf *, int);
struct mbuf *m_devget(char *, int, int, struct ifnet *);
struct mbuf *m_dup(struct mbuf *, int, int, int);
struct mbuf *m_get(int, int);
struct mbuf *m_gethdr(int, int);
struct mbuf *m_get_n(int, int, size_t, size_t);
struct mbuf *m_gethdr_n(int, int, size_t, size_t);
struct mbuf *m_prepend(struct mbuf *,int, int);
struct mbuf *m_pulldown(struct mbuf *, int, int, int *);
struct mbuf *m_pullup(struct mbuf *, int);
struct mbuf *m_copyup(struct mbuf *, int, int);
struct mbuf *m_split(struct mbuf *,int, int);
struct mbuf *m_getptr(struct mbuf *, int, int *);
void m_adj(struct mbuf *, int);
struct mbuf *m_defrag(struct mbuf *, int);
int m_apply(struct mbuf *, int, int,
int (*)(void *, void *, unsigned int), void *);
void m_cat(struct mbuf *,struct mbuf *);
void m_clget(struct mbuf *, int);
void m_copyback(struct mbuf *, int, int, const void *);
struct mbuf *m_copyback_cow(struct mbuf *, int, int, const void *, int);
int m_makewritable(struct mbuf **, int, int, int);
struct mbuf *m_getcl(int, int, int);
void m_copydata(struct mbuf *, int, int, void *);
void m_verify_packet(struct mbuf *);
struct mbuf *m_free(struct mbuf *);
void m_freem(struct mbuf *);
void mbinit(void);
void m_remove_pkthdr(struct mbuf *);
void m_copy_pkthdr(struct mbuf *, struct mbuf *);
void m_move_pkthdr(struct mbuf *, struct mbuf *);
void m_align(struct mbuf *, int);
bool m_ensure_contig(struct mbuf **, int);
struct mbuf *m_add(struct mbuf *, struct mbuf *);
/* Inline routines. */
static __inline u_int m_length(const struct mbuf *) __unused;
/* Statistics */
void mbstat_type_add(int, int);
/* Packet tag routines */
struct m_tag *m_tag_get(int, int, int);
void m_tag_free(struct m_tag *);
void m_tag_prepend(struct mbuf *, struct m_tag *);
void m_tag_unlink(struct mbuf *, struct m_tag *);
void m_tag_delete(struct mbuf *, struct m_tag *);
void m_tag_delete_chain(struct mbuf *);
struct m_tag *m_tag_find(const struct mbuf *, int);
struct m_tag *m_tag_copy(struct m_tag *);
int m_tag_copy_chain(struct mbuf *, struct mbuf *);
/* Packet tag types */
#define PACKET_TAG_NONE 0 /* Nothing */
#define PACKET_TAG_SO 4 /* sending socket pointer */
#define PACKET_TAG_NPF 10 /* packet filter */
#define PACKET_TAG_PF 11 /* packet filter */
#define PACKET_TAG_ALTQ_QID 12 /* ALTQ queue id */
#define PACKET_TAG_IPSEC_OUT_DONE 18
#define PACKET_TAG_IPSEC_NAT_T_PORTS 25 /* two uint16_t */
#define PACKET_TAG_INET6 26 /* IPv6 info */
#define PACKET_TAG_TUNNEL_INFO 28 /* tunnel identification and
* protocol callback, for loop
* detection/recovery
*/
#define PACKET_TAG_MPLS 29 /* Indicate it's for MPLS */
#define PACKET_TAG_SRCROUTE 30 /* IPv4 source routing */
#define PACKET_TAG_ETHERNET_SRC 31 /* Ethernet source address */
/*
* Return the number of bytes in the mbuf chain, m.
*/
static __inline u_int
m_length(const struct mbuf *m)
{
const struct mbuf *m0;
u_int pktlen;
if ((m->m_flags & M_PKTHDR) != 0)
return m->m_pkthdr.len;
pktlen = 0;
for (m0 = m; m0 != NULL; m0 = m0->m_next)
pktlen += m0->m_len;
return pktlen;
}
static __inline void
m_set_rcvif(struct mbuf *m, const struct ifnet *ifp)
{
KASSERT(m->m_flags & M_PKTHDR);
m->m_pkthdr.rcvif_index = ifp->if_index;
}
static __inline void
m_reset_rcvif(struct mbuf *m)
{
KASSERT(m->m_flags & M_PKTHDR);
/* A caller may expect the whole _rcvif union to be zeroed */
/* m->m_pkthdr.rcvif_index = 0; */
m->m_pkthdr._rcvif.ctx = NULL;
}
static __inline void
m_copy_rcvif(struct mbuf *m, const struct mbuf *n)
{
KASSERT(m->m_flags & M_PKTHDR);
KASSERT(n->m_flags & M_PKTHDR);
m->m_pkthdr.rcvif_index = n->m_pkthdr.rcvif_index;
}
#define M_GET_ALIGNED_HDR(m, type, linkhdr) \
m_get_aligned_hdr((m), __alignof(type) - 1, sizeof(type), (linkhdr))
static __inline int
m_get_aligned_hdr(struct mbuf **m, int mask, size_t hlen, bool linkhdr)
{
#ifndef __NO_STRICT_ALIGNMENT
if (((uintptr_t)mtod(*m, void *) & mask) != 0)
*m = m_copyup(*m, hlen,
linkhdr ? (max_linkhdr + mask) & ~mask : 0);
else
#endif
if (__predict_false((size_t)(*m)->m_len < hlen))
*m = m_pullup(*m, hlen);
return *m == NULL;
}
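/*
 * Illustrative sketch (not from the original sources): using
 * M_GET_ALIGNED_HDR to obtain a contiguous, properly aligned header at the
 * start of an mbuf chain, as a protocol input routine might. The header
 * type (struct ip) is only an example here.
 */
#if 0
static int
example_get_ip_header(struct mbuf **mp)
{
	struct ip *ip;

	/* May replace *mp; returns non-zero (and leaves *mp NULL) on failure. */
	if (M_GET_ALIGNED_HDR(mp, struct ip, false) != 0)
		return ENOBUFS;		/* chain was freed */
	ip = mtod(*mp, struct ip *);
	(void)ip;			/* header is now contiguous and aligned */
	return 0;
}
#endif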
void m_print(const struct mbuf *, const char *, void (*)(const char *, ...)
__printflike(1, 2));
/* from uipc_mbufdebug.c */
void m_examine(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
/* parsers for m_examine() */
void m_examine_ether(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_pppoe(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_ppp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_arp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_ip(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_icmp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_ip6(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_icmp6(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_tcp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_udp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_hex(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
/*
* Get rcvif of a mbuf.
*
* The caller must call m_put_rcvif after using rcvif if the returned rcvif
* isn't NULL. If the returned rcvif is NULL, the caller doesn't need to call
* m_put_rcvif (although calling it is safe).
*
* The caller must not block or sleep while using rcvif. The API ensures a
* returned rcvif isn't freed until m_put_rcvif is called.
*/
static __inline struct ifnet *
m_get_rcvif(const struct mbuf *m, int *s)
{
struct ifnet *ifp;
KASSERT(m->m_flags & M_PKTHDR);
*s = pserialize_read_enter();
ifp = if_byindex(m->m_pkthdr.rcvif_index);
if (__predict_false(ifp == NULL))
pserialize_read_exit(*s);
return ifp;
}
static __inline void
m_put_rcvif(struct ifnet *ifp, int *s)
{
if (ifp == NULL)
return;
pserialize_read_exit(*s);
}
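/*
 * Illustrative sketch (not from the original sources): the
 * m_get_rcvif/m_put_rcvif pattern described above. The returned ifp is only
 * valid inside the pserialize read section, so the caller must not sleep
 * before m_put_rcvif.
 */
#if 0
static void
example_use_rcvif(struct mbuf *m)
{
	struct ifnet *ifp;
	int s;

	ifp = m_get_rcvif(m, &s);
	if (ifp != NULL) {
		/* use ifp briefly, e.g. read ifp->if_index; do not sleep */
		m_put_rcvif(ifp, &s);
	}
}
#endif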
/*
* Get rcvif of a mbuf.
*
* The caller must call m_put_rcvif_psref after using rcvif. The API ensures
* the obtained rcvif isn't freed until m_put_rcvif_psref is called.
*/
static __inline struct ifnet *
m_get_rcvif_psref(const struct mbuf *m, struct psref *psref)
{
KASSERT(m->m_flags & M_PKTHDR);
return if_get_byindex(m->m_pkthdr.rcvif_index, psref);
}
static __inline void
m_put_rcvif_psref(struct ifnet *ifp, struct psref *psref)
{
if (ifp == NULL)
return;
if_put(ifp, psref);
}
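/*
 * Illustrative sketch (not from the original sources): the psref variant,
 * which allows holding the reference across code that may sleep, unlike the
 * pserialize-based m_get_rcvif above. The caller is assumed to be bound to
 * a CPU (curlwp_bind) as psref requires.
 */
#if 0
static void
example_use_rcvif_psref(struct mbuf *m)
{
	struct ifnet *ifp;
	struct psref psref;

	ifp = m_get_rcvif_psref(m, &psref);
	if (ifp != NULL) {
		/* ifp stays valid until released; sleeping is permitted */
		m_put_rcvif_psref(ifp, &psref);
	}
}
#endif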
/*
* Get rcvif of a mbuf.
*
* This is NOT an MP-safe API and shouldn't be used where MP-safety is required.
*/
static __inline struct ifnet *
m_get_rcvif_NOMPSAFE(const struct mbuf *m)
{
KASSERT(m->m_flags & M_PKTHDR);
return if_byindex(m->m_pkthdr.rcvif_index);
}
#endif /* _KERNEL */
#endif /* !_SYS_MBUF_H_ */
/* $NetBSD: ip_output.c,v 1.326 2023/04/19 22:00:18 mlelstv Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Public Access Networks Corporation ("Panix"). It was developed under
* contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_output.c,v 1.326 2023/04/19 22:00:18 mlelstv Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_mrouting.h"
#include "opt_net_mpsafe.h"
#include "opt_mpls.h"
#endif
#include "arp.h"
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_private.h>
#include <netinet/in_offload.h>
#include <netinet/portalgo.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#endif
#ifdef MPLS
#include <netmpls/mpls.h>
#include <netmpls/mpls_var.h>
#endif
static int ip_pcbopts(struct inpcb *, const struct sockopt *);
static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
static struct ifnet *ip_multicast_if(struct in_addr *, int *);
static void ip_mloopback(struct ifnet *, struct mbuf *,
const struct sockaddr_in *);
static int ip_ifaddrvalid(const struct in_ifaddr *);
extern pfil_head_t *inet_pfil_hook; /* XXX */
int ip_do_loopback_cksum = 0;
static int
ip_mark_mpls(struct ifnet * const ifp, struct mbuf * const m,
const struct rtentry *rt)
{
int error = 0;
#ifdef MPLS
union mpls_shim msh;
if (rt == NULL || rt_gettag(rt) == NULL ||
rt_gettag(rt)->sa_family != AF_MPLS ||
(m->m_flags & (M_MCAST | M_BCAST)) != 0 ||
ifp->if_type != IFT_ETHER)
return 0;
msh.s_addr = MPLS_GETSADDR(rt);
if (msh.shim.label != MPLS_LABEL_IMPLNULL) {
struct m_tag *mtag;
/*
* XXX tentative solution to tell ether_output
* it's MPLS; a more efficient mechanism is needed.
*/
mtag = m_tag_get(PACKET_TAG_MPLS,
sizeof(int) /* dummy */,
M_NOWAIT);
if (mtag == NULL)
return ENOMEM;
m_tag_prepend(m, mtag);
}
#endif
return error;
}
/*
* Send an IP packet to a host.
*/
int
ip_if_output(struct ifnet * const ifp, struct mbuf * const m,
const struct sockaddr * const dst, const struct rtentry *rt)
{
int error = 0;
if (rt != NULL) {
error = rt_check_reject_route(rt, ifp);
if (error != 0) {
IP_STATINC(IP_STAT_RTREJECT);
m_freem(m);
return error;
}
}
error = ip_mark_mpls(ifp, m, rt);
if (error != 0) {
m_freem(m);
return error;
}
error = if_output_lock(ifp, ifp, m, dst, rt);
return error;
}
/*
* IP output. The packet in mbuf chain m contains a skeletal IP
* header (with len, off, ttl, proto, tos, src, dst).
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
*/
int
ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags,
struct ip_moptions *imo, struct inpcb *inp)
{
struct rtentry *rt;
struct ip *ip;
struct ifnet *ifp, *mifp = NULL;
struct mbuf *m = m0;
int len, hlen, error = 0;
struct route iproute;
const struct sockaddr_in *dst;
struct in_ifaddr *ia = NULL;
struct ifaddr *ifa;
int isbroadcast;
int sw_csum;
u_long mtu;
bool natt_frag = false;
bool rtmtu_nolock;
union {
struct sockaddr sa;
struct sockaddr_in sin;
} udst, usrc;
struct sockaddr *rdst = &udst.sa; /* real IP destination, as
* opposed to the nexthop
*/
struct psref psref, psref_ia;
int bound;
bool bind_need_restore = false;
const struct sockaddr *sa;
len = 0;
MCLAIM(m, &ip_tx_mowner);
KASSERT((m->m_flags & M_PKTHDR) != 0);
KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) == 0);
KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) !=
(M_CSUM_TCPv4|M_CSUM_UDPv4));
KASSERT(m->m_len >= sizeof(struct ip));
hlen = sizeof(struct ip);
if (opt) {
m = ip_insertoptions(m, opt, &len);
hlen = len;
}
ip = mtod(m, struct ip *);
/*
* Fill in IP header.
*/
if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
ip->ip_v = IPVERSION;
ip->ip_off = htons(0);
/* ip->ip_id filled in after we find out source ia */
ip->ip_hl = hlen >> 2;
IP_STATINC(IP_STAT_LOCALOUT);
} else {
hlen = ip->ip_hl << 2;
}
/*
* Route packet.
*/
if (ro == NULL) {
memset(&iproute, 0, sizeof(iproute));
ro = &iproute;
}
sockaddr_in_init(&udst.sin, &ip->ip_dst, 0);
dst = satocsin(rtcache_getdst(ro));
/*
* If there is a cached route, check that it is to the same
* destination and is still up. If not, free it and try again.
* The address family should also be checked in case of sharing
* the cache with IPv6.
*/
if (dst && (dst->sin_family != AF_INET ||
!in_hosteq(dst->sin_addr, ip->ip_dst)))
rtcache_free(ro);
/* XXX must be before rtcache operations */
bound = curlwp_bind();
bind_need_restore = true;
if ((rt = rtcache_validate(ro)) == NULL &&
(rt = rtcache_update(ro, 1)) == NULL) {
dst = &udst.sin;
error = rtcache_setdst(ro, &udst.sa);
if (error != 0) {
IP_STATINC(IP_STAT_ODROPPED);
goto bad;
}
}
/*
* If routing to interface only, short circuit routing lookup.
*/
if (flags & IP_ROUTETOIF) {
ifa = ifa_ifwithladdr_psref(sintocsa(dst), &psref_ia);
if (ifa == NULL) {
IP_STATINC(IP_STAT_NOROUTE);
error = ENETUNREACH;
goto bad;
}
/* ia is already referenced by psref_ia */
ia = ifatoia(ifa);
ifp = ia->ia_ifp;
mtu = ifp->if_mtu;
ip->ip_ttl = 1;
isbroadcast = in_broadcast(dst->sin_addr, ifp);
} else if (((IN_MULTICAST(ip->ip_dst.s_addr) ||
ip->ip_dst.s_addr == INADDR_BROADCAST) ||
(flags & IP_ROUTETOIFINDEX)) &&
imo != NULL && imo->imo_multicast_if_index != 0) {
ifp = mifp = if_get_byindex(imo->imo_multicast_if_index, &psref);
if (ifp == NULL) {
IP_STATINC(IP_STAT_NOROUTE);
error = ENETUNREACH;
goto bad;
}
mtu = ifp->if_mtu;
ia = in_get_ia_from_ifp_psref(ifp, &psref_ia);
if (IN_MULTICAST(ip->ip_dst.s_addr) ||
ip->ip_dst.s_addr == INADDR_BROADCAST) {
isbroadcast = 0;
} else {
/* IP_ROUTETOIFINDEX */
isbroadcast = in_broadcast(dst->sin_addr, ifp);
if ((isbroadcast == 0) && ((ifp->if_flags &
(IFF_LOOPBACK | IFF_POINTOPOINT)) == 0) &&
(in_direct(dst->sin_addr, ifp) == 0)) {
/* gateway address required */
if (rt == NULL)
rt = rtcache_init(ro);
if (rt == NULL || rt->rt_ifp != ifp) {
IP_STATINC(IP_STAT_NOROUTE);
error = EHOSTUNREACH;
goto bad;
}
rt->rt_use++;
if (rt->rt_flags & RTF_GATEWAY)
dst = satosin(rt->rt_gateway);
if (rt->rt_flags & RTF_HOST)
isbroadcast =
rt->rt_flags & RTF_BROADCAST;
}
}
} else {
if (rt == NULL)
rt = rtcache_init(ro);
if (rt == NULL) {
IP_STATINC(IP_STAT_NOROUTE);
error = EHOSTUNREACH;
goto bad;
}
if (ifa_is_destroying(rt->rt_ifa)) {
rtcache_unref(rt, ro);
rt = NULL;
IP_STATINC(IP_STAT_NOROUTE);
error = EHOSTUNREACH;
goto bad;
}
ifa_acquire(rt->rt_ifa, &psref_ia);
ia = ifatoia(rt->rt_ifa);
ifp = rt->rt_ifp;
if ((mtu = rt->rt_rmx.rmx_mtu) == 0)
mtu = ifp->if_mtu;
rt->rt_use++;
if (rt->rt_flags & RTF_GATEWAY)
dst = satosin(rt->rt_gateway);
if (rt->rt_flags & RTF_HOST)
isbroadcast = rt->rt_flags & RTF_BROADCAST;
else
isbroadcast = in_broadcast(dst->sin_addr, ifp);
}
rtmtu_nolock = rt && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0;
if (IN_MULTICAST(ip->ip_dst.s_addr) ||
(ip->ip_dst.s_addr == INADDR_BROADCAST)) {
bool inmgroup;
m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ?
M_BCAST : M_MCAST;
/*
* See if the caller provided any multicast options
*/
if (imo != NULL)
ip->ip_ttl = imo->imo_multicast_ttl;
else
ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
/*
* if we don't know the outgoing ifp yet, we can't generate
* output
*/
if (!ifp) {
IP_STATINC(IP_STAT_NOROUTE);
error = ENETUNREACH;
goto bad;
}
/*
* If the packet is multicast or broadcast, confirm that
* the outgoing interface can transmit it.
*/
if (((m->m_flags & M_MCAST) &&
(ifp->if_flags & IFF_MULTICAST) == 0) ||
((m->m_flags & M_BCAST) &&
(ifp->if_flags & (IFF_BROADCAST|IFF_POINTOPOINT)) == 0)) {
IP_STATINC(IP_STAT_NOROUTE);
error = ENETUNREACH;
goto bad;
}
/*
* If source address not specified yet, use an address
* of outgoing interface.
*/
if (in_nullhost(ip->ip_src)) {
struct in_ifaddr *xia;
struct ifaddr *xifa;
struct psref _psref;
xia = in_get_ia_from_ifp_psref(ifp, &_psref);
if (!xia) {
IP_STATINC(IP_STAT_IFNOADDR);
error = EADDRNOTAVAIL;
goto bad;
}
xifa = &xia->ia_ifa;
if (xifa->ifa_getifa != NULL) {
ia4_release(xia, &_psref);
/* FIXME ifa_getifa is NOMPSAFE */
xia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
if (xia == NULL) {
IP_STATINC(IP_STAT_IFNOADDR);
error = EADDRNOTAVAIL;
goto bad;
}
ia4_acquire(xia, &_psref);
}
ip->ip_src = xia->ia_addr.sin_addr;
ia4_release(xia, &_psref);
}
inmgroup = in_multi_group(ip->ip_dst, ifp, flags);
if (inmgroup && (imo == NULL || imo->imo_multicast_loop)) {
/*
* If we belong to the destination multicast group
* on the outgoing interface, and the caller did not
* forbid loopback, loop back a copy.
*/
ip_mloopback(ifp, m, &udst.sin);
}
#ifdef MROUTING
else {
/*
* If we are acting as a multicast router, perform
* multicast forwarding as if the packet had just
* arrived on the interface to which we are about
* to send. The multicast forwarding function
* recursively calls this function, using the
* IP_FORWARDING flag to prevent infinite recursion.
*
* Multicasts that are looped back by ip_mloopback(),
* above, will be forwarded by the ip_input() routine,
* if necessary.
*/
extern struct socket *ip_mrouter;
if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
if (ip_mforward(m, ifp) != 0) {
m_freem(m);
goto done;
}
}
}
#endif
/*
* Multicasts with a time-to-live of zero may be looped-
* back, above, but must not be transmitted on a network.
* Also, multicasts addressed to the loopback interface
* are not sent -- the above call to ip_mloopback() will
* loop back a copy if this host actually belongs to the
* destination group on the loopback interface.
*/
if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) {
IP_STATINC(IP_STAT_ODROPPED);
m_freem(m);
goto done;
}
goto sendit;
}
/*
* If source address not specified yet, use address
* of outgoing interface.
*/
if (in_nullhost(ip->ip_src)) {
struct ifaddr *xifa;
xifa = &ia->ia_ifa;
if (xifa->ifa_getifa != NULL) {
ia4_release(ia, &psref_ia);
/* FIXME ifa_getifa is NOMPSAFE */
ia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto bad;
}
ia4_acquire(ia, &psref_ia);
}
ip->ip_src = ia->ia_addr.sin_addr;
}
/*
* Packets with Class-D address as source are not valid per
* RFC1112.
*/
if (IN_MULTICAST(ip->ip_src.s_addr)) {
IP_STATINC(IP_STAT_ODROPPED);
error = EADDRNOTAVAIL;
goto bad;
}
/*
* Look for broadcast address and verify user is allowed to
* send such a packet.
*/
if (isbroadcast) {
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
IP_STATINC(IP_STAT_BCASTDENIED);
error = EADDRNOTAVAIL;
goto bad;
}
if ((flags & IP_ALLOWBROADCAST) == 0) {
IP_STATINC(IP_STAT_BCASTDENIED);
error = EACCES;
goto bad;
}
/* don't allow broadcast messages to be fragmented */
if (ntohs(ip->ip_len) > ifp->if_mtu) {
IP_STATINC(IP_STAT_BCASTDENIED);
error = EMSGSIZE;
goto bad;
}
m->m_flags |= M_BCAST;
} else
m->m_flags &= ~M_BCAST;
sendit:
if ((flags & (IP_FORWARDING|IP_NOIPNEWID)) == 0) {
if (m->m_pkthdr.len < IP_MINFRAGSIZE) {
ip->ip_id = 0;
} else if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
ip->ip_id = ip_newid(ia);
} else {
/*
* TSO capable interfaces (typically?) increment
* ip_id for each segment.
* "allocate" enough ids here to increase the chance
* for them to be unique.
*
* Note that the following calculation need not be
* precise; wasting some ip_id values is fine.
*/
unsigned int segsz = m->m_pkthdr.segsz;
unsigned int datasz = ntohs(ip->ip_len) - hlen;
unsigned int num = howmany(datasz, segsz);
ip->ip_id = ip_newid_range(ia, num);
}
}
if (ia != NULL) {
ia4_release(ia, &psref_ia);
ia = NULL;
}
/*
* If we're doing Path MTU Discovery, we need to set DF unless
* the route's MTU is locked.
*/
if ((flags & IP_MTUDISC) != 0 && rtmtu_nolock) {
ip->ip_off |= htons(IP_DF);
}
#ifdef IPSEC
if (ipsec_used) {
bool ipsec_done = false;
bool count_drop = false;
/* Perform IPsec processing, if any. */
error = ipsec4_output(m, inp, flags, &mtu, &natt_frag,
&ipsec_done, &count_drop);
if (count_drop)
IP_STATINC(IP_STAT_IPSECDROP_OUT);
if (error || ipsec_done)
goto done;
}
if (!ipsec_used || !natt_frag)
#endif
{
/*
* Run through list of hooks for output packets.
*/
error = pfil_run_hooks(inet_pfil_hook, &m, ifp, PFIL_OUT);
if (error || m == NULL) {
IP_STATINC(IP_STAT_PFILDROP_OUT);
goto done;
}
}
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
m->m_pkthdr.csum_data |= hlen << 16;
/*
* search for the source address structure to
* maintain output statistics, and verify address
* validity
*/
KASSERT(ia == NULL);
sockaddr_in_init(&usrc.sin, &ip->ip_src, 0);
ifa = ifaof_ifpforaddr_psref(&usrc.sa, ifp, &psref_ia);
if (ifa != NULL)
ia = ifatoia(ifa);
/*
* Ensure we only send from a valid address.
* A NULL address is valid because the packet could be
* generated from a packet filter.
*/
if (ia != NULL && (flags & IP_FORWARDING) == 0 &&
(error = ip_ifaddrvalid(ia)) != 0)
{
ARPLOG(LOG_ERR,
"refusing to send from invalid address %s (pid %d)\n",
ARPLOGADDR(&ip->ip_src), curproc->p_pid);
IP_STATINC(IP_STAT_ODROPPED);
if (error == 1)
/*
* Address exists, but is tentative or detached.
* We can't send from it because it's invalid,
* so we drop the packet.
*/
error = 0;
else
error = EADDRNOTAVAIL;
goto bad;
}
/* Maybe skip checksums on loopback interfaces. */
if (IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) {
m->m_pkthdr.csum_flags |= M_CSUM_IPv4;
}
sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx;
/* Need to fragment the packet */
if (ntohs(ip->ip_len) > mtu &&
(m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
goto fragment;
}
#if IFA_STATS
if (ia)
ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len);
#endif
/*
* Always initialize the sum to 0! Some HW assisted
* checksumming requires this.
*/
ip->ip_sum = 0;
if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
/*
* Perform any checksums that the hardware can't do
* for us.
*
* XXX Does any hardware require the {th,uh}_sum
* XXX fields to be 0?
*/
if (sw_csum & M_CSUM_IPv4) {
KASSERT(IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4));
ip->ip_sum = in_cksum(m, hlen);
m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
}
if (sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
if (IN_NEED_CHECKSUM(ifp,
sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
in_undefer_cksum_tcpudp(m);
}
m->m_pkthdr.csum_flags &=
~(M_CSUM_TCPv4|M_CSUM_UDPv4);
}
}
sa = (m->m_flags & M_MCAST) ? sintocsa(rdst) : sintocsa(dst);
/* Send it */
if (__predict_false(sw_csum & M_CSUM_TSOv4)) {
/*
* TSO4 is required by a packet, but disabled for
* the interface.
*/
error = ip_tso_output(ifp, m, sa, rt);
} else
error = ip_if_output(ifp, m, sa, rt);
goto done;
fragment:
/*
* We can't use HW checksumming if we're about to fragment the packet.
*
* XXX Some hardware can do this.
*/
if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
if (IN_NEED_CHECKSUM(ifp,
m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
in_undefer_cksum_tcpudp(m);
}
m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
}
/*
* Too large for interface; fragment if possible.
* Must be able to put at least 8 bytes per fragment.
*/
if (ntohs(ip->ip_off) & IP_DF) {
if (flags & IP_RETURNMTU) {
KASSERT(inp != NULL);
in4p_errormtu(inp) = mtu;
}
error = EMSGSIZE;
IP_STATINC(IP_STAT_CANTFRAG);
goto bad;
}
error = ip_fragment(m, ifp, mtu);
if (error) {
m = NULL;
goto bad;
}
for (; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = NULL;
if (error) {
m_freem(m);
continue;
}
#if IFA_STATS
if (ia)
ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len);
#endif
/*
* If we get here, the packet was not handled by IPsec
* when it should have been. Now that it has been
* fragmented, re-inject it into ip_output so that IPsec
* processing can occur.
*/
if (natt_frag) {
error = ip_output(m, opt, NULL,
flags | IP_RAWOUTPUT | IP_NOIPNEWID,
imo, inp);
} else {
KASSERT((m->m_pkthdr.csum_flags &
(M_CSUM_UDPv4 | M_CSUM_TCPv4)) == 0);
error = ip_if_output(ifp, m, (m->m_flags & M_MCAST) ?
sintocsa(rdst) : sintocsa(dst), rt);
}
}
if (error == 0) {
IP_STATINC(IP_STAT_FRAGMENTED);
}
done:
ia4_release(ia, &psref_ia);
rtcache_unref(rt, ro);
if (ro == &iproute) {
rtcache_free(&iproute);
}
if (mifp != NULL) {
if_put(mifp, &psref);
}
if (bind_need_restore)
curlwp_bindx(bound);
return error;
bad:
m_freem(m);
goto done;
}
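/*
 * Illustrative sketch (not from the original sources): a minimal caller of
 * ip_output() as described in the comment above the function. It prepends a
 * skeletal IP header (len, ttl, proto, tos, src, dst) and lets ip_output()
 * fill in ip_v, ip_hl, ip_off and ip_id. The payload chain, protocol number
 * and addresses are assumptions for the example.
 */
#if 0
static int
example_ip_output(struct mbuf *payload, int paylen,
    struct in_addr src, struct in_addr dst)
{
	struct mbuf *m;
	struct ip *ip;

	m = m_gethdr(M_DONTWAIT, MT_HEADER);
	if (m == NULL) {
		m_freem(payload);
		return ENOBUFS;
	}
	m_align(m, sizeof(struct ip));
	m->m_len = sizeof(struct ip);
	m->m_pkthdr.len = sizeof(struct ip) + paylen;
	m->m_next = payload;

	ip = mtod(m, struct ip *);
	memset(ip, 0, sizeof(*ip));
	ip->ip_len = htons(sizeof(struct ip) + paylen);
	ip->ip_ttl = IPDEFTTL;
	ip->ip_p = IPPROTO_UDP;	/* arbitrary; payload assumed to carry the UDP header */
	ip->ip_src = src;
	ip->ip_dst = dst;

	/* No cached route, no options, no multicast options, no pcb. */
	return ip_output(m, NULL, NULL, 0, NULL, NULL);
}
#endif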
int
ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu)
{
struct ip *ip, *mhip;
struct mbuf *m0;
int len, hlen, off;
int mhlen, firstlen;
struct mbuf **mnext;
int sw_csum = m->m_pkthdr.csum_flags;
int fragments = 0;
int error = 0;
int ipoff, ipflg;
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
/* Preserve the offset and flags. */
ipoff = ntohs(ip->ip_off) & IP_OFFMASK;
ipflg = ntohs(ip->ip_off) & (IP_RF|IP_DF|IP_MF);
if (ifp != NULL)
sw_csum &= ~ifp->if_csum_flags_tx;
len = (mtu - hlen) &~ 7;
if (len < 8) {
IP_STATINC(IP_STAT_CANTFRAG);
m_freem(m);
return EMSGSIZE;
}
firstlen = len;
mnext = &m->m_nextpkt;
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto chain.
*/
m0 = m;
mhlen = sizeof(struct ip);
for (off = hlen + len; off < ntohs(ip->ip_len); off += len) {
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == NULL) {
error = ENOBUFS;
IP_STATINC(IP_STAT_ODROPPED);
goto sendorfree;
}
MCLAIM(m, m0->m_owner);
*mnext = m;
mnext = &m->m_nextpkt;
m->m_data += max_linkhdr;
mhip = mtod(m, struct ip *);
*mhip = *ip;
/* we must inherit the flags */
m->m_flags |= m0->m_flags & M_COPYFLAGS;
if (hlen > sizeof(struct ip)) {
mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip);
mhip->ip_hl = mhlen >> 2;
}
m->m_len = mhlen;
mhip->ip_off = ((off - hlen) >> 3) + ipoff;
mhip->ip_off |= ipflg;
if (off + len >= ntohs(ip->ip_len))
len = ntohs(ip->ip_len) - off;
else
mhip->ip_off |= IP_MF;
HTONS(mhip->ip_off);
mhip->ip_len = htons((u_int16_t)(len + mhlen));
m->m_next = m_copym(m0, off, len, M_DONTWAIT);
if (m->m_next == NULL) {
error = ENOBUFS;
IP_STATINC(IP_STAT_ODROPPED);
goto sendorfree;
}
m->m_pkthdr.len = mhlen + len;
m_reset_rcvif(m);
mhip->ip_sum = 0;
KASSERT((m->m_pkthdr.csum_flags & M_CSUM_IPv4) == 0);
if (sw_csum & M_CSUM_IPv4) {
mhip->ip_sum = in_cksum(m, mhlen);
} else {
/*
* checksum is hw-offloaded or not necessary.
*/
m->m_pkthdr.csum_flags |=
m0->m_pkthdr.csum_flags & M_CSUM_IPv4;
m->m_pkthdr.csum_data |= mhlen << 16;
KASSERT(!(ifp != NULL &&
IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) ||
(m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
}
IP_STATINC(IP_STAT_OFRAGMENTS);
fragments++;
}
/*
* Update first fragment by trimming what's been copied out
* and updating header, then send each fragment (in order).
*/
m = m0;
m_adj(m, hlen + firstlen - ntohs(ip->ip_len));
m->m_pkthdr.len = hlen + firstlen;
ip->ip_len = htons((u_int16_t)m->m_pkthdr.len);
ip->ip_off |= htons(IP_MF);
ip->ip_sum = 0;
if (sw_csum & M_CSUM_IPv4) {
ip->ip_sum = in_cksum(m, hlen);
m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
} else {
/*
* checksum is hw-offloaded or not necessary.
*/
KASSERT(!(ifp != NULL && IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) ||
(m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
KASSERT(M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data) >=
sizeof(struct ip));
}
sendorfree:
/*
* If there is no room for all the fragments, don't queue
* any of them.
*/
if (ifp != NULL) {
IFQ_LOCK(&ifp->if_snd);
if (ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len < fragments &&
error == 0) {
error = ENOBUFS;
IP_STATINC(IP_STAT_ODROPPED);
IFQ_INC_DROPS(&ifp->if_snd);
}
IFQ_UNLOCK(&ifp->if_snd);
}
if (error) {
for (m = m0; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = NULL;
m_freem(m);
}
}
return error;
}
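/*
 * Worked example (illustrative, not from the original sources): with an
 * interface MTU of 1500 and a 20-byte header, each fragment carries
 * len = (1500 - 20) & ~7 = 1480 payload bytes. A 4020-byte datagram
 * (4000 bytes of payload) therefore becomes 3 fragments with fragment
 * offsets 0, 185 and 370 in 8-byte units (byte offsets 0, 1480 and 2960),
 * the last one sent without IP_MF.
 */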
/*
* Determine the maximum length of the options to be inserted;
* we would far rather allocate too much space than too little.
*/
u_int
ip_optlen(struct inpcb *inp)
{
struct mbuf *m = inp->inp_options;
if (m && m->m_len > offsetof(struct ipoption, ipopt_dst)) {
return (m->m_len - offsetof(struct ipoption, ipopt_dst));
}
return 0;
}
/*
* Insert IP options into preformed packet.
* Adjust IP destination as required for IP source routing,
* as indicated by a non-zero in_addr at the start of the options.
*/
static struct mbuf *
ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
{
struct ipoption *p = mtod(opt, struct ipoption *);
struct mbuf *n;
struct ip *ip = mtod(m, struct ip *);
unsigned optlen;
optlen = opt->m_len - sizeof(p->ipopt_dst);
KASSERT(optlen % 4 == 0);
if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET)
return m; /* XXX should fail */
if (!in_nullhost(p->ipopt_dst))
ip->ip_dst = p->ipopt_dst;
if (M_READONLY(m) || M_LEADINGSPACE(m) < optlen) {
MGETHDR(n, M_DONTWAIT, MT_HEADER);
if (n == NULL)
return m;
MCLAIM(n, m->m_owner);
m_move_pkthdr(n, m);
m->m_len -= sizeof(struct ip);
m->m_data += sizeof(struct ip);
n->m_next = m;
n->m_len = optlen + sizeof(struct ip);
n->m_data += max_linkhdr;
memcpy(mtod(n, void *), ip, sizeof(struct ip));
m = n;
} else {
m->m_data -= optlen;
m->m_len += optlen;
memmove(mtod(m, void *), ip, sizeof(struct ip));
}
m->m_pkthdr.len += optlen;
ip = mtod(m, struct ip *);
memcpy(ip + 1, p->ipopt_list, optlen);
*phlen = sizeof(struct ip) + optlen;
ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
return m;
}
/*
* Copy options from ipsrc to ipdst, omitting those not copied during
* fragmentation.
*/
int
ip_optcopy(struct ip *ipsrc, struct ip *ipdst)
{
u_char *cp, *dp;
int opt, optlen, cnt;
cp = (u_char *)(ipsrc + 1);
dp = (u_char *)(ipdst + 1);
cnt = (ipsrc->ip_hl << 2) - sizeof(struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[0];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP) {
/* Preserve for IP mcast tunnel's LSRR alignment. */
*dp++ = IPOPT_NOP;
optlen = 1;
continue;
}
KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp));
optlen = cp[IPOPT_OLEN];
KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen < cnt);
/* Invalid lengths should have been caught by ip_dooptions. */
if (optlen > cnt)
optlen = cnt;
if (IPOPT_COPIED(opt)) {
bcopy((void *)cp, (void *)dp, (unsigned)optlen);
dp += optlen;
}
}
for (optlen = dp - (u_char *)(ipdst+1); optlen & 0x3; optlen++) {
*dp++ = IPOPT_EOL;
}
return optlen;
}
/*
* IP socket option processing.
*/
int
ip_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
struct inpcb *inp = sotoinpcb(so);
struct ip *ip = &in4p_ip(inp);
int inpflags = inp->inp_flags;
int optval = 0, error = 0;
struct in_pktinfo pktinfo;
KASSERT(solocked(so));
if (sopt->sopt_level != IPPROTO_IP) {
if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER)
return 0;
return ENOPROTOOPT;
}
switch (op) {
case PRCO_SETOPT:
switch (sopt->sopt_name) {
case IP_OPTIONS:
#ifdef notyet
case IP_RETOPTS:
#endif
error = ip_pcbopts(inp, sopt);
break;
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVIF:
case IP_RECVPKTINFO:
case IP_RECVTTL:
case IP_BINDANY:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (sopt->sopt_name) {
case IP_TOS:
ip->ip_tos = optval;
break;
case IP_TTL:
ip->ip_ttl = optval;
break;
case IP_MINTTL:
if (optval > 0 && optval <= MAXTTL)
in4p_ip_minttl(inp) = optval;
else
error = EINVAL;
break;
#define OPTSET(bit) \
if (optval) \
inpflags |= bit; \
else \
inpflags &= ~bit;
case IP_RECVOPTS:
OPTSET(INP_RECVOPTS);
break;
case IP_RECVPKTINFO:
OPTSET(INP_RECVPKTINFO);
break;
case IP_RECVRETOPTS:
OPTSET(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
OPTSET(INP_RECVDSTADDR);
break;
case IP_RECVIF:
OPTSET(INP_RECVIF);
break;
case IP_RECVTTL:
OPTSET(INP_RECVTTL);
break;
case IP_BINDANY:
error = kauth_authorize_network(
kauth_cred_get(), KAUTH_NETWORK_BIND,
KAUTH_REQ_NETWORK_BIND_ANYADDR, so,
NULL, NULL);
if (error == 0) {
OPTSET(INP_BINDANY);
}
break;
}
break;
case IP_PKTINFO:
error = sockopt_getint(sopt, &optval);
if (!error) {
/* Linux compatibility */
OPTSET(INP_RECVPKTINFO);
break;
}
error = sockopt_get(sopt, &pktinfo, sizeof(pktinfo));
if (error)
break;
if (pktinfo.ipi_ifindex == 0) {
in4p_prefsrcip(inp) = pktinfo.ipi_addr;
break;
}
/* Solaris compatibility */
struct ifnet *ifp;
struct in_ifaddr *ia;
int s;
/* pick up primary address */
s = pserialize_read_enter();
ifp = if_byindex(pktinfo.ipi_ifindex);
if (ifp == NULL) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
break;
}
ia = in_get_ia_from_ifp(ifp);
if (ia == NULL) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
break;
}
in4p_prefsrcip(inp) = IA_SIN(ia)->sin_addr;
pserialize_read_exit(s);
break;
#undef OPTSET
case IP_MULTICAST_IF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
error = ip_setmoptions(&inp->inp_moptions, sopt);
break;
case IP_PORTRANGE:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (optval) {
case IP_PORTRANGE_DEFAULT:
case IP_PORTRANGE_HIGH:
inpflags &= ~(INP_LOWPORT);
break;
case IP_PORTRANGE_LOW:
inpflags |= INP_LOWPORT;
break;
default:
error = EINVAL;
break;
}
break;
case IP_PORTALGO:
error = sockopt_getint(sopt, &optval);
if (error)
break;
error = portalgo_algo_index_select(inp, optval);
break;
#if defined(IPSEC)
case IP_IPSEC_POLICY:
if (ipsec_enabled) {
error = ipsec_set_policy(inp,
sopt->sopt_data, sopt->sopt_size,
curlwp->l_cred);
} else
error = ENOPROTOOPT;
break;
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
switch (sopt->sopt_name) {
case IP_OPTIONS:
case IP_RETOPTS: {
struct mbuf *mopts = inp->inp_options;
if (mopts) {
struct mbuf *m;
m = m_copym(mopts, 0, M_COPYALL, M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
break;
}
error = sockopt_setmbuf(sopt, m);
}
break;
}
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVIF:
case IP_RECVPKTINFO:
case IP_RECVTTL:
case IP_ERRORMTU:
case IP_BINDANY:
switch (sopt->sopt_name) {
case IP_TOS:
optval = ip->ip_tos;
break;
case IP_TTL:
optval = ip->ip_ttl;
break;
case IP_MINTTL:
optval = in4p_ip_minttl(inp);
break;
case IP_ERRORMTU:
optval = in4p_errormtu(inp);
break;
#define OPTBIT(bit) (inpflags & bit ? 1 : 0)
case IP_RECVOPTS:
optval = OPTBIT(INP_RECVOPTS);
break;
case IP_RECVPKTINFO:
optval = OPTBIT(INP_RECVPKTINFO);
break;
case IP_RECVRETOPTS:
optval = OPTBIT(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
optval = OPTBIT(INP_RECVDSTADDR);
break;
case IP_RECVIF:
optval = OPTBIT(INP_RECVIF);
break;
case IP_RECVTTL:
optval = OPTBIT(INP_RECVTTL);
break;
case IP_BINDANY:
optval = OPTBIT(INP_BINDANY);
break;
}
error = sockopt_setint(sopt, optval);
break;
case IP_PKTINFO:
switch (sopt->sopt_size) {
case sizeof(int):
/* Linux compatibility */
optval = OPTBIT(INP_RECVPKTINFO);
error = sockopt_setint(sopt, optval);
break;
case sizeof(struct in_pktinfo):
/* Solaris compatibility */
pktinfo.ipi_ifindex = 0;
pktinfo.ipi_addr = in4p_prefsrcip(inp);
error = sockopt_set(sopt, &pktinfo,
sizeof(pktinfo));
break;
default:
/*
* While the size is still 0, and also if the
* caller later doesn't use an exactly sized
* buffer for the data, default to Linux
* compatibility.
*/
optval = OPTBIT(INP_RECVPKTINFO);
error = sockopt_setint(sopt, optval);
break;
}
break;
#if 0 /* defined(IPSEC) */
case IP_IPSEC_POLICY:
{
struct mbuf *m = NULL;
/* XXX this will return EINVAL as sopt is empty */
error = ipsec_get_policy(inp, sopt->sopt_data,
sopt->sopt_size, &m);
if (error == 0)
error = sockopt_setmbuf(sopt, m);
break;
}
#endif /*IPSEC*/
case IP_MULTICAST_IF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
error = ip_getmoptions(inp->inp_moptions, sopt);
break;
case IP_PORTRANGE:
if (inpflags & INP_LOWPORT)
optval = IP_PORTRANGE_LOW;
else
optval = IP_PORTRANGE_DEFAULT;
error = sockopt_setint(sopt, optval);
break;
case IP_PORTALGO:
optval = inp->inp_portalgo;
error = sockopt_setint(sopt, optval);
break;
default:
error = ENOPROTOOPT;
break;
}
break;
}
if (!error) {
inp->inp_flags = inpflags;
}
return error;
}
static int
ip_pktinfo_prepare(const struct inpcb *inp, const struct in_pktinfo *pktinfo,
struct ip_pktopts *pktopts, int *flags, kauth_cred_t cred)
{
struct ip_moptions *imo;
int error = 0;
bool addrset = false;
if (!in_nullhost(pktinfo->ipi_addr)) {
pktopts->ippo_laddr.sin_addr = pktinfo->ipi_addr;
/* EADDRNOTAVAIL? */
error = inpcb_bindableaddr(inp, &pktopts->ippo_laddr, cred);
if (error != 0)
return error;
addrset = true;
}
if (pktinfo->ipi_ifindex != 0) {
if (!addrset) {
struct ifnet *ifp;
struct in_ifaddr *ia;
int s;
/* pick up primary address */
s = pserialize_read_enter();
ifp = if_byindex(pktinfo->ipi_ifindex);
if (ifp == NULL) {
pserialize_read_exit(s);
return EADDRNOTAVAIL;
}
ia = in_get_ia_from_ifp(ifp);
if (ia == NULL) {
pserialize_read_exit(s);
return EADDRNOTAVAIL;
}
pktopts->ippo_laddr.sin_addr = IA_SIN(ia)->sin_addr;
pserialize_read_exit(s);
}
/*
* If ipi_ifindex is specified, use a copied or
* locally initialized ip_moptions. The original
* ip_moptions must not be modified.
*/
imo = &pktopts->ippo_imobuf; /* local buf in pktopts */
if (pktopts->ippo_imo != NULL) {
memcpy(imo, pktopts->ippo_imo, sizeof(*imo));
} else {
memset(imo, 0, sizeof(*imo));
imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
}
imo->imo_multicast_if_index = pktinfo->ipi_ifindex;
pktopts->ippo_imo = imo;
*flags |= IP_ROUTETOIFINDEX;
}
return error;
}
/*
* Set up IP outgoing packet options. Even if control is NULL,
* pktopts->ippo_laddr and pktopts->ippo_imo are set and used.
*/
int
ip_setpktopts(struct mbuf *control, struct ip_pktopts *pktopts, int *flags,
struct inpcb *inp, kauth_cred_t cred)
{
struct cmsghdr *cm;
struct in_pktinfo pktinfo;
int error;
pktopts->ippo_imo = inp->inp_moptions;
struct in_addr *ia = in_nullhost(in4p_prefsrcip(inp)) ? &in4p_laddr(inp) :
&in4p_prefsrcip(inp);
sockaddr_in_init(&pktopts->ippo_laddr, ia, 0);
if (control == NULL)
return 0;
/*
* XXX: Currently, we assume all the optional information is
* stored in a single mbuf.
*/
if (control->m_next)
return EINVAL;
for (; control->m_len > 0;
control->m_data += CMSG_ALIGN(cm->cmsg_len),
control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
cm = mtod(control, struct cmsghdr *);
if ((control->m_len < sizeof(*cm)) ||
(cm->cmsg_len == 0) ||
(cm->cmsg_len > control->m_len)) {
return EINVAL;
}
if (cm->cmsg_level != IPPROTO_IP)
continue;
switch (cm->cmsg_type) {
case IP_PKTINFO:
if (cm->cmsg_len != CMSG_LEN(sizeof(pktinfo)))
return EINVAL;
memcpy(&pktinfo, CMSG_DATA(cm), sizeof(pktinfo));
error = ip_pktinfo_prepare(inp, &pktinfo, pktopts,
flags, cred);
if (error)
return error;
break;
case IP_SENDSRCADDR: /* FreeBSD compatibility */
if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr)))
return EINVAL;
pktinfo.ipi_ifindex = 0;
pktinfo.ipi_addr =
((struct in_pktinfo *)CMSG_DATA(cm))->ipi_addr;
error = ip_pktinfo_prepare(inp, &pktinfo, pktopts,
flags, cred);
if (error)
return error;
break;
default:
return ENOPROTOOPT;
}
}
return 0;
}
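/*
 * Illustrative sketch (not from the original sources): the userland side of
 * ip_setpktopts() above, passing an IP_PKTINFO control message with
 * sendmsg(2) to select the outgoing interface and/or source address per
 * packet. Interface index 2 is an arbitrary assumption; msg_name and
 * msg_iov are assumed to be filled in by the caller.
 */
#if 0
static void
example_sendmsg_pktinfo(int s, struct msghdr *msg)
{
	static struct in_pktinfo pi = { .ipi_ifindex = 2 };
	static char cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
	struct cmsghdr *cm;

	msg->msg_control = cbuf;
	msg->msg_controllen = sizeof(cbuf);
	cm = CMSG_FIRSTHDR(msg);
	cm->cmsg_level = IPPROTO_IP;
	cm->cmsg_type = IP_PKTINFO;
	cm->cmsg_len = CMSG_LEN(sizeof(pi));
	memcpy(CMSG_DATA(cm), &pi, sizeof(pi));
	(void)sendmsg(s, msg, 0);
}
#endif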
/*
* Set up IP options in pcb for insertion in output packets.
* Store in mbuf with pointer in pcbopt, adding pseudo-option
* with destination address if source routed.
*/
static int
ip_pcbopts(struct inpcb *inp, const struct sockopt *sopt)
{
struct mbuf *m;
const u_char *cp;
u_char *dp;
int cnt;
KASSERT(inp_locked(inp));
/* Turn off any old options. */
if (inp->inp_options) {
m_free(inp->inp_options);
}
inp->inp_options = NULL;
if ((cnt = sopt->sopt_size) == 0) {
/* Only turning off any previous options. */
return 0;
}
cp = sopt->sopt_data;
if (cnt % 4) {
/* Must be 4-byte aligned, because there's no padding. */
return EINVAL;
}
m = m_get(M_DONTWAIT, MT_SOOPTS);
if (m == NULL)
return ENOBUFS;
dp = mtod(m, u_char *);
memset(dp, 0, sizeof(struct in_addr));
dp += sizeof(struct in_addr);
m->m_len = sizeof(struct in_addr);
/*
* IP option list according to RFC791. Each option is of the form
*
* [optval] [olen] [(olen - 2) data bytes]
*
* We validate the list and copy options to an mbuf for prepending
* to data packets. The IP first-hop destination address will be
* stored before actual options and is zero if unset.
*/
while (cnt > 0) {
uint8_t optval, olen, offset;
optval = cp[IPOPT_OPTVAL];
if (optval == IPOPT_EOL || optval == IPOPT_NOP) {
olen = 1;
} else {
if (cnt < IPOPT_OLEN + 1)
goto bad;
olen = cp[IPOPT_OLEN];
if (olen < IPOPT_OLEN + 1 || olen > cnt)
goto bad;
}
if (optval == IPOPT_LSRR || optval == IPOPT_SSRR) {
/*
* user process specifies route as:
* ->A->B->C->D
* D must be our final destination (but we can't
* check that since we may not have connected yet).
* A is first hop destination, which doesn't appear in
* actual IP option, but is stored before the options.
*/
if (olen < IPOPT_OFFSET + 1 + sizeof(struct in_addr))
goto bad;
offset = cp[IPOPT_OFFSET];
memcpy(mtod(m, u_char *), cp + IPOPT_OFFSET + 1,
sizeof(struct in_addr));
cp += sizeof(struct in_addr);
cnt -= sizeof(struct in_addr);
olen -= sizeof(struct in_addr);
if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr))
goto bad;
memcpy(dp, cp, olen);
dp[IPOPT_OPTVAL] = optval;
dp[IPOPT_OLEN] = olen;
dp[IPOPT_OFFSET] = offset;
} else {
if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr))
goto bad;
memcpy(dp, cp, olen);
}
dp += olen;
m->m_len += olen;
if (optval == IPOPT_EOL)
break;
cp += olen;
cnt -= olen;
}
inp->inp_options = m;
return 0;
bad:
(void)m_free(m);
return EINVAL;
}
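/*
 * Illustrative sketch (not from the original sources): a userland
 * IP_OPTIONS buffer of the kind parsed by ip_pcbopts() above. The list must
 * be a multiple of 4 bytes; for LSRR the first gateway becomes the
 * first-hop destination and is stripped into the in_addr stored before the
 * options. The addresses are documentation-prefix examples.
 */
#if 0
static const uint8_t example_lsrr_opts[] = {
	IPOPT_NOP,			/* 1 byte of padding */
	IPOPT_LSRR, 11, IPOPT_MINOFF,	/* type, length, pointer */
	192, 0, 2, 1,			/* gateway A: first hop, stripped */
	198, 51, 100, 1,		/* gateway B: remains in the option */
};
/* setsockopt(s, IPPROTO_IP, IP_OPTIONS, example_lsrr_opts, sizeof(example_lsrr_opts)); */
#endif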
/*
* Following RFC 1724 section 3.3, addresses within 0.0.0.0/8 are interpreted as an interface index.
* Must be called in a pserialize critical section.
*/
static struct ifnet *
ip_multicast_if(struct in_addr *a, int *ifindexp)
{
int ifindex;
struct ifnet *ifp = NULL;
struct in_ifaddr *ia;
if (ifindexp)
*ifindexp = 0;
if (ntohl(a->s_addr) >> 24 == 0) {
ifindex = ntohl(a->s_addr) & 0xffffff;
ifp = if_byindex(ifindex);
if (!ifp)
return NULL;
if (ifindexp)
*ifindexp = ifindex;
} else {
IN_ADDRHASH_READER_FOREACH(ia, a->s_addr) {
if (in_hosteq(ia->ia_addr.sin_addr, *a) &&
(ia->ia_ifp->if_flags & IFF_MULTICAST) != 0) {
ifp = ia->ia_ifp;
if (if_is_deactivated(ifp))
ifp = NULL;
break;
}
}
}
return ifp;
}
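/*
 * Illustrative sketch (not from the original sources): selecting the
 * outgoing multicast interface by index using the 0.0.0.0/8 encoding
 * accepted by ip_multicast_if() above. The interface name "wm0" is an
 * assumption for the example.
 */
#if 0
static int
example_set_multicast_if_by_index(int s)
{
	struct in_addr ifsel;
	unsigned int idx = if_nametoindex("wm0");

	if (idx == 0)
		return -1;
	/* high byte zero, interface index in the low 24 bits */
	ifsel.s_addr = htonl(idx & 0x00ffffff);
	return setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &ifsel, sizeof(ifsel));
}
#endif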
static int
ip_getoptval(const struct sockopt *sopt, u_int8_t *val, u_int maxval)
{
u_int tval;
u_char cval;
int error;
if (sopt == NULL)
return EINVAL;
switch (sopt->sopt_size) {
case sizeof(u_char):
error = sockopt_get(sopt, &cval, sizeof(u_char));
tval = cval;
break;
case sizeof(u_int):
error = sockopt_get(sopt, &tval, sizeof(u_int));
break;
default:
error = EINVAL;
}
if (error)
return error;
if (tval > maxval)
return EINVAL;
*val = tval;
return 0;
}
static int
ip_get_membership(const struct sockopt *sopt, struct ifnet **ifp,
struct psref *psref, struct in_addr *ia, bool add)
{
int error;
struct ip_mreq mreq;
error = sockopt_get(sopt, &mreq, sizeof(mreq));
if (error)
return error;
if (!IN_MULTICAST(mreq.imr_multiaddr.s_addr))
return EINVAL;
memcpy(ia, &mreq.imr_multiaddr, sizeof(*ia));
if (in_nullhost(mreq.imr_interface)) {
union {
struct sockaddr dst;
struct sockaddr_in dst4;
} u;
struct route ro;
if (!add) {
*ifp = NULL;
return 0;
}
/*
* If no interface address was provided, use the interface of
* the route to the given multicast address.
*/
struct rtentry *rt;
memset(&ro, 0, sizeof(ro));
sockaddr_in_init(&u.dst4, ia, 0);
error = rtcache_setdst(&ro, &u.dst);
if (error != 0)
return error;
*ifp = (rt = rtcache_init(&ro)) != NULL ? rt->rt_ifp : NULL;
if (*ifp != NULL) {
if (if_is_deactivated(*ifp))
*ifp = NULL;
else
if_acquire(*ifp, psref);
}
rtcache_unref(rt, &ro);
rtcache_free(&ro);
} else {
int s = pserialize_read_enter();
*ifp = ip_multicast_if(&mreq.imr_interface, NULL);
if (!add && *ifp == NULL) {
pserialize_read_exit(s);
return EADDRNOTAVAIL;
}
if (*ifp != NULL) {
if (if_is_deactivated(*ifp))
*ifp = NULL;
else
if_acquire(*ifp, psref);
}
pserialize_read_exit(s);
}
return 0;
}
/*
* Add a multicast group membership.
* Group must be a valid IP multicast address.
*/
static int
ip_add_membership(struct ip_moptions *imo, const struct sockopt *sopt)
{
struct ifnet *ifp = NULL; // XXX: gcc [ppc]
struct in_addr ia;
int i, error, bound;
struct psref psref;
/* imo is protected by solock or referenced only by the caller */
bound = curlwp_bind();
if (sopt->sopt_size == sizeof(struct ip_mreq))
error = ip_get_membership(sopt, &ifp, &psref, &ia, true);
else {
#ifdef INET6
error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia));
#else
error = EINVAL;
#endif
}
if (error)
goto out;
/*
* See if we found an interface, and confirm that it
* supports multicast.
*/
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
error = EADDRNOTAVAIL;
goto out;
}
/*
* See if the membership already exists or if all the
* membership slots are full.
*/
for (i = 0; i < imo->imo_num_memberships; ++i) {
if (imo->imo_membership[i]->inm_ifp == ifp &&
in_hosteq(imo->imo_membership[i]->inm_addr, ia))
break;
}
if (i < imo->imo_num_memberships) {
error = EADDRINUSE;
goto out;
}
if (i == IP_MAX_MEMBERSHIPS) {
error = ETOOMANYREFS;
goto out;
}
/*
* Everything looks good; add a new record to the multicast
* address list for the given interface.
*/
imo->imo_membership[i] = in_addmulti(&ia, ifp);
if (imo->imo_membership[i] == NULL) {
error = ENOBUFS;
goto out;
}
++imo->imo_num_memberships;
error = 0;
out:
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
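/*
 * Illustrative sketch (not from the original sources): joining a multicast
 * group from userland, which reaches ip_add_membership() above via
 * ip_setmoptions(). The group address is arbitrary; INADDR_ANY as
 * imr_interface lets the kernel pick the interface from the routing table.
 */
#if 0
static int
example_join_group(int s)
{
	struct ip_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.imr_multiaddr.s_addr = inet_addr("239.1.1.1");
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);
	return setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
}
#endif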
/*
* Drop a multicast group membership.
* Group must be a valid IP multicast address.
*/
static int
ip_drop_membership(struct ip_moptions *imo, const struct sockopt *sopt)
{
struct in_addr ia = { .s_addr = 0 }; // XXX: gcc [ppc]
struct ifnet *ifp = NULL; // XXX: gcc [ppc]
int i, error, bound;
struct psref psref;
/* imo is protected by solock or referenced only by the caller */
bound = curlwp_bind();
if (sopt->sopt_size == sizeof(struct ip_mreq))
error = ip_get_membership(sopt, &ifp, &psref, &ia, false);
else {
#ifdef INET6
error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia));
#else
error = EINVAL;
#endif
}
if (error)
goto out;
/*
* Find the membership in the membership array.
*/
for (i = 0; i < imo->imo_num_memberships; ++i) {
if ((ifp == NULL || imo->imo_membership[i]->inm_ifp == ifp) &&
in_hosteq(imo->imo_membership[i]->inm_addr, ia))
break;
}
if (i == imo->imo_num_memberships) {
error = EADDRNOTAVAIL;
goto out;
}
/*
* Give up the multicast address record to which the
* membership points.
*/
in_delmulti(imo->imo_membership[i]);
/*
* Remove the gap in the membership array.
*/
for (++i; i < imo->imo_num_memberships; ++i)
imo->imo_membership[i-1] = imo->imo_membership[i];
--imo->imo_num_memberships;
error = 0;
out:
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
/*
* Set the IP multicast options in response to user setsockopt().
*/
int
ip_setmoptions(struct ip_moptions **pimo, const struct sockopt *sopt)
{
struct ip_moptions *imo = *pimo;
struct in_addr addr;
struct ifnet *ifp;
int ifindex, error = 0;
/* If the passed imo isn't NULL, it should be protected by solock */
if (!imo) {
/*
* No multicast option buffer attached to the pcb;
* allocate one and initialize to default values.
*/
imo = kmem_intr_alloc(sizeof(*imo), KM_NOSLEEP);
if (imo == NULL)
return ENOBUFS;
imo->imo_multicast_if_index = 0;
imo->imo_multicast_addr.s_addr = INADDR_ANY;
imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
imo->imo_num_memberships = 0;
*pimo = imo;
}
switch (sopt->sopt_name) {
case IP_MULTICAST_IF: {
int s;
/*
* Select the interface for outgoing multicast packets.
*/
error = sockopt_get(sopt, &addr, sizeof(addr));
if (error)
break;
/*
* INADDR_ANY is used to remove a previous selection.
* When no interface is selected, a default one is
* chosen every time a multicast packet is sent.
*/
if (in_nullhost(addr)) {
imo->imo_multicast_if_index = 0;
break;
}
/*
* The selected interface is identified by its local
* IP address. Find the interface and confirm that
* it supports multicasting.
*/
s = pserialize_read_enter();
ifp = ip_multicast_if(&addr, &ifindex);
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
break;
}
imo->imo_multicast_if_index = ifp->if_index;
pserialize_read_exit(s);
if (ifindex)
imo->imo_multicast_addr = addr;
else
imo->imo_multicast_addr.s_addr = INADDR_ANY;
break;
}
case IP_MULTICAST_TTL:
/*
* Set the IP time-to-live for outgoing multicast packets.
*/
error = ip_getoptval(sopt, &imo->imo_multicast_ttl, MAXTTL);
break;
case IP_MULTICAST_LOOP:
/*
* Set the loopback flag for outgoing multicast packets.
* Must be zero or one.
*/
error = ip_getoptval(sopt, &imo->imo_multicast_loop, 1);
break;
case IP_ADD_MEMBERSHIP: /* IPV6_JOIN_GROUP */
error = ip_add_membership(imo, sopt);
break;
case IP_DROP_MEMBERSHIP: /* IPV6_LEAVE_GROUP */
error = ip_drop_membership(imo, sopt);
break;
default:
error = EOPNOTSUPP;
break;
}
/*
* If all options have default values, no need to keep the structure.
*/
if (imo->imo_multicast_if_index == 0 &&
imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
imo->imo_num_memberships == 0) {
kmem_intr_free(imo, sizeof(*imo));
*pimo = NULL;
}
return error;
}
/*
* Return the IP multicast options in response to user getsockopt().
*/
int
ip_getmoptions(struct ip_moptions *imo, struct sockopt *sopt)
{
struct in_addr addr;
uint8_t optval;
int error = 0;
/* imo is protected by solock or referenced only by the caller */
switch (sopt->sopt_name) {
case IP_MULTICAST_IF:
if (imo == NULL || imo->imo_multicast_if_index == 0)
addr = zeroin_addr;
else if (imo->imo_multicast_addr.s_addr) {
/* return the value user has set */
addr = imo->imo_multicast_addr;
} else {
struct ifnet *ifp;
struct in_ifaddr *ia = NULL;
int s = pserialize_read_enter();
ifp = if_byindex(imo->imo_multicast_if_index);
if (ifp != NULL) {
ia = in_get_ia_from_ifp(ifp);
}
addr = ia ? ia->ia_addr.sin_addr : zeroin_addr;
pserialize_read_exit(s);
}
error = sockopt_set(sopt, &addr, sizeof(addr));
break;
case IP_MULTICAST_TTL:
optval = imo ? imo->imo_multicast_ttl
: IP_DEFAULT_MULTICAST_TTL;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
case IP_MULTICAST_LOOP:
optval = imo ? imo->imo_multicast_loop
: IP_DEFAULT_MULTICAST_LOOP;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
default:
error = EOPNOTSUPP;
}
return error;
}
/*
* Discard the IP multicast options.
*/
void
ip_freemoptions(struct ip_moptions *imo)
{
int i;
/* The owner of imo (inp) should be protected by solock */
if (imo != NULL) {
for (i = 0; i < imo->imo_num_memberships; ++i) {
struct in_multi *inm = imo->imo_membership[i];
in_delmulti(inm);
/* ifp should not leave thanks to solock */
}
kmem_intr_free(imo, sizeof(*imo));
}
}
/*
* Routine called from ip_output() to loop back a copy of an IP multicast
* packet to the input queue of a specified interface. Note that this
* calls the output routine of the loopback "driver", but with an interface
* pointer that might NOT be lo0ifp -- easier than replicating that code here.
*/
static void
ip_mloopback(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in *dst)
{
struct ip *ip;
struct mbuf *copym;
copym = m_copypacket(m, M_DONTWAIT);
if (copym != NULL &&
(copym->m_flags & M_EXT || copym->m_len < sizeof(struct ip)))
copym = m_pullup(copym, sizeof(struct ip));
if (copym == NULL)
return;
/*
* We don't bother to fragment if the IP length is greater
* than the interface's MTU. Can this possibly matter?
*/
ip = mtod(copym, struct ip *);
if (copym->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
in_undefer_cksum_tcpudp(copym);
copym->m_pkthdr.csum_flags &=
~(M_CSUM_TCPv4|M_CSUM_UDPv4);
}
ip->ip_sum = 0;
ip->ip_sum = in_cksum(copym, ip->ip_hl << 2);
KERNEL_LOCK_UNLESS_NET_MPSAFE();
(void)looutput(ifp, copym, sintocsa(dst), NULL);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
/*
* Ensure sending address is valid.
* Returns 0 on success, -1 if an error should be sent back or 1
* if the packet could be dropped without error (protocol dependent).
*/
static int
ip_ifaddrvalid(const struct in_ifaddr *ia)
{
if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY)
return 0;
if (ia->ia4_flags & IN_IFF_DUPLICATED)
return -1;
else if (ia->ia4_flags & (IN_IFF_TENTATIVE | IN_IFF_DETACHED))
return 1;
return 0;
}
/* $NetBSD: if_43.c,v 1.27 2023/03/30 17:48:10 riastradh Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_43.c,v 1.27 2023/03/30 17:48:10 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/mbuf.h> /* for MLEN */
#include <sys/protosw.h>
#include <sys/compat_stub.h>
#include <sys/syscallargs.h>
#include <net/if.h>
#include <net/bpf.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <net/if_gre.h>
#include <net/if_tap.h>
#include <net80211/ieee80211_ioctl.h>
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#include <compat/net/if.h>
#include <compat/sys/socket.h>
#include <compat/sys/sockio.h>
#include <compat/common/compat_util.h>
#include <compat/common/compat_mod.h>
#include <uvm/uvm_extern.h>
#if defined(COMPAT_43)
/*
* Use a wrapper so that compat_cvtcmd() can return a u_long
*/
static int
do_compat_cvtcmd(u_long *ncmd, u_long ocmd)
{
*ncmd = compat_cvtcmd(ocmd);
return 0;
}
u_long
compat_cvtcmd(u_long cmd)
{
u_long ncmd;
if (IOCPARM_LEN(cmd) != sizeof(struct oifreq))
return cmd;
switch (cmd) {
case OSIOCSIFADDR:
return SIOCSIFADDR;
case OOSIOCGIFADDR:
return SIOCGIFADDR;
case OSIOCSIFDSTADDR:
return SIOCSIFDSTADDR;
case OOSIOCGIFDSTADDR:
return SIOCGIFDSTADDR;
case OSIOCSIFFLAGS:
return SIOCSIFFLAGS;
case OSIOCGIFFLAGS:
return SIOCGIFFLAGS;
case OOSIOCGIFBRDADDR:
return SIOCGIFBRDADDR;
case OSIOCSIFBRDADDR:
return SIOCSIFBRDADDR;
case OOSIOCGIFCONF:
return SIOCGIFCONF;
case OOSIOCGIFNETMASK:
return SIOCGIFNETMASK;
case OSIOCSIFNETMASK:
return SIOCSIFNETMASK;
case OSIOCGIFCONF:
return SIOCGIFCONF;
case OSIOCADDMULTI:
return SIOCADDMULTI;
case OSIOCDELMULTI:
return SIOCDELMULTI;
case SIOCSIFMEDIA_43:
return SIOCSIFMEDIA_80;
case OSIOCGIFMTU:
return SIOCGIFMTU;
case OSIOCGIFDATA:
return SIOCGIFDATA;
case OSIOCZIFDATA:
return SIOCZIFDATA;
case OBIOCGETIF:
return BIOCGETIF;
case OBIOCSETIF:
return BIOCSETIF;
case OTAPGIFNAME:
return TAPGIFNAME;
default:
/*
* XXX: the following code should be removed and the
* ioctls that need special treatment should be moved
* to the switch above.
*/
ncmd = ((cmd) & ~(IOCPARM_MASK << IOCPARM_SHIFT)) |
(sizeof(struct ifreq) << IOCPARM_SHIFT);
switch (ncmd) {
case BIOCGETIF:
case BIOCSETIF:
case GREDSOCK:
case GREGADDRD:
case GREGADDRS:
case GREGPROTO:
case GRESADDRD:
case GRESADDRS:
case GRESPROTO:
case GRESSOCK:
case SIOCADDMULTI:
case SIOCDELMULTI:
case SIOCDIFADDR:
case SIOCDIFADDR_IN6:
case SIOCDIFPHYADDR:
case SIOCG80211NWID:
case SIOCG80211STATS:
case SIOCG80211ZSTATS:
case SIOCGIFADDR:
case SIOCGIFADDR_IN6:
case SIOCGIFAFLAG_IN6:
case SIOCGIFALIFETIME_IN6:
case SIOCGIFBRDADDR:
case SIOCGIFDLT:
case SIOCGIFDSTADDR:
case SIOCGIFDSTADDR_IN6:
case SIOCGIFFLAGS:
case SIOCGIFGENERIC:
case SIOCGIFMETRIC:
case SIOCGIFMTU:
case SIOCGIFNETMASK:
case SIOCGIFNETMASK_IN6:
case SIOCGIFPDSTADDR:
case SIOCGIFPDSTADDR_IN6:
case SIOCGIFPSRCADDR:
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFSTAT_ICMP6:
case SIOCGIFSTAT_IN6:
case SIOCGVH:
case SIOCIFCREATE:
case SIOCIFDESTROY:
case SIOCS80211NWID:
case SIOCSIFADDR:
case SIOCSIFADDR_IN6:
case SIOCSIFBRDADDR:
case SIOCSIFDSTADDR:
case SIOCSIFDSTADDR_IN6:
case SIOCSIFFLAGS:
case SIOCSIFGENERIC:
case SIOCSIFMEDIA:
case SIOCSIFMETRIC:
case SIOCSIFMTU:
case SIOCSIFNETMASK:
case SIOCSIFNETMASK_IN6:
case SIOCSVH:
case TAPGIFNAME:
return ncmd;
default:
{ int rv;
MODULE_HOOK_CALL(if43_cvtcmd_20_hook, (ncmd), enosys(),
rv);
if (rv == 0)
return ncmd;
return cmd;
}
}
}
}
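/*
 * Illustrative sketch (not compiled): the expected calling pattern for
 * compat_cvtcmd().  The helper name is made up for the example;
 * compat_ifioctl() below performs the same check on the real path.
 */
#if 0
static bool
example_is_oifreq_cmd(u_long ocmd)
{
/* The command word changes iff it was an oifreq-sized 4.3BSD request. */
return compat_cvtcmd(ocmd) != ocmd;
}
#endif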
int
compat_ifioctl(struct socket *so, u_long ocmd, u_long cmd, void *data,
struct lwp *l)
{
int error;
struct ifreq *ifr = (struct ifreq *)data;
struct ifreq ifrb;
struct oifreq *oifr = NULL;
struct ifnet *ifp;
struct sockaddr *sa;
struct psref psref;
int bound = curlwp_bind();
ifp = if_get(ifr->ifr_name, &psref);
if (ifp == NULL) {
curlwp_bindx(bound);
return ENXIO;
}
/*
* If the command has not been converted yet, make sure that it is
* (the upper layer handles old socket calls, but not oifreq
* calls).
*/
if (cmd == ocmd) {
cmd = compat_cvtcmd(ocmd);
}
if (cmd != ocmd) {
oifr = data;
ifr = &ifrb;
IFREQO2N_43(oifr, ifr);
}
switch (ocmd) {
enum { maxlen = sizeof(oifr->ifr_ifru) };
CTASSERT(maxlen == 16);
socklen_t famlen;
case OSIOCSIFADDR:
case OSIOCSIFDSTADDR:
case OSIOCSIFBRDADDR:
case OSIOCSIFNETMASK:
sa = &ifr->ifr_addr;
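/*
 * Background for the fixup below: the 4.3BSD struct sockaddr had no
 * sa_len and a 16-bit sa_family (roughly "u_int16_t sa_family; char
 * sa_data[14];").  Reinterpreted through the modern layout, the old
 * family's low byte lands in sa_len on little-endian hosts (leaving
 * sa_family zero), while big-endian hosts see the family in the right
 * place but a zero sa_len; both cases are patched up here before the
 * length is validated.
 */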
#if BYTE_ORDER != BIG_ENDIAN
if (sa->sa_family == 0 && sa->sa_len < maxlen) {
sa->sa_family = sa->sa_len;
sa->sa_len = maxlen;
}
#else
if (sa->sa_len == 0)
sa->sa_len = maxlen;
#endif
famlen = sockaddr_getsize_by_family(sa->sa_family);
if (famlen > sa->sa_len) {
curlwp_bindx(bound);
return EAFNOSUPPORT;
}
break;
}
error = (*so->so_proto->pr_usrreqs->pr_ioctl)(so, cmd, ifr, ifp);
if_put(ifp, &psref);
curlwp_bindx(bound);
switch (ocmd) {
case OOSIOCGIFADDR:
case OOSIOCGIFDSTADDR:
case OOSIOCGIFBRDADDR:
case OOSIOCGIFNETMASK:
*(u_int16_t *)&ifr->ifr_addr =
((struct sockaddr *)&ifr->ifr_addr)->sa_family;
break;
}
if (cmd != ocmd)
IFREQN2O_43(oifr, ifr);
return error;
}
int
if_43_init(void)
{
MODULE_HOOK_SET(if_cvtcmd_43_hook, do_compat_cvtcmd);
MODULE_HOOK_SET(if_ifioctl_43_hook, compat_ifioctl);
return 0;
}
int
if_43_fini(void)
{
MODULE_HOOK_UNSET(if_cvtcmd_43_hook);
MODULE_HOOK_UNSET(if_ifioctl_43_hook);
return 0;
}
#endif /* defined(COMPAT_43) */
/* $NetBSD: tcp_sack.c,v 1.36 2018/05/18 18:58:51 maxv Exp $ */
/*
* Copyright (c) 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Kentaro A. Kurahone.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
* $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $
*/
/*
* @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.36 2018/05/18 18:58:51 maxv Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_inet_csum.h"
#include "opt_tcp_debug.h"
#include "opt_ddb.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/route.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#endif
#ifndef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>
/* SACK block pool. */
static struct pool sackhole_pool;
void
tcp_sack_init(void)
{
pool_init(&sackhole_pool, sizeof(struct sackhole), 0, 0, 0,
"sackholepl", NULL, IPL_SOFTNET);
}
static struct sackhole *
sack_allochole(struct tcpcb *tp)
{
struct sackhole *hole;
if (tp->snd_numholes >= tcp_sack_tp_maxholes ||
tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
return NULL;
}
hole = pool_get(&sackhole_pool, PR_NOWAIT);
if (hole == NULL) {
return NULL;
}
tp->snd_numholes++;
tcp_sack_globalholes++;
return hole;
}
static struct sackhole *
sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end,
struct sackhole *prev)
{
struct sackhole *hole;
hole = sack_allochole(tp);
if (hole == NULL) {
return NULL;
}
hole->start = hole->rxmit = start;
hole->end = end;
if (prev != NULL) {
TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q);
} else {
TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q);
}
return hole;
}
static struct sackhole *
sack_removehole(struct tcpcb *tp, struct sackhole *hole)
{
struct sackhole *next;
next = TAILQ_NEXT(hole, sackhole_q);
tp->snd_numholes--;
tcp_sack_globalholes--;
TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q);
pool_put(&sackhole_pool, hole);
return next;
}
/*
* tcp_new_dsack: record the reception of a duplicated segment.
*/
void
tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len)
{
if (TCP_SACK_ENABLED(tp)) {
tp->rcv_dsack_block.left = seq;
tp->rcv_dsack_block.right = seq + len;
tp->rcv_sack_flags |= TCPSACK_HAVED;
}
}
/*
* tcp_sack_option: parse the given SACK option and update the scoreboard.
*/
void
tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp,
int optlen)
{
struct sackblk
t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)];
struct sackblk *sack = NULL;
struct sackhole *cur = NULL;
struct sackhole *tmp = NULL;
const char *lp = cp + 2;
int i, j, num_sack_blks;
tcp_seq left, right, acked;
/*
* If we aren't processing SACK responses, this is not an ACK,
* or the peer sent us a SACK option with an invalid length,
* don't update the scoreboard.
*/
if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) ||
(optlen % 8 != 2 || optlen < 10)) {
return;
}
/*
* If we don't want any SACK holes to be allocated, just return.
*/
if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) {
return;
}
/* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */
if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))
return;
/*
* Extract SACK blocks.
*
* Note that t_sack_block is sorted so that we only need to do
* one pass over the sequence number space. (SACK "fast-path")
*/
num_sack_blks = optlen / 8;
acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una;
for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) {
memcpy(&left, lp, sizeof(uint32_t));
memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t));
left = ntohl(left);
right = ntohl(right);
if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) ||
SEQ_GEQ(left, right)) {
/* SACK entry that's old, or invalid. */
i--;
num_sack_blks--;
continue;
}
/* Insertion sort. */
for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left);
j--) {
t_sack_block[j].left = t_sack_block[j - 1].left;
t_sack_block[j].right = t_sack_block[j - 1].right;
}
t_sack_block[j].left = left;
t_sack_block[j].right = right;
}
/* Update the scoreboard. */
cur = TAILQ_FIRST(&tp->snd_holes);
for (i = 0; i < num_sack_blks; i++) {
sack = &t_sack_block[i];
/*
* FACK TCP. Update snd_fack so we can enter Fast
* Recovery early.
*/
if (SEQ_GEQ(sack->right, tp->snd_fack))
tp->snd_fack = sack->right;
if (TAILQ_EMPTY(&tp->snd_holes)) {
/* First hole. */
cur = sack_inserthole(tp, th->th_ack, sack->left, NULL);
if (cur == NULL) {
/* ENOBUFS, bail out. */
return;
}
tp->rcv_lastsack = sack->right;
continue; /* With next sack block */
}
/* Go through the list of holes. */
while (cur) {
if (SEQ_LEQ(sack->right, cur->start))
/* SACKs data before the current hole */
break; /* No use going through more holes */
if (SEQ_GEQ(sack->left, cur->end)) {
/* SACKs data beyond the current hole */
cur = TAILQ_NEXT(cur, sackhole_q);
continue;
}
if (SEQ_LEQ(sack->left, cur->start)) {
/* Data acks at least the beginning of hole */
if (SEQ_GEQ(sack->right, cur->end)) {
/* Acks entire hole, so delete hole */
cur = sack_removehole(tp, cur);
break;
}
/* Otherwise, move start of hole forward */
cur->start = sack->right;
cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
break;
}
if (SEQ_GEQ(sack->right, cur->end)) {
/* Move end of hole backward. */
cur->end = sack->left;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
cur = TAILQ_NEXT(cur, sackhole_q);
break;
}
if (SEQ_LT(cur->start, sack->left) &&
SEQ_GT(cur->end, sack->right)) {
/*
* ACKs some data in middle of a hole; need to
* split current hole
*/
tmp = sack_inserthole(tp, sack->right, cur->end,
cur);
if (tmp == NULL) {
return;
}
tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start);
cur->end = sack->left;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
cur = tmp;
break;
}
}
/* At this point, we have reached the tail of the list. */
if (SEQ_LT(tp->rcv_lastsack, sack->left)) {
/*
* Need to append new hole at end.
*/
cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left,
NULL);
if (cur == NULL) {
return;
}
}
if (SEQ_LT(tp->rcv_lastsack, sack->right)) {
tp->rcv_lastsack = sack->right;
}
}
}
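/*
 * Worked example (illustrative numbers): with snd_una = 100 and a
 * single hole [100, 500), a SACK block for [200, 300) acks data in the
 * middle of the hole, so the hole is split: the existing hole is
 * trimmed to [100, 200) and a new hole [300, 500) is inserted after it,
 * with its rxmit point taken from the old hole (clamped to the new
 * start).  rcv_lastsack then advances to 300 if it was smaller.
 */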
/*
* tcp_del_sackholes: remove holes covered by a cumulative ACK.
*/
void
tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th)
{
/* Max because this could be an older ack that just arrived. */
tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
th->th_ack : tp->snd_una;
struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
while (cur) {
if (SEQ_LEQ(cur->end, lastack)) {
cur = sack_removehole(tp, cur);
} else if (SEQ_LT(cur->start, lastack)) {
cur->start = lastack;
if (SEQ_LT(cur->rxmit, cur->start))
cur->rxmit = cur->start;
break;
} else
break;
}
}
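/*
 * Worked example (illustrative numbers): with holes [100, 200) and
 * [250, 300) on the scoreboard, a cumulative ACK of 260 removes the
 * first hole entirely and trims the second to [260, 300), pulling its
 * rxmit point forward if it lagged behind.
 */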
/*
* tcp_free_sackholes: clear the scoreboard.
*/
void
tcp_free_sackholes(struct tcpcb *tp)
{
struct sackhole *sack;
/* Free up the SACK hole list. */
while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) {
sack_removehole(tp, sack);
}
KASSERT(tp->snd_numholes == 0);
}
/*
* Returns pointer to a sackhole if there are any pending retransmissions;
* NULL otherwise.
*/
struct sackhole *
tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
{
struct sackhole *cur = NULL;
if (!TCP_SACK_ENABLED(tp))
return (NULL);
*sack_bytes_rexmt = 0;
TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
if (SEQ_LT(cur->rxmit, cur->end)) {
if (SEQ_LT(cur->rxmit, tp->snd_una)) {
/* old SACK hole */
continue;
}
*sack_bytes_rexmt += (cur->rxmit - cur->start);
break;
}
*sack_bytes_rexmt += (cur->rxmit - cur->start);
}
return (cur);
}
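/*
 * Illustrative sketch (not compiled) of how an output path might
 * consume the result; the function name is made up for the example,
 * the real consumer is the TCP output code.
 */
#if 0
static void
example_sack_rexmit(struct tcpcb *tp)
{
int sack_bytes_rexmt;
struct sackhole *p;

p = tcp_sack_output(tp, &sack_bytes_rexmt);
if (p == NULL)
return;
/*
 * sack_bytes_rexmt now reports how much has already been
 * retransmitted into the holes.  Retransmit starting at p->rxmit,
 * at most p->end - p->rxmit bytes, then advance p->rxmit by the
 * amount actually sent.
 */
}
#endif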
/*
* After a timeout, the SACK list may be rebuilt. This SACK information
* should be used to avoid retransmitting SACKed data. This function
* traverses the SACK list to see if snd_nxt should be moved forward.
*/
void
tcp_sack_adjust(struct tcpcb *tp)
{
struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
struct sackhole *n = NULL;
if (TAILQ_EMPTY(&tp->snd_holes))
return; /* No holes */
if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
return; /* We're already beyond any SACKed blocks */
/*
* Two cases for which we want to advance snd_nxt:
* i) snd_nxt lies between end of one hole and beginning of another
* ii) snd_nxt lies between end of last hole and rcv_lastsack
*/
while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) {
if (SEQ_LT(tp->snd_nxt, cur->end))
return;
if (SEQ_GEQ(tp->snd_nxt, n->start))
cur = n;
else {
tp->snd_nxt = n->start;
return;
}
}
if (SEQ_LT(tp->snd_nxt, cur->end))
return;
tp->snd_nxt = tp->rcv_lastsack;
return;
}
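/*
 * Worked example (illustrative numbers): with holes [100, 140) and
 * [300, 340) and rcv_lastsack = 400, a snd_nxt of 150 lies beyond the
 * end of the first hole but before the start of the next, so it is
 * advanced to 300; if there were no further hole, it would be advanced
 * to rcv_lastsack instead.
 */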
/*
* tcp_sack_numblks: return the number of SACK blocks to send.
*/
int
tcp_sack_numblks(const struct tcpcb *tp)
{
int numblks;
if (!TCP_SACK_ENABLED(tp)) {
return 0;
}
numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) +
tp->t_segqlen;
if (numblks == 0) {
return 0;
}
if (numblks > TCP_SACK_MAX) {
numblks = TCP_SACK_MAX;
}
return numblks;
}
#if defined(DDB)
void sack_dump(const struct tcpcb *);
void
sack_dump(const struct tcpcb *tp)
{
const struct sackhole *cur;
printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n",
tp->snd_una, tp->snd_max);
printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n",
tp->rcv_lastsack, tp->snd_fack);
printf("numholes=%d\n", tp->snd_numholes);
TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n",
cur->start, cur->end, cur->rxmit);
}
}
#endif /* defined(DDB) */
/* $NetBSD: rtsock_50.c,v 1.16 2020/01/29 05:47:12 thorpej Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)rtsock.c 8.7 (Berkeley) 10/12/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rtsock_50.c,v 1.16 2020/01/29 05:47:12 thorpej Exp $");
#define COMPAT_RTSOCK /* Use the COMPATNAME/COMPATCALL macros and the
* various other compat definitions - see
* sys/net/rtsock_shared.c for details
*/
#include <net/rtsock_shared.c>
#include <compat/net/route_50.h>
static struct sysctllog *clog;
void
compat_50_rt_oifmsg(struct ifnet *ifp)
{
struct if_msghdr50 oifm;
struct if_data ifi;
struct mbuf *m;
struct rt_addrinfo info;
if (COMPATNAME(route_info).ri_cb.any_count == 0)
return;
(void)memset(&info, 0, sizeof(info));
(void)memset(&oifm, 0, sizeof(oifm));
if_export_if_data(ifp, &ifi, false);
oifm.ifm_index = ifp->if_index;
oifm.ifm_flags = ifp->if_flags;
oifm.ifm_data.ifi_type = ifi.ifi_type;
oifm.ifm_data.ifi_addrlen = ifi.ifi_addrlen;
oifm.ifm_data.ifi_hdrlen = ifi.ifi_hdrlen;
oifm.ifm_data.ifi_link_state = ifi.ifi_link_state;
oifm.ifm_data.ifi_mtu = ifi.ifi_mtu;
oifm.ifm_data.ifi_metric = ifi.ifi_metric;
oifm.ifm_data.ifi_baudrate = ifi.ifi_baudrate;
oifm.ifm_data.ifi_ipackets = ifi.ifi_ipackets;
oifm.ifm_data.ifi_ierrors = ifi.ifi_ierrors;
oifm.ifm_data.ifi_opackets = ifi.ifi_opackets;
oifm.ifm_data.ifi_oerrors = ifi.ifi_oerrors;
oifm.ifm_data.ifi_collisions = ifi.ifi_collisions;
oifm.ifm_data.ifi_ibytes = ifi.ifi_ibytes;
oifm.ifm_data.ifi_obytes = ifi.ifi_obytes;
oifm.ifm_data.ifi_imcasts = ifi.ifi_imcasts;
oifm.ifm_data.ifi_omcasts = ifi.ifi_omcasts;
oifm.ifm_data.ifi_iqdrops = ifi.ifi_iqdrops;
oifm.ifm_data.ifi_noproto = ifi.ifi_noproto;
TIMESPEC_TO_TIMEVAL(&oifm.ifm_data.ifi_lastchange,
&ifi.ifi_lastchange);
oifm.ifm_addrs = 0;
m = COMPATNAME(rt_msg1)(RTM_OIFINFO, &info, (void *)&oifm, sizeof(oifm));
if (m == NULL)
return;
COMPATNAME(route_enqueue)(m, 0);
}
int
compat_50_iflist(struct ifnet *ifp, struct rt_walkarg *w,
struct rt_addrinfo *info, size_t len)
{
struct if_msghdr50 *ifm;
struct if_data ifi;
int error;
ifm = (struct if_msghdr50 *)w->w_tmem;
if_export_if_data(ifp, &ifi, false);
ifm->ifm_index = ifp->if_index;
ifm->ifm_flags = ifp->if_flags;
ifm->ifm_data.ifi_type = ifi.ifi_type;
ifm->ifm_data.ifi_addrlen = ifi.ifi_addrlen;
ifm->ifm_data.ifi_hdrlen = ifi.ifi_hdrlen;
ifm->ifm_data.ifi_link_state = ifi.ifi_link_state;
ifm->ifm_data.ifi_mtu = ifi.ifi_mtu;
ifm->ifm_data.ifi_metric = ifi.ifi_metric;
ifm->ifm_data.ifi_baudrate = ifi.ifi_baudrate;
ifm->ifm_data.ifi_ipackets = ifi.ifi_ipackets;
ifm->ifm_data.ifi_ierrors = ifi.ifi_ierrors;
ifm->ifm_data.ifi_opackets = ifi.ifi_opackets;
ifm->ifm_data.ifi_oerrors = ifi.ifi_oerrors;
ifm->ifm_data.ifi_collisions = ifi.ifi_collisions;
ifm->ifm_data.ifi_ibytes = ifi.ifi_ibytes;
ifm->ifm_data.ifi_obytes = ifi.ifi_obytes;
ifm->ifm_data.ifi_imcasts = ifi.ifi_imcasts;
ifm->ifm_data.ifi_omcasts = ifi.ifi_omcasts;
ifm->ifm_data.ifi_iqdrops = ifi.ifi_iqdrops;
ifm->ifm_data.ifi_noproto = ifi.ifi_noproto;
TIMESPEC_TO_TIMEVAL(&ifm->ifm_data.ifi_lastchange,
&ifi.ifi_lastchange);
ifm->ifm_addrs = info->rti_addrs;
error = copyout(ifm, w->w_where, len);
if (error)
return error;
w->w_where = (char *)w->w_where + len;
return 0;
}
void
rtsock_50_init(void)
{
MODULE_HOOK_SET(rtsock_iflist_50_hook, compat_50_iflist);
MODULE_HOOK_SET(rtsock_oifmsg_50_hook, compat_50_rt_oifmsg);
MODULE_HOOK_SET(rtsock_rt_missmsg_50_hook, compat_50_rt_missmsg);
MODULE_HOOK_SET(rtsock_rt_ifmsg_50_hook, compat_50_rt_ifmsg);
MODULE_HOOK_SET(rtsock_rt_addrmsg_rt_50_hook, compat_50_rt_addrmsg_rt);
MODULE_HOOK_SET(rtsock_rt_addrmsg_src_50_hook,
compat_50_rt_addrmsg_src);
MODULE_HOOK_SET(rtsock_rt_addrmsg_50_hook, compat_50_rt_addrmsg);
MODULE_HOOK_SET(rtsock_rt_ifannouncemsg_50_hook,
compat_50_rt_ifannouncemsg);
MODULE_HOOK_SET(rtsock_rt_ieee80211msg_50_hook,
compat_50_rt_ieee80211msg);
sysctl_net_route_setup(&clog, PF_OROUTE, "ortable");
}
void
rtsock_50_fini(void)
{
sysctl_teardown(&clog);
MODULE_HOOK_UNSET(rtsock_iflist_50_hook);
MODULE_HOOK_UNSET(rtsock_oifmsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_missmsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_ifmsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_addrmsg_rt_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_addrmsg_src_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_addrmsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_ifannouncemsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_ieee80211msg_50_hook);
}
/* $NetBSD: sys_syscall.c,v 1.15 2022/06/29 16:33:09 hannken Exp $ */
/*-
* Copyright (c) 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by David Laight.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_syscall.c,v 1.15 2022/06/29 16:33:09 hannken Exp $");
#include <sys/syscall_stats.h>
#include <sys/syscallvar.h>
/*
* MI indirect system call support.
* Included from sys_indirect.c and compat/netbsd32/netbsd32_indirect.c
*
* SYS_SYSCALL is set to the required function name.
*/
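/*
 * For example (illustrative; the real definitions live in the including
 * files named above), an includer is expected to do something like
 *
 *	#define	SYS_SYSCALL	sys_syscall
 *	#include "sys_syscall.c"
 *
 * so that the handler below is instantiated under the desired name.
 */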
#define CONCAT(a,b) __CONCAT(a,b)
static void
CONCAT(SYS_SYSCALL, _biglockcheck)(struct proc *p, int code)
{
#ifdef DIAGNOSTIC
kpreempt_disable(); /* make curcpu() stable */
KASSERTMSG(curcpu()->ci_biglock_count == 0,
"syscall %ld of emul %s leaked %d kernel locks",
(long)code, p->p_emul->e_name, curcpu()->ci_biglock_count);
kpreempt_enable();
#endif
}
int
SYS_SYSCALL(struct lwp *l, const struct CONCAT(SYS_SYSCALL, _args) *uap,
register_t *rval)
{
/* {
syscallarg(int) code;
syscallarg(register_t) args[SYS_MAXSYSARGS];
} */
const struct sysent *callp;
struct proc *p = l->l_proc;
int code;
int error;
#ifdef NETBSD32_SYSCALL
register_t args64[SYS_MAXSYSARGS];
int i, narg;
#define TRACE_ARGS args64
#else
#define TRACE_ARGS &SCARG(uap, args[0])
#endif
callp = p->p_emul->e_sysent;
code = SCARG(uap, code) & (SYS_NSYSENT - 1);
SYSCALL_COUNT(syscall_counts, code);
callp += code;
if (__predict_false(callp->sy_flags & SYCALL_INDIRECT))
return ENOSYS;
if (__predict_true(!p->p_trace_enabled)) {
error = sy_call(callp, l, &uap->args, rval);
CONCAT(SYS_SYSCALL, _biglockcheck)(p, code);
return error;
}
#ifdef NETBSD32_SYSCALL
narg = callp->sy_narg;
for (i = 0; i < narg; i++)
args64[i] = SCARG(uap, args[i]);
#endif
error = trace_enter(code, callp, TRACE_ARGS);
if (__predict_true(error == 0))
error = sy_call(callp, l, &uap->args, rval);
trace_exit(code, callp, &uap->args, rval, error);
CONCAT(SYS_SYSCALL, _biglockcheck)(p, code);
return error;
#undef TRACE_ARGS
}
/* $NetBSD: union_vfsops.c,v 1.87 2023/02/13 08:39:40 hannken Exp $ */
/*
* Copyright (c) 1994 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95
*/
/*
* Copyright (c) 1994 Jan-Simon Pendry.
* All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95
*/
/*
* Union Layer
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: union_vfsops.c,v 1.87 2023/02/13 08:39:40 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
#include <sys/queue.h>
#include <sys/stat.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <miscfs/genfs/genfs.h>
#include <fs/union/union.h>
MODULE(MODULE_CLASS_VFS, union, NULL);
/*
* Mount union filesystem
*/
int
union_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
int error = 0;
struct union_args *args = data;
struct vnode *lowerrootvp = NULLVP;
struct vnode *upperrootvp = NULLVP;
struct union_mount *um = 0;
const char *cp;
char *xp;
int len;
size_t size;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args)
return EINVAL;
#ifdef UNION_DIAGNOSTIC
printf("%s(mp = %p)\n", __func__, mp);
#endif
if (mp->mnt_flag & MNT_GETARGS) {
um = MOUNTTOUNIONMOUNT(mp);
if (um == NULL)
return EIO;
args->target = NULL;
args->mntflags = um->um_op;
*data_len = sizeof *args;
return 0;
}
/*
* Update is a no-op
*/
if (mp->mnt_flag & MNT_UPDATE) {
/*
* Need to provide:
* 1. a way to convert between rdonly and rdwr mounts.
* 2. support for nfs exports.
*/
error = EOPNOTSUPP;
goto bad;
}
lowerrootvp = mp->mnt_vnodecovered;
vref(lowerrootvp);
/*
* Find upper node.
*/
error = namei_simple_user(args->target,
NSM_FOLLOW_NOEMULROOT, &upperrootvp);
if (error != 0)
goto bad;
if (upperrootvp->v_type != VDIR) {
error = EINVAL;
goto bad;
}
um = kmem_zalloc(sizeof(*um), KM_SLEEP);
/*
* Keep a held reference to the target vnodes.
* They are vrele'd in union_unmount.
*
* Depending on the _BELOW flag, the filesystems are
* viewed in a different order. In effect, this is the
* same as providing a mount under option to the mount syscall.
*/
um->um_op = args->mntflags & UNMNT_OPMASK;
switch (um->um_op) {
case UNMNT_ABOVE:
um->um_lowervp = lowerrootvp;
um->um_uppervp = upperrootvp;
break;
case UNMNT_BELOW:
um->um_lowervp = upperrootvp;
um->um_uppervp = lowerrootvp;
break;
case UNMNT_REPLACE:
vrele(lowerrootvp);
lowerrootvp = NULLVP;
um->um_uppervp = upperrootvp;
um->um_lowervp = lowerrootvp;
break;
default:
error = EINVAL;
goto bad;
}
/*
* This mount is mp-safe if both lower mounts are mp-safe.
*/
if (((um->um_lowervp == NULLVP) ||
(um->um_lowervp->v_mount->mnt_iflag & IMNT_MPSAFE)) &&
(um->um_uppervp->v_mount->mnt_iflag & IMNT_MPSAFE))
mp->mnt_iflag |= IMNT_MPSAFE;
/*
* Unless the mount is readonly, ensure that the top layer
* supports whiteout operations
*/
if ((mp->mnt_flag & MNT_RDONLY) == 0) {
static struct componentname nullcn = {
.cn_nameiop = LOOKUP,
.cn_cred = NOCRED
};
vn_lock(um->um_uppervp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_WHITEOUT(um->um_uppervp, &nullcn, LOOKUP);
VOP_UNLOCK(um->um_uppervp);
if (error)
goto bad;
}
um->um_cred = l->l_cred;
kauth_cred_hold(um->um_cred);
um->um_cmode = UN_DIRMODE &~ l->l_proc->p_cwdi->cwdi_cmask;
/*
* Depending on what you think the MNT_LOCAL flag might mean,
* you may want the && to be || on the conditional below.
* At the moment it has been defined that the filesystem is
* only local if it is all local, i.e. the MNT_LOCAL flag implies
* that the entire namespace is local. If you think the MNT_LOCAL
* flag implies that some of the files might be stored locally
* then you will want to change the conditional.
*/
if (um->um_op == UNMNT_ABOVE) {
if (((um->um_lowervp == NULLVP) ||
(um->um_lowervp->v_mount->mnt_flag & MNT_LOCAL)) &&
(um->um_uppervp->v_mount->mnt_flag & MNT_LOCAL))
mp->mnt_flag |= MNT_LOCAL;
}
/*
* Copy in the upper layer's RDONLY flag. This is for the benefit
* of lookup() which explicitly checks the flag, rather than asking
* the filesystem for its own opinion. This means that an update
* mount of the underlying filesystem from rdonly to rdwr
* will leave the unioned view read-only.
*/
mp->mnt_flag |= (um->um_uppervp->v_mount->mnt_flag & MNT_RDONLY);
mp->mnt_data = um;
vfs_getnewfsid(mp);
error = set_statvfs_info(path, UIO_USERSPACE, NULL, UIO_USERSPACE,
mp->mnt_op->vfs_name, mp, l);
if (error)
goto bad;
error = vfs_set_lowermount(mp, um->um_uppervp->v_mount);
if (error)
goto bad;
switch (um->um_op) {
case UNMNT_ABOVE:
cp = "<above>:";
break;
case UNMNT_BELOW:
cp = "<below>:";
break;
case UNMNT_REPLACE:
cp = "";
break;
default:
cp = "<invalid>:";
#ifdef DIAGNOSTIC
panic("%s: bad um_op", __func__);
#endif
break;
}
len = strlen(cp);
memcpy(mp->mnt_stat.f_mntfromname, cp, len);
xp = mp->mnt_stat.f_mntfromname + len;
len = MNAMELEN - len;
(void) copyinstr(args->target, xp, len - 1, &size);
memset(xp + size, 0, len - size);
#ifdef UNION_DIAGNOSTIC
printf("%s: from %s, on %s\n", __func__,
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
#endif
/* Set up the readdir hook if it's not set already. */
if (!vn_union_readdir_hook)
vn_union_readdir_hook = union_readdirhook;
return 0;
bad:
if (um) {
if (um->um_cred)
kauth_cred_free(um->um_cred);
kmem_free(um, sizeof(*um));
}
if (upperrootvp)
vrele(upperrootvp);
if (lowerrootvp)
vrele(lowerrootvp);
return error;
}
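/*
 * Illustrative sketch (not compiled): the mount arguments a userland
 * caller would pass in.  The paths are made up for the example.
 */
#if 0
struct union_args args = {
.target = "/new/upper/layer",	/* hypothetical upper directory */
.mntflags = UNMNT_ABOVE,	/* or UNMNT_BELOW / UNMNT_REPLACE */
};
/* mount(MOUNT_UNION, "/mnt/union", 0, &args, sizeof(args)); */
#endif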
/*
* VFS start. Nothing needed here - the start routine
* on the underlying filesystem(s) will have been called
* when that filesystem was mounted.
*/
/*ARGSUSED*/
int
union_start(struct mount *mp, int flags)
{
return 0;
}
/*
* Free reference to union layer
*/
static bool
union_unmount_selector(void *cl, struct vnode *vp)
{
int *count = cl;
KASSERT(mutex_owned(vp->v_interlock));
*count += 1;
return false;
}
int
union_unmount(struct mount *mp, int mntflags)
{
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
int freeing;
int error;
#ifdef UNION_DIAGNOSTIC
printf("%s(mp = %p)\n", __func__, mp);
#endif
/*
* Keep flushing vnodes from the mount list.
* This is needed because of the un_pvp held
* reference to the parent vnode.
* If more vnodes have been freed on a given pass,
* then try again. The loop will iterate at most
* (d) times, where (d) is the maximum tree depth
* in the filesystem.
*/
for (freeing = 0; (error = vflush(mp, NULL, 0)) != 0;) {
struct vnode_iterator *marker;
int n;
/* count #vnodes held on mount list */
n = 0;
vfs_vnode_iterator_init(mp, &marker);
vfs_vnode_iterator_next(marker, union_unmount_selector, &n);
vfs_vnode_iterator_destroy(marker);
/* if this is unchanged then stop */
if (n == freeing)
break;
/* otherwise try one more time */
freeing = n;
}
/*
* Ok, now that we've tried doing it gently, get out the hammer.
*/
if (mntflags & MNT_FORCE)
error = vflush(mp, NULL, FORCECLOSE);
if (error)
return error;
/*
* Discard references to upper and lower target vnodes.
*/
if (um->um_lowervp)
vrele(um->um_lowervp);
vrele(um->um_uppervp);
kauth_cred_free(um->um_cred);
/*
* Finally, throw away the union_mount structure
*/
kmem_free(um, sizeof(*um));
mp->mnt_data = NULL;
return 0;
}
int
union_root(struct mount *mp, int lktype, struct vnode **vpp)
{
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
int error;
/*
* Return locked reference to root.
*/
vref(um->um_uppervp);
if (um->um_lowervp)
vref(um->um_lowervp);
error = union_allocvp(vpp, mp, NULL, NULL, NULL,
um->um_uppervp, um->um_lowervp, 1);
if (error) {
vrele(um->um_uppervp);
if (um->um_lowervp)
vrele(um->um_lowervp);
return error;
}
vn_lock(*vpp, lktype | LK_RETRY);
return 0;
}
int
union_statvfs(struct mount *mp, struct statvfs *sbp)
{
int error;
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
struct statvfs *sbuf = kmem_zalloc(sizeof(*sbuf), KM_SLEEP);
unsigned long lbsize;
#ifdef UNION_DIAGNOSTIC
printf("%s(mp = %p, lvp = %p, uvp = %p)\n", __func__, mp,
um->um_lowervp, um->um_uppervp);
#endif
if (um->um_lowervp) {
error = VFS_STATVFS(um->um_lowervp->v_mount, sbuf);
if (error)
goto done;
}
/* now copy across the "interesting" information and fake the rest */
lbsize = sbuf->f_bsize;
sbp->f_blocks = sbuf->f_blocks - sbuf->f_bfree;
sbp->f_files = sbuf->f_files - sbuf->f_ffree;
error = VFS_STATVFS(um->um_uppervp->v_mount, sbuf);
if (error)
goto done;
sbp->f_flag = sbuf->f_flag;
sbp->f_bsize = sbuf->f_bsize;
sbp->f_frsize = sbuf->f_frsize;
sbp->f_iosize = sbuf->f_iosize;
/*
* The "total" fields count total resources in all layers,
* the "free" fields count only those resources which are
* free in the upper layer (since only the upper layer
* is writable).
*/
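/*
 * Worked example (illustrative numbers): if the lower layer reports
 * f_bsize 512 with 1000 blocks in use while the upper layer uses
 * f_bsize 4096, the 1000 lower-sized blocks are rescaled below to
 * 1000 * 512 / 4096 = 125 upper-sized blocks before the upper layer's
 * own totals are added in.
 */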
if (sbuf->f_bsize != lbsize)
sbp->f_blocks = sbp->f_blocks * lbsize / sbuf->f_bsize;
sbp->f_blocks += sbuf->f_blocks;
sbp->f_bfree = sbuf->f_bfree;
sbp->f_bavail = sbuf->f_bavail;
sbp->f_bresvd = sbuf->f_bresvd;
sbp->f_files += sbuf->f_files;
sbp->f_ffree = sbuf->f_ffree;
sbp->f_favail = sbuf->f_favail;
sbp->f_fresvd = sbuf->f_fresvd;
copy_statvfs_info(sbp, mp);
done:
kmem_free(sbuf, sizeof(*sbuf));
return error;
}
/*ARGSUSED*/
int
union_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
{
/*
* XXX - Assumes no data cached at union layer.
*/
return 0;
}
/*ARGSUSED*/
int
union_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp)
{
return EOPNOTSUPP;
}
static int
union_renamelock_enter(struct mount *mp)
{
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
/* Lock just the upper fs, where the action happens. */
return VFS_RENAMELOCK_ENTER(um->um_uppervp->v_mount);
}
static void
union_renamelock_exit(struct mount *mp)
{
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
VFS_RENAMELOCK_EXIT(um->um_uppervp->v_mount);
}
extern const struct vnodeopv_desc union_vnodeop_opv_desc;
const struct vnodeopv_desc * const union_vnodeopv_descs[] = {
&union_vnodeop_opv_desc,
NULL,
};
struct vfsops union_vfsops = {
.vfs_name = MOUNT_UNION,
.vfs_min_mount_data = sizeof (struct union_args),
.vfs_mount = union_mount,
.vfs_start = union_start,
.vfs_unmount = union_unmount,
.vfs_root = union_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = union_statvfs,
.vfs_sync = union_sync,
.vfs_vget = union_vget,
.vfs_loadvnode = union_loadvnode,
.vfs_fhtovp = (void *)eopnotsupp,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = union_init,
.vfs_reinit = union_reinit,
.vfs_done = union_done,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = union_renamelock_enter,
.vfs_renamelock_exit = union_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = union_vnodeopv_descs
};
SYSCTL_SETUP(unionfs_sysctl_setup, "unionfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "union",
SYSCTL_DESCR("Union file system"),
NULL, 0, NULL, 0,
CTL_VFS, 15, CTL_EOL);
/*
* XXX the "15" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "15" is the order as taken from sys/mount.h
*/
}
static int
union_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return vfs_attach(&union_vfsops);
case MODULE_CMD_FINI:
return vfs_detach(&union_vfsops);
default:
return ENOTTY;
}
}
/* $NetBSD: scsipi_base.c,v 1.189 2022/04/09 23:38:32 riastradh Exp $ */
/*-
* Copyright (c) 1998, 1999, 2000, 2002, 2003, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum; by Jason R. Thorpe of the Numerical Aerospace
* Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scsipi_base.c,v 1.189 2022/04/09 23:38:32 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_scsi.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/errno.h>
#include <sys/device.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/hash.h>
#include <sys/atomic.h>
#include <dev/scsipi/scsi_sdt.h>
#include <dev/scsipi/scsi_spc.h>
#include <dev/scsipi/scsipi_all.h>
#include <dev/scsipi/scsipi_disk.h>
#include <dev/scsipi/scsipiconf.h>
#include <dev/scsipi/scsipi_base.h>
#include <dev/scsipi/scsi_all.h>
#include <dev/scsipi/scsi_message.h>
#include <machine/param.h>
SDT_PROVIDER_DEFINE(scsi);
SDT_PROBE_DEFINE3(scsi, base, tag, get,
"struct scsipi_xfer *"/*xs*/, "uint8_t"/*tag*/, "uint8_t"/*type*/);
SDT_PROBE_DEFINE3(scsi, base, tag, put,
"struct scsipi_xfer *"/*xs*/, "uint8_t"/*tag*/, "uint8_t"/*type*/);
SDT_PROBE_DEFINE3(scsi, base, adapter, request__start,
"struct scsipi_channel *"/*chan*/,
"scsipi_adapter_req_t"/*req*/,
"void *"/*arg*/);
SDT_PROBE_DEFINE3(scsi, base, adapter, request__done,
"struct scsipi_channel *"/*chan*/,
"scsipi_adapter_req_t"/*req*/,
"void *"/*arg*/);
SDT_PROBE_DEFINE1(scsi, base, queue, batch__start,
"struct scsipi_channel *"/*chan*/);
SDT_PROBE_DEFINE2(scsi, base, queue, run,
"struct scsipi_channel *"/*chan*/,
"struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, queue, batch__done,
"struct scsipi_channel *"/*chan*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, execute, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, enqueue, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, done, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, redone, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, complete, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, restart, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, free, "struct scsipi_xfer *"/*xs*/);
static int scsipi_complete(struct scsipi_xfer *);
static void scsipi_request_sense(struct scsipi_xfer *);
static int scsipi_enqueue(struct scsipi_xfer *);
static void scsipi_run_queue(struct scsipi_channel *chan);
static void scsipi_completion_thread(void *);
static void scsipi_get_tag(struct scsipi_xfer *);
static void scsipi_put_tag(struct scsipi_xfer *);
static int scsipi_get_resource(struct scsipi_channel *);
static void scsipi_put_resource(struct scsipi_channel *);
static void scsipi_async_event_max_openings(struct scsipi_channel *,
struct scsipi_max_openings *);
static void scsipi_async_event_channel_reset(struct scsipi_channel *);
static void scsipi_channel_freeze_locked(struct scsipi_channel *, int);
static void scsipi_adapter_lock(struct scsipi_adapter *adapt);
static void scsipi_adapter_unlock(struct scsipi_adapter *adapt);
static void scsipi_update_timeouts(struct scsipi_xfer *xs);
static struct pool scsipi_xfer_pool;
int scsipi_xs_count = 0;
/*
* scsipi_init:
*
* Called when a scsibus or atapibus is attached to the system
* to initialize shared data structures.
*/
void
scsipi_init(void)
{
static int scsipi_init_done;
if (scsipi_init_done)
return;
scsipi_init_done = 1;
/* Initialize the scsipi_xfer pool. */
pool_init(&scsipi_xfer_pool, sizeof(struct scsipi_xfer), 0,
0, 0, "scxspl", NULL, IPL_BIO);
pool_prime(&scsipi_xfer_pool, 1);
scsipi_ioctl_init();
}
/*
* scsipi_channel_init:
*
* Initialize a scsipi_channel when it is attached.
*/
int
scsipi_channel_init(struct scsipi_channel *chan)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
int i;
/* Initialize shared data. */
scsipi_init();
/* Initialize the queues. */
TAILQ_INIT(&chan->chan_queue);
TAILQ_INIT(&chan->chan_complete);
for (i = 0; i < SCSIPI_CHAN_PERIPH_BUCKETS; i++)
LIST_INIT(&chan->chan_periphtab[i]);
/*
* Create the asynchronous completion thread.
*/
if (kthread_create(PRI_NONE, 0, NULL, scsipi_completion_thread, chan,
&chan->chan_thread, "%s", chan->chan_name)) {
aprint_error_dev(adapt->adapt_dev, "unable to create completion thread for "
"channel %d\n", chan->chan_channel);
panic("scsipi_channel_init");
}
return 0;
}
/*
* scsipi_channel_shutdown:
*
* Shutdown a scsipi_channel.
*/
void
scsipi_channel_shutdown(struct scsipi_channel *chan)
{
mutex_enter(chan_mtx(chan));
/*
* Shut down the completion thread.
*/
chan->chan_tflags |= SCSIPI_CHANT_SHUTDOWN;
cv_broadcast(chan_cv_complete(chan));
/*
* Now wait for the thread to exit.
*/
while (chan->chan_thread != NULL)
cv_wait(chan_cv_thread(chan), chan_mtx(chan));
mutex_exit(chan_mtx(chan));
}
static uint32_t
scsipi_chan_periph_hash(uint64_t t, uint64_t l)
{
uint32_t hash;
hash = hash32_buf(&t, sizeof(t), HASH32_BUF_INIT);
hash = hash32_buf(&l, sizeof(l), hash);
return hash & SCSIPI_CHAN_PERIPH_HASHMASK;
}
/*
* scsipi_insert_periph:
*
* Insert a periph into the channel.
*/
void
scsipi_insert_periph(struct scsipi_channel *chan, struct scsipi_periph *periph)
{
uint32_t hash;
hash = scsipi_chan_periph_hash(periph->periph_target,
periph->periph_lun);
mutex_enter(chan_mtx(chan));
LIST_INSERT_HEAD(&chan->chan_periphtab[hash], periph, periph_hash);
mutex_exit(chan_mtx(chan));
}
/*
* scsipi_remove_periph:
*
* Remove a periph from the channel.
*/
void
scsipi_remove_periph(struct scsipi_channel *chan,
struct scsipi_periph *periph)
{
LIST_REMOVE(periph, periph_hash);
}
/*
* scsipi_lookup_periph:
*
* Lookup a periph on the specified channel.
*/
static struct scsipi_periph *
scsipi_lookup_periph_internal(struct scsipi_channel *chan, int target, int lun, bool lock)
{
struct scsipi_periph *periph;
uint32_t hash;
if (target >= chan->chan_ntargets ||
lun >= chan->chan_nluns)
return NULL;
hash = scsipi_chan_periph_hash(target, lun);
if (lock)
mutex_enter(chan_mtx(chan));
LIST_FOREACH(periph, &chan->chan_periphtab[hash], periph_hash) {
if (periph->periph_target == target &&
periph->periph_lun == lun)
break;
}
if (lock)
mutex_exit(chan_mtx(chan));
return periph;
}
struct scsipi_periph *
scsipi_lookup_periph_locked(struct scsipi_channel *chan, int target, int lun)
{
return scsipi_lookup_periph_internal(chan, target, lun, false);
}
struct scsipi_periph *
scsipi_lookup_periph(struct scsipi_channel *chan, int target, int lun)
{
return scsipi_lookup_periph_internal(chan, target, lun, true);
}
/*
* scsipi_get_resource:
*
* Allocate a single xfer `resource' from the channel.
*
* NOTE: Must be called with channel lock held
*/
static int
scsipi_get_resource(struct scsipi_channel *chan)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
if (chan->chan_flags & SCSIPI_CHAN_OPENINGS) {
if (chan->chan_openings > 0) {
chan->chan_openings--;
return 1;
}
return 0;
}
if (adapt->adapt_openings > 0) {
adapt->adapt_openings--;
return 1;
}
return 0;
}
/*
* scsipi_grow_resources:
*
* Attempt to grow resources for a channel. If this succeeds,
* we allocate one for our caller.
*
* NOTE: Must be called with channel lock held
*/
static inline int
scsipi_grow_resources(struct scsipi_channel *chan)
{
if (chan->chan_flags & SCSIPI_CHAN_CANGROW) {
if ((chan->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) {
mutex_exit(chan_mtx(chan));
scsipi_adapter_request(chan,
ADAPTER_REQ_GROW_RESOURCES, NULL);
mutex_enter(chan_mtx(chan));
return scsipi_get_resource(chan);
}
/*
* Otherwise, ask the channel thread to do it; it will have to
* thaw the queue.
*/
scsipi_channel_freeze_locked(chan, 1);
chan->chan_tflags |= SCSIPI_CHANT_GROWRES;
cv_broadcast(chan_cv_complete(chan));
return 0;
}
return 0;
}
/*
* scsipi_put_resource:
*
* Free a single xfer `resource' to the channel.
*
* NOTE: Must be called with channel lock held
*/
static void
scsipi_put_resource(struct scsipi_channel *chan)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
if (chan->chan_flags & SCSIPI_CHAN_OPENINGS)
chan->chan_openings++;
else
adapt->adapt_openings++;
}
/*
* scsipi_get_tag:
*
* Get a tag ID for the specified xfer.
*
* NOTE: Must be called with channel lock held
*/
static void
scsipi_get_tag(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
int bit, tag;
u_int word;
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
bit = 0; /* XXX gcc */
for (word = 0; word < PERIPH_NTAGWORDS; word++) {
bit = ffs(periph->periph_freetags[word]);
if (bit != 0)
break;
}
#ifdef DIAGNOSTIC
if (word == PERIPH_NTAGWORDS) {
scsipi_printaddr(periph);
printf("no free tags\n");
panic("scsipi_get_tag");
}
#endif
bit -= 1;
periph->periph_freetags[word] &= ~(1U << bit);
tag = (word << 5) | bit;
/* XXX Should eventually disallow this completely. */
if (tag >= periph->periph_openings) {
scsipi_printaddr(periph);
printf("WARNING: tag %d greater than available openings %d\n",
tag, periph->periph_openings);
}
xs->xs_tag_id = tag;
SDT_PROBE3(scsi, base, tag, get,
xs, xs->xs_tag_id, xs->xs_tag_type);
}
/*
* scsipi_put_tag:
*
* Put the tag ID for the specified xfer back into the pool.
*
* NOTE: Must be called with channel lock held
*/
static void
scsipi_put_tag(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
int word, bit;
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
SDT_PROBE3(scsi, base, tag, put,
xs, xs->xs_tag_id, xs->xs_tag_type);
word = xs->xs_tag_id >> 5;
bit = xs->xs_tag_id & 0x1f;
periph->periph_freetags[word] |= (1U << bit);
}
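/*
 * Worked example (illustrative numbers): tag ID 35 decodes to word
 * 35 >> 5 == 1 and bit 35 & 0x1f == 3, i.e. bit 3 of
 * periph_freetags[1]; scsipi_get_tag() performs the inverse encoding,
 * (word << 5) | bit.
 */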
/*
* scsipi_get_xs:
*
* Allocate an xfer descriptor and associate it with the
* specified peripheral. If the peripheral has no more
* available command openings, we either block waiting for
* one to become available, or fail.
*
* When this routine is called with the channel lock held
* the flags must include XS_CTL_NOSLEEP.
*/
struct scsipi_xfer *
scsipi_get_xs(struct scsipi_periph *periph, int flags)
{
struct scsipi_xfer *xs;
bool lock = (flags & XS_CTL_NOSLEEP) == 0;
SC_DEBUG(periph, SCSIPI_DB3, ("scsipi_get_xs\n"));
KASSERT(!cold);
#ifdef DIAGNOSTIC
/*
* URGENT commands can never be ASYNC.
*/
if ((flags & (XS_CTL_URGENT|XS_CTL_ASYNC)) ==
(XS_CTL_URGENT|XS_CTL_ASYNC)) {
scsipi_printaddr(periph);
printf("URGENT and ASYNC\n");
panic("scsipi_get_xs");
}
#endif
/*
* Wait for a command opening to become available. Rules:
*
* - All xfers must wait for an available opening.
* Exception: URGENT xfers can proceed when
* active == openings, because we use the opening
* of the command we're recovering for.
* - If the periph has sense pending, only URGENT & REQSENSE
* xfers may proceed.
*
* - If the periph is recovering, only URGENT xfers may
* proceed.
*
* - If the periph is currently executing a recovery
* command, URGENT commands must block, because only
* one recovery command can execute at a time.
*/
if (lock)
mutex_enter(chan_mtx(periph->periph_channel));
for (;;) {
if (flags & XS_CTL_URGENT) {
if (periph->periph_active > periph->periph_openings)
goto wait_for_opening;
if (periph->periph_flags & PERIPH_SENSE) {
if ((flags & XS_CTL_REQSENSE) == 0)
goto wait_for_opening;
} else {
if ((periph->periph_flags &
PERIPH_RECOVERY_ACTIVE) != 0)
goto wait_for_opening;
periph->periph_flags |= PERIPH_RECOVERY_ACTIVE;
}
break;
}
if (periph->periph_active >= periph->periph_openings ||
(periph->periph_flags & PERIPH_RECOVERING) != 0)
goto wait_for_opening;
periph->periph_active++;
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
break;
wait_for_opening:
if (flags & XS_CTL_NOSLEEP) {
KASSERT(!lock);
return NULL;
}
KASSERT(lock);
SC_DEBUG(periph, SCSIPI_DB3, ("sleeping\n"));
periph->periph_flags |= PERIPH_WAITING;
cv_wait(periph_cv_periph(periph),
chan_mtx(periph->periph_channel));
}
if (lock)
mutex_exit(chan_mtx(periph->periph_channel));
SC_DEBUG(periph, SCSIPI_DB3, ("calling pool_get\n"));
xs = pool_get(&scsipi_xfer_pool,
((flags & XS_CTL_NOSLEEP) != 0 ? PR_NOWAIT : PR_WAITOK));
if (xs == NULL) {
if (lock)
mutex_enter(chan_mtx(periph->periph_channel));
if (flags & XS_CTL_URGENT) {
if ((flags & XS_CTL_REQSENSE) == 0)
periph->periph_flags &= ~PERIPH_RECOVERY_ACTIVE;
} else
periph->periph_active--;
if (lock)
mutex_exit(chan_mtx(periph->periph_channel));
scsipi_printaddr(periph);
printf("unable to allocate %sscsipi_xfer\n",
(flags & XS_CTL_URGENT) ? "URGENT " : "");
}
SC_DEBUG(periph, SCSIPI_DB3, ("returning\n"));
if (xs != NULL) {
memset(xs, 0, sizeof(*xs));
callout_init(&xs->xs_callout, 0);
xs->xs_periph = periph;
xs->xs_control = flags;
xs->xs_status = 0;
if ((flags & XS_CTL_NOSLEEP) == 0)
mutex_enter(chan_mtx(periph->periph_channel));
TAILQ_INSERT_TAIL(&periph->periph_xferq, xs, device_q);
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
if ((flags & XS_CTL_NOSLEEP) == 0)
mutex_exit(chan_mtx(periph->periph_channel));
}
return xs;
}
/*
* scsipi_put_xs:
*
* Release an xfer descriptor, decreasing the outstanding command
* count for the peripheral. If there is a thread waiting for
* an opening, wake it up. If not, kick any queued I/O the
* peripheral may have.
*
* NOTE: Must be called with channel lock held
*/
void
scsipi_put_xs(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
int flags = xs->xs_control;
SDT_PROBE1(scsi, base, xfer, free, xs);
SC_DEBUG(periph, SCSIPI_DB3, ("scsipi_free_xs\n"));
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
TAILQ_REMOVE(&periph->periph_xferq, xs, device_q);
callout_destroy(&xs->xs_callout);
pool_put(&scsipi_xfer_pool, xs);
#ifdef DIAGNOSTIC
if ((periph->periph_flags & PERIPH_RECOVERY_ACTIVE) != 0 &&
periph->periph_active == 0) {
scsipi_printaddr(periph);
printf("recovery without a command to recovery for\n");
panic("scsipi_put_xs");
}
#endif
if (flags & XS_CTL_URGENT) {
if ((flags & XS_CTL_REQSENSE) == 0)
periph->periph_flags &= ~PERIPH_RECOVERY_ACTIVE;
} else
periph->periph_active--;
if (periph->periph_active == 0 &&
(periph->periph_flags & PERIPH_WAITDRAIN) != 0) {
periph->periph_flags &= ~PERIPH_WAITDRAIN;
cv_broadcast(periph_cv_active(periph));
}
if (periph->periph_flags & PERIPH_WAITING) {
periph->periph_flags &= ~PERIPH_WAITING;
cv_broadcast(periph_cv_periph(periph));
} else {
if (periph->periph_switch->psw_start != NULL &&
device_is_active(periph->periph_dev)) {
SC_DEBUG(periph, SCSIPI_DB2,
("calling private start()\n"));
(*periph->periph_switch->psw_start)(periph);
}
}
}
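#if 0
/*
 * Illustrative sketch only (not compiled): how the locking rules of
 * scsipi_get_xs()/scsipi_put_xs() above combine for a caller that already
 * holds the channel lock.  Such a caller must pass XS_CTL_NOSLEEP and be
 * prepared for a NULL return; scsipi_put_xs() likewise expects the channel
 * lock to be held.  The function name is hypothetical.
 */
static void
example_xs_cycle(struct scsipi_periph *periph)
{
	struct scsipi_xfer *xs;

	mutex_enter(chan_mtx(periph->periph_channel));
	xs = scsipi_get_xs(periph, XS_CTL_NOSLEEP);
	if (xs != NULL) {
		/* ... fill in xs->cmd, xs->data, xs->timeout, ... */
		scsipi_put_xs(xs);	/* channel lock is still held here */
	}
	mutex_exit(chan_mtx(periph->periph_channel));
}
#endif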
/*
* scsipi_channel_freeze:
*
* Freeze a channel's xfer queue.
*/
void
scsipi_channel_freeze(struct scsipi_channel *chan, int count)
{
bool lock = chan_running(chan) > 0;
if (lock)
mutex_enter(chan_mtx(chan));
chan->chan_qfreeze += count;
if (lock)
mutex_exit(chan_mtx(chan));
}
static void
scsipi_channel_freeze_locked(struct scsipi_channel *chan, int count)
{
chan->chan_qfreeze += count;
}
/*
* scsipi_channel_thaw:
*
* Thaw a channel's xfer queue.
*/
void
scsipi_channel_thaw(struct scsipi_channel *chan, int count)
{
bool lock = chan_running(chan) > 0;
if (lock)
mutex_enter(chan_mtx(chan));
chan->chan_qfreeze -= count;
/*
* Don't let the freeze count go negative.
*
* Presumably the adapter driver could keep track of this,
* but it might just be easier to do this here so as to allow
* multiple callers, including those outside the adapter driver.
*/
if (chan->chan_qfreeze < 0) {
chan->chan_qfreeze = 0;
}
if (lock)
mutex_exit(chan_mtx(chan));
/*
* Don't kick the queue until the channel is actually running.
*/
if (!lock)
return;
/*
* Kick the channel's queue here. Note, we may be running in
* interrupt context (softclock or HBA's interrupt), so the adapter
* driver had better not sleep.
*/
if (chan->chan_qfreeze == 0)
scsipi_run_queue(chan);
}
/*
* scsipi_channel_timed_thaw:
*
* Thaw a channel after some time has expired. This will also
* run the channel's queue if the freeze count has reached 0.
*/
void
scsipi_channel_timed_thaw(void *arg)
{
struct scsipi_channel *chan = arg;
scsipi_channel_thaw(chan, 1);
}
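#if 0
/*
 * Illustrative sketch only (not compiled): the freeze/timed-thaw pattern
 * the routines above support.  An adapter hitting a transient shortage can
 * freeze its channel and let scsipi_channel_timed_thaw() undo the freeze
 * (and kick the queue) one second later.  "example_callout" and
 * "example_backoff" are hypothetical names.
 */
static struct callout example_callout;

static void
example_backoff(struct scsipi_channel *chan)
{
	scsipi_channel_freeze(chan, 1);
	callout_reset(&example_callout, hz, scsipi_channel_timed_thaw, chan);
}
#endif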
/*
* scsipi_periph_freeze:
*
* Freeze a device's xfer queue.
*/
void
scsipi_periph_freeze_locked(struct scsipi_periph *periph, int count)
{
periph->periph_qfreeze += count;
}
/*
* scsipi_periph_thaw:
*
* Thaw a device's xfer queue.
*/
void
scsipi_periph_thaw_locked(struct scsipi_periph *periph, int count)
{
periph->periph_qfreeze -= count;
#ifdef DIAGNOSTIC
if (periph->periph_qfreeze < 0) {
static const char pc[] = "periph freeze count < 0";
scsipi_printaddr(periph);
printf("%s\n", pc);
panic(pc);
}
#endif
if (periph->periph_qfreeze == 0 &&
(periph->periph_flags & PERIPH_WAITING) != 0)
cv_broadcast(periph_cv_periph(periph));
}
void
scsipi_periph_freeze(struct scsipi_periph *periph, int count)
{
mutex_enter(chan_mtx(periph->periph_channel));
scsipi_periph_freeze_locked(periph, count);
mutex_exit(chan_mtx(periph->periph_channel));
}
void
scsipi_periph_thaw(struct scsipi_periph *periph, int count)
{
mutex_enter(chan_mtx(periph->periph_channel));
scsipi_periph_thaw_locked(periph, count);
mutex_exit(chan_mtx(periph->periph_channel));
}
/*
* scsipi_periph_timed_thaw:
*
* Thaw a device after some time has expired.
*/
void
scsipi_periph_timed_thaw(void *arg)
{
struct scsipi_periph *periph = arg;
struct scsipi_channel *chan = periph->periph_channel;
callout_stop(&periph->periph_callout);
mutex_enter(chan_mtx(chan));
scsipi_periph_thaw_locked(periph, 1);
if ((periph->periph_channel->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) {
/*
* Kick the channel's queue here. Note, we're running in
* interrupt context (softclock), so the adapter driver
* had better not sleep.
*/
mutex_exit(chan_mtx(chan));
scsipi_run_queue(periph->periph_channel);
} else {
/*
* Tell the completion thread to kick the channel's queue here.
*/
periph->periph_channel->chan_tflags |= SCSIPI_CHANT_KICK;
cv_broadcast(chan_cv_complete(chan));
mutex_exit(chan_mtx(chan));
}
}
/*
* scsipi_wait_drain:
*
* Wait for a periph's pending xfers to drain.
*/
void
scsipi_wait_drain(struct scsipi_periph *periph)
{
struct scsipi_channel *chan = periph->periph_channel;
mutex_enter(chan_mtx(chan));
while (periph->periph_active != 0) {
periph->periph_flags |= PERIPH_WAITDRAIN;
cv_wait(periph_cv_active(periph), chan_mtx(chan));
}
mutex_exit(chan_mtx(chan));
}
/*
* scsipi_kill_pending:
*
* Kill off all pending xfers for a periph.
*
* NOTE: Must be called with channel lock held
*/
void
scsipi_kill_pending(struct scsipi_periph *periph)
{
struct scsipi_channel *chan = periph->periph_channel;
(*chan->chan_bustype->bustype_kill_pending)(periph);
while (periph->periph_active != 0) {
periph->periph_flags |= PERIPH_WAITDRAIN;
cv_wait(periph_cv_active(periph), chan_mtx(chan));
}
}
/*
* scsipi_print_cdb:
* prints a command descriptor block (for debugging purposes, error
* messages, SCSIVERBOSE, ...)
*/
void
scsipi_print_cdb(struct scsipi_generic *cmd)
{
int i, j;
printf("0x%02x", cmd->opcode);
switch (CDB_GROUPID(cmd->opcode)) {
case CDB_GROUPID_0:
j = CDB_GROUP0;
break;
case CDB_GROUPID_1:
j = CDB_GROUP1;
break;
case CDB_GROUPID_2:
j = CDB_GROUP2;
break;
case CDB_GROUPID_3:
j = CDB_GROUP3;
break;
case CDB_GROUPID_4:
j = CDB_GROUP4;
break;
case CDB_GROUPID_5:
j = CDB_GROUP5;
break;
case CDB_GROUPID_6:
j = CDB_GROUP6;
break;
case CDB_GROUPID_7:
j = CDB_GROUP7;
break;
default:
j = 0;
}
if (j == 0)
j = sizeof (cmd->bytes);
for (i = 0; i < j-1; i++) /* already done the opcode */
printf(" %02x", cmd->bytes[i]);
}
/*
* scsipi_interpret_sense:
*
* Look at the returned sense and act on the error, determining
* the unix error number to pass back. (0 = report no error)
*
* NOTE: If we return ERESTART, we are expected to have
* thawed the device!
*
* THIS IS THE DEFAULT ERROR HANDLER FOR SCSI DEVICES.
*/
int
scsipi_interpret_sense(struct scsipi_xfer *xs)
{
struct scsi_sense_data *sense;
struct scsipi_periph *periph = xs->xs_periph;
u_int8_t key;
int error;
u_int32_t info;
static const char *error_mes[] = {
"soft error (corrected)",
"not ready", "medium error",
"non-media hardware failure", "illegal request",
"unit attention", "readonly device",
"no data found", "vendor unique",
"copy aborted", "command aborted",
"search returned equal", "volume overflow",
"verify miscompare", "unknown error key"
};
sense = &xs->sense.scsi_sense;
#ifdef SCSIPI_DEBUG
if (periph->periph_dbflags & SCSIPI_DB1) {
int count, len;
scsipi_printaddr(periph);
printf(" sense debug information:\n");
printf("\tcode 0x%x valid %d\n",
SSD_RCODE(sense->response_code),
sense->response_code & SSD_RCODE_VALID ? 1 : 0);
printf("\tseg 0x%x key 0x%x ili 0x%x eom 0x%x fmark 0x%x\n",
sense->segment,
SSD_SENSE_KEY(sense->flags),
sense->flags & SSD_ILI ? 1 : 0,
sense->flags & SSD_EOM ? 1 : 0,
sense->flags & SSD_FILEMARK ? 1 : 0);
printf("\ninfo: 0x%x 0x%x 0x%x 0x%x followed by %d "
"extra bytes\n",
sense->info[0],
sense->info[1],
sense->info[2],
sense->info[3],
sense->extra_len);
len = SSD_ADD_BYTES_LIM(sense);
printf("\textra (up to %d bytes): ", len);
for (count = 0; count < len; count++)
printf("0x%x ", sense->csi[count]);
printf("\n");
}
#endif
/*
* If the periph has its own error handler, call it first.
* If it returns a legit error value, return that, otherwise
* it wants us to continue with normal error processing.
*/
if (periph->periph_switch->psw_error != NULL) {
SC_DEBUG(periph, SCSIPI_DB2,
("calling private err_handler()\n"));
error = (*periph->periph_switch->psw_error)(xs);
if (error != EJUSTRETURN)
return error;
}
/* otherwise use the default */
switch (SSD_RCODE(sense->response_code)) {
/*
* Old SCSI-1 and SASI devices respond with
* codes other than 70.
*/
case 0x00: /* no error (command completed OK) */
return 0;
case 0x04: /* drive not ready after it was selected */
if ((periph->periph_flags & PERIPH_REMOVABLE) != 0)
periph->periph_flags &= ~PERIPH_MEDIA_LOADED;
if ((xs->xs_control & XS_CTL_IGNORE_NOT_READY) != 0)
return 0;
/* XXX - display some sort of error here? */
return EIO;
case 0x20: /* invalid command */
if ((xs->xs_control &
XS_CTL_IGNORE_ILLEGAL_REQUEST) != 0)
return 0;
return EINVAL;
case 0x25: /* invalid LUN (Adaptec ACB-4000) */
return EACCES;
/*
* If it's code 70, use the extended stuff and
* interpret the key
*/
case 0x71: /* delayed error */
scsipi_printaddr(periph);
key = SSD_SENSE_KEY(sense->flags);
printf(" DEFERRED ERROR, key = 0x%x\n", key);
/* FALLTHROUGH */
case 0x70:
if ((sense->response_code & SSD_RCODE_VALID) != 0)
info = _4btol(sense->info);
else
info = 0;
key = SSD_SENSE_KEY(sense->flags);
switch (key) {
case SKEY_NO_SENSE:
case SKEY_RECOVERED_ERROR:
if (xs->resid == xs->datalen && xs->datalen) {
/*
* Why is this here?
*/
xs->resid = 0; /* not short read */
}
error = 0;
break;
case SKEY_EQUAL:
error = 0;
break;
case SKEY_NOT_READY:
if ((periph->periph_flags & PERIPH_REMOVABLE) != 0)
periph->periph_flags &= ~PERIPH_MEDIA_LOADED;
if ((xs->xs_control & XS_CTL_IGNORE_NOT_READY) != 0)
return 0;
if (sense->asc == 0x3A) {
error = ENODEV; /* Medium not present */
if (xs->xs_control & XS_CTL_SILENT_NODEV)
return error;
} else
error = EIO;
if ((xs->xs_control & XS_CTL_SILENT) != 0)
return error;
break;
case SKEY_ILLEGAL_REQUEST:
if ((xs->xs_control &
XS_CTL_IGNORE_ILLEGAL_REQUEST) != 0)
return 0;
/*
* Handle the case where a device reports
* Logical Unit Not Supported during discovery.
*/
if ((xs->xs_control & XS_CTL_DISCOVERY) != 0 &&
sense->asc == 0x25 &&
sense->ascq == 0x00)
return EINVAL;
if ((xs->xs_control & XS_CTL_SILENT) != 0)
return EIO;
error = EINVAL;
break;
case SKEY_UNIT_ATTENTION:
if (sense->asc == 0x29 &&
sense->ascq == 0x00) {
/* device or bus reset */
return ERESTART;
}
if ((periph->periph_flags & PERIPH_REMOVABLE) != 0)
periph->periph_flags &= ~PERIPH_MEDIA_LOADED;
if ((xs->xs_control &
XS_CTL_IGNORE_MEDIA_CHANGE) != 0 ||
/* XXX Should reupload any transient state. */
(periph->periph_flags &
PERIPH_REMOVABLE) == 0) {
return ERESTART;
}
if ((xs->xs_control & XS_CTL_SILENT) != 0)
return EIO;
error = EIO;
break;
case SKEY_DATA_PROTECT:
error = EROFS;
break;
case SKEY_BLANK_CHECK:
error = 0;
break;
case SKEY_ABORTED_COMMAND:
if (xs->xs_retries != 0) {
xs->xs_retries--;
error = ERESTART;
} else
error = EIO;
break;
case SKEY_VOLUME_OVERFLOW:
error = ENOSPC;
break;
default:
error = EIO;
break;
}
/* Print verbose decode if appropriate and possible */
if ((key == 0) ||
((xs->xs_control & XS_CTL_SILENT) != 0) ||
(scsipi_print_sense(xs, 0) != 0))
return error;
/* Print brief(er) sense information */
scsipi_printaddr(periph);
printf("%s", error_mes[key - 1]);
if ((sense->response_code & SSD_RCODE_VALID) != 0) {
switch (key) {
case SKEY_NOT_READY:
case SKEY_ILLEGAL_REQUEST:
case SKEY_UNIT_ATTENTION:
case SKEY_DATA_PROTECT:
break;
case SKEY_BLANK_CHECK:
printf(", requested size: %d (decimal)",
info);
break;
case SKEY_ABORTED_COMMAND:
if (xs->xs_retries)
printf(", retrying");
printf(", cmd 0x%x, info 0x%x",
xs->cmd->opcode, info);
break;
default:
printf(", info = %d (decimal)", info);
}
}
if (sense->extra_len != 0) {
int n;
printf(", data =");
for (n = 0; n < sense->extra_len; n++)
printf(" %02x",
sense->csi[n]);
}
printf("\n");
return error;
/*
* Some other code, just report it
*/
default:
#if defined(SCSIDEBUG) || defined(DEBUG)
{
static const char *uc = "undecodable sense error";
int i;
u_int8_t *cptr = (u_int8_t *) sense;
scsipi_printaddr(periph);
if (xs->cmd == &xs->cmdstore) {
printf("%s for opcode 0x%x, data=",
uc, xs->cmdstore.opcode);
} else {
printf("%s, data=", uc);
}
for (i = 0; i < sizeof(*sense); i++)
printf(" 0x%02x", *(cptr++) & 0xff);
printf("\n");
}
#else
scsipi_printaddr(periph);
printf("Sense Error Code 0x%x",
SSD_RCODE(sense->response_code));
if ((sense->response_code & SSD_RCODE_VALID) != 0) {
struct scsi_sense_data_unextended *usense =
(struct scsi_sense_data_unextended *)sense;
printf(" at block no. %d (decimal)",
_3btol(usense->block));
}
printf("\n");
#endif
return EIO;
}
}
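#if 0
/*
 * Illustrative sketch only (not compiled): a peripheral driver's psw_error
 * hook, which scsipi_interpret_sense() above calls first.  Returning a real
 * errno (or 0) ends error processing; returning EJUSTRETURN falls through
 * to the default interpretation.  The handler name is hypothetical.
 */
static int
example_psw_error(struct scsipi_xfer *xs)
{
	struct scsi_sense_data *sense = &xs->sense.scsi_sense;

	if (SSD_SENSE_KEY(sense->flags) == SKEY_RECOVERED_ERROR)
		return 0;		/* handled: treat as success */
	return EJUSTRETURN;		/* defer to the default handler */
}
#endif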
/*
* scsipi_test_unit_ready:
*
* Issue a `test unit ready' request.
*/
int
scsipi_test_unit_ready(struct scsipi_periph *periph, int flags)
{
struct scsi_test_unit_ready cmd;
int retries;
/* some ATAPI drives don't support TEST UNIT READY. Sigh */
if (periph->periph_quirks & PQUIRK_NOTUR)
return 0;
if (flags & XS_CTL_DISCOVERY)
retries = 0;
else
retries = SCSIPIRETRIES;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_TEST_UNIT_READY;
return scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0,
retries, 10000, NULL, flags);
}
static const struct scsipi_inquiry3_pattern {
const char vendor[8];
const char product[16];
const char revision[4];
} scsipi_inquiry3_quirk[] = {
{ "ES-6600 ", "", "" },
};
static int
scsipi_inquiry3_ok(const struct scsipi_inquiry_data *ib)
{
for (size_t i = 0; i < __arraycount(scsipi_inquiry3_quirk); i++) {
const struct scsipi_inquiry3_pattern *q =
&scsipi_inquiry3_quirk[i];
#define MATCH(field) \
(q->field[0] ? memcmp(ib->field, q->field, sizeof(ib->field)) == 0 : 1)
if (MATCH(vendor) && MATCH(product) && MATCH(revision))
return 0;
}
return 1;
}
/*
* scsipi_inquire:
*
* Ask the device about itself.
*/
int
scsipi_inquire(struct scsipi_periph *periph, struct scsipi_inquiry_data *inqbuf,
int flags)
{
struct scsipi_inquiry cmd;
int error;
int retries;
if (flags & XS_CTL_DISCOVERY)
retries = 0;
else
retries = SCSIPIRETRIES;
/*
* If we request more data than the device can provide, it SHOULD just
* return a short response. However, some devices error with an
* ILLEGAL REQUEST sense code, and yet others have even more special
* failure modes (such as the GL641USB flash adapter, which goes loony
* and sends corrupted CRCs). To work around this, and to bring our
* behavior more in line with other OSes, we do a shorter inquiry,
* covering all the SCSI-2 information, first, and then request more
* data iff the "additional length" field indicates there is more.
* - mycroft, 2003/10/16
*/
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = INQUIRY;
cmd.length = SCSIPI_INQUIRY_LENGTH_SCSI2;
error = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)inqbuf, SCSIPI_INQUIRY_LENGTH_SCSI2, retries,
10000, NULL, flags | XS_CTL_DATA_IN);
if (!error &&
inqbuf->additional_length > SCSIPI_INQUIRY_LENGTH_SCSI2 - 4) {
if (scsipi_inquiry3_ok(inqbuf)) {
#if 0
printf("inquire: addlen=%d, retrying\n", inqbuf->additional_length);
#endif
cmd.length = SCSIPI_INQUIRY_LENGTH_SCSI3;
error = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)inqbuf, SCSIPI_INQUIRY_LENGTH_SCSI3, retries,
10000, NULL, flags | XS_CTL_DATA_IN);
#if 0
printf("inquire: error=%d\n", error);
#endif
}
}
#ifdef SCSI_OLD_NOINQUIRY
/*
* Kludge for the Adaptec ACB-4000 SCSI->MFM translator.
* This board doesn't support the INQUIRY command at all.
*/
if (error == EINVAL || error == EACCES) {
/*
* Conjure up an INQUIRY response.
*/
inqbuf->device = (error == EINVAL ?
SID_QUAL_LU_PRESENT :
SID_QUAL_LU_NOTPRESENT) | T_DIRECT;
inqbuf->dev_qual2 = 0;
inqbuf->version = 0;
inqbuf->response_format = SID_FORMAT_SCSI1;
inqbuf->additional_length = SCSIPI_INQUIRY_LENGTH_SCSI2 - 4;
inqbuf->flags1 = inqbuf->flags2 = inqbuf->flags3 = 0;
memcpy(inqbuf->vendor, "ADAPTEC ACB-4000 ", 28);
error = 0;
}
/*
* Kludge for the Emulex MT-02 SCSI->QIC translator.
* This board gives an empty response to an INQUIRY command.
*/
else if (error == 0 &&
inqbuf->device == (SID_QUAL_LU_PRESENT | T_DIRECT) &&
inqbuf->dev_qual2 == 0 &&
inqbuf->version == 0 &&
inqbuf->response_format == SID_FORMAT_SCSI1) {
/*
* Fill out the INQUIRY response.
*/
inqbuf->device = (SID_QUAL_LU_PRESENT | T_SEQUENTIAL);
inqbuf->dev_qual2 = SID_REMOVABLE;
inqbuf->additional_length = SCSIPI_INQUIRY_LENGTH_SCSI2 - 4;
inqbuf->flags1 = inqbuf->flags2 = inqbuf->flags3 = 0;
memcpy(inqbuf->vendor, "EMULEX MT-02 QIC ", 28);
}
#endif /* SCSI_OLD_NOINQUIRY */
return error;
}
/*
* scsipi_prevent:
*
* Prevent or allow the user to remove the media
*/
int
scsipi_prevent(struct scsipi_periph *periph, int type, int flags)
{
struct scsi_prevent_allow_medium_removal cmd;
if (periph->periph_quirks & PQUIRK_NODOORLOCK)
return 0;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_PREVENT_ALLOW_MEDIUM_REMOVAL;
cmd.how = type;
return (scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0,
SCSIPIRETRIES, 5000, NULL, flags));
}
/*
* scsipi_start:
*
* Send a START UNIT.
*/
int
scsipi_start(struct scsipi_periph *periph, int type, int flags)
{
struct scsipi_start_stop cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = START_STOP;
cmd.byte2 = 0x00;
cmd.how = type;
return scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0,
SCSIPIRETRIES, (type & SSS_START) ? 60000 : 10000, NULL, flags);
}
/*
* scsipi_mode_sense, scsipi_mode_sense_big:
* get a sense page from a device
*/
int
scsipi_mode_sense(struct scsipi_periph *periph, int byte2, int page,
struct scsi_mode_parameter_header_6 *data, int len, int flags, int retries,
int timeout)
{
struct scsi_mode_sense_6 cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MODE_SENSE_6;
cmd.byte2 = byte2;
cmd.page = page;
cmd.length = len & 0xff;
return scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_IN);
}
int
scsipi_mode_sense_big(struct scsipi_periph *periph, int byte2, int page,
struct scsi_mode_parameter_header_10 *data, int len, int flags, int retries,
int timeout)
{
struct scsi_mode_sense_10 cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MODE_SENSE_10;
cmd.byte2 = byte2;
cmd.page = page;
_lto2b(len, cmd.length);
return scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_IN);
}
int
scsipi_mode_select(struct scsipi_periph *periph, int byte2,
struct scsi_mode_parameter_header_6 *data, int len, int flags, int retries,
int timeout)
{
struct scsi_mode_select_6 cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MODE_SELECT_6;
cmd.byte2 = byte2;
cmd.length = len & 0xff;
return scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_OUT);
}
int
scsipi_mode_select_big(struct scsipi_periph *periph, int byte2,
struct scsi_mode_parameter_header_10 *data, int len, int flags, int retries,
int timeout)
{
struct scsi_mode_select_10 cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MODE_SELECT_10;
cmd.byte2 = byte2;
_lto2b(len, cmd.length);
return scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_OUT);
}
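#if 0
/*
 * Illustrative sketch only (not compiled): typical use of the MODE SENSE
 * helper above.  Callers declare a structure holding the 6-byte header
 * followed by room for the page itself; the page code (0x08, caching) and
 * function name here are just examples.
 */
static int
example_fetch_caching_page(struct scsipi_periph *periph)
{
	struct {
		struct scsi_mode_parameter_header_6 header;
		u_int8_t page[32];
	} buf;

	return scsipi_mode_sense(periph, 0, 0x08, &buf.header, sizeof(buf),
	    0, SCSIPIRETRIES, 10000);
}
#endif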
/*
* scsipi_get_opcodeinfo:
*
* Query the device for supported commands and their timeouts,
* building a timeout lookup table if timeout information is available.
*/
void
scsipi_get_opcodeinfo(struct scsipi_periph *periph)
{
u_int8_t *data;
int len = 16*1024;
int rc;
struct scsi_repsuppopcode cmd;
/* refrain from asking for supported opcodes */
if (periph->periph_quirks & PQUIRK_NOREPSUPPOPC ||
periph->periph_type == T_PROCESSOR || /* spec. */
periph->periph_type == T_CDROM) /* spec. */
return;
scsipi_free_opcodeinfo(periph);
/*
* query REPORT SUPPORTED OPERATION CODES
* if OK
* enumerate all codes
* if timeout exists insert maximum into opcode table
*/
data = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MAINTENANCE_IN;
cmd.svcaction = RSOC_REPORT_SUPPORTED_OPCODES;
cmd.repoption = RSOC_RCTD|RSOC_ALL;
_lto4b(len, cmd.alloclen);
rc = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, 0, 1000, NULL,
XS_CTL_DATA_IN|XS_CTL_SILENT);
if (rc == 0) {
int count;
int dlen = _4btol(data);
u_int8_t *c = data + 4;
SC_DEBUG(periph, SCSIPI_DB3,
("supported opcode timeout-values loaded\n"));
SC_DEBUG(periph, SCSIPI_DB3,
("CMD LEN SA spec nom. time cmd timeout\n"));
struct scsipi_opcodes *tot = malloc(sizeof(struct scsipi_opcodes),
M_DEVBUF, M_WAITOK|M_ZERO);
count = 0;
while (tot != NULL &&
dlen >= (int)sizeof(struct scsi_repsupopcode_all_commands_descriptor)) {
struct scsi_repsupopcode_all_commands_descriptor *acd
= (struct scsi_repsupopcode_all_commands_descriptor *)c;
#ifdef SCSIPI_DEBUG
int cdblen = _2btol((const u_int8_t *)&acd->cdblen);
#endif
dlen -= sizeof(struct scsi_repsupopcode_all_commands_descriptor);
c += sizeof(struct scsi_repsupopcode_all_commands_descriptor);
SC_DEBUG(periph, SCSIPI_DB3,
("0x%02x(%2d) ", acd->opcode, cdblen));
tot->opcode_info[acd->opcode].ti_flags = SCSIPI_TI_VALID;
if (acd->flags & RSOC_ACD_SERVACTV) {
SC_DEBUGN(periph, SCSIPI_DB3,
("0x%02x%02x ",
acd->serviceaction[0],
acd->serviceaction[1]));
} else {
SC_DEBUGN(periph, SCSIPI_DB3, (" "));
}
if (acd->flags & RSOC_ACD_CTDP
&& dlen >= (int)sizeof(struct scsi_repsupopcode_timeouts_descriptor)) {
struct scsi_repsupopcode_timeouts_descriptor *td
= (struct scsi_repsupopcode_timeouts_descriptor *)c;
long nomto = _4btol(td->nom_process_timeout);
long cmdto = _4btol(td->cmd_process_timeout);
long t = (cmdto > nomto) ? cmdto : nomto;
dlen -= sizeof(struct scsi_repsupopcode_timeouts_descriptor);
c += sizeof(struct scsi_repsupopcode_timeouts_descriptor);
SC_DEBUGN(periph, SCSIPI_DB3,
("0x%02x %10ld %10ld",
td->cmd_specific,
nomto, cmdto));
if (t > tot->opcode_info[acd->opcode].ti_timeout) {
tot->opcode_info[acd->opcode].ti_timeout = t;
++count;
}
}
SC_DEBUGN(periph, SCSIPI_DB3,("\n"));
}
if (count > 0) {
periph->periph_opcs = tot;
} else {
free(tot, M_DEVBUF);
SC_DEBUG(periph, SCSIPI_DB3,
("no usable timeout values available\n"));
}
} else {
SC_DEBUG(periph, SCSIPI_DB3,
("SCSI_MAINTENANCE_IN"
"[RSOC_REPORT_SUPPORTED_OPCODES] failed error=%d"
" - no device provided timeout "
"values available\n", rc));
}
free(data, M_DEVBUF);
}
/*
* scsipi_update_timeouts:
* Override timeout value if device/config provided
* timeouts are available.
*/
static void
scsipi_update_timeouts(struct scsipi_xfer *xs)
{
struct scsipi_opcodes *opcs;
u_int8_t cmd;
int timeout;
struct scsipi_opinfo *oi;
if (xs->timeout <= 0) {
return;
}
opcs = xs->xs_periph->periph_opcs;
if (opcs == NULL) {
return;
}
cmd = xs->cmd->opcode;
oi = &opcs->opcode_info[cmd];
timeout = 1000 * (int)oi->ti_timeout;
if (timeout > xs->timeout && timeout < 86400000) {
/*
* pick up device configured timeouts if they
* are longer than the requested ones but less
* than a day
*/
#ifdef SCSIPI_DEBUG
if ((oi->ti_flags & SCSIPI_TI_LOGGED) == 0) {
SC_DEBUG(xs->xs_periph, SCSIPI_DB3,
("Overriding command 0x%02x "
"timeout of %d with %d ms\n",
cmd, xs->timeout, timeout));
oi->ti_flags |= SCSIPI_TI_LOGGED;
}
#endif
xs->timeout = timeout;
}
}
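/*
 * Worked example of the override rule above (illustrative only): if the
 * device-reported command timeout for some opcode is 45 (seconds, per SPC),
 * ti_timeout is 45 and the computed value is 45000 ms.  A request issued
 * with a 30000 ms timeout is then raised to 45000 ms, a request issued with
 * 60000 ms keeps its own larger value, and anything of a day (86400000 ms)
 * or more is ignored.
 */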
/*
* scsipi_free_opcodeinfo:
*
* free the opcode information table
*/
void
scsipi_free_opcodeinfo(struct scsipi_periph *periph)
{
if (periph->periph_opcs != NULL) {
free(periph->periph_opcs, M_DEVBUF);
}
periph->periph_opcs = NULL;
}
/*
* scsipi_done:
*
* This routine is called by an adapter's interrupt handler when
* an xfer is completed.
*/
void
scsipi_done(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
struct scsipi_channel *chan = periph->periph_channel;
int freezecnt;
SC_DEBUG(periph, SCSIPI_DB2, ("scsipi_done\n"));
#ifdef SCSIPI_DEBUG
if (periph->periph_dbflags & SCSIPI_DB1)
show_scsipi_cmd(xs);
#endif
mutex_enter(chan_mtx(chan));
SDT_PROBE1(scsi, base, xfer, done, xs);
/*
* The resource this command was using is now free.
*/
if (xs->xs_status & XS_STS_DONE) {
/* XXX in certain circumstances, such as a device
* being detached, an xs that has already been
* scsipi_done()'d by the main thread will be done'd
* again by scsibusdetach(). Putting the xs on the
* chan_complete queue causes list corruption and
* everyone dies. This prevents that, but perhaps
* there should be better coordination somewhere such
* that this won't ever happen (and can be turned into
* a KASSERT()).
*/
SDT_PROBE1(scsi, base, xfer, redone, xs);
mutex_exit(chan_mtx(chan));
goto out;
}
scsipi_put_resource(chan);
xs->xs_periph->periph_sent--;
/*
* If the command was tagged, free the tag.
*/
if (XS_CTL_TAGTYPE(xs) != 0)
scsipi_put_tag(xs);
else
periph->periph_flags &= ~PERIPH_UNTAG;
/* Mark the command as `done'. */
xs->xs_status |= XS_STS_DONE;
#ifdef DIAGNOSTIC
if ((xs->xs_control & (XS_CTL_ASYNC|XS_CTL_POLL)) ==
(XS_CTL_ASYNC|XS_CTL_POLL))
panic("scsipi_done: ASYNC and POLL");
#endif
/*
* If the xfer had an error of any sort, freeze the
* periph's queue. Freeze it again if we were requested
* to do so in the xfer.
*/
freezecnt = 0;
if (xs->error != XS_NOERROR)
freezecnt++;
if (xs->xs_control & XS_CTL_FREEZE_PERIPH)
freezecnt++;
if (freezecnt != 0)
scsipi_periph_freeze_locked(periph, freezecnt);
/*
* record the xfer with a pending sense, in case a SCSI reset is
* received before the thread is woken up.
*/
if (xs->error == XS_BUSY && xs->status == SCSI_CHECK) {
periph->periph_flags |= PERIPH_SENSE;
periph->periph_xscheck = xs;
}
/*
* If this was an xfer that was not to complete asynchronously,
* let the requesting thread perform error checking/handling
* in its context.
*/
if ((xs->xs_control & XS_CTL_ASYNC) == 0) {
/*
* If it's a polling job, just return, to unwind the
* call graph. We don't need to restart the queue,
* because polling jobs are treated specially, and
* are really only used during crash dumps anyway
* (XXX or during boot-time autoconfiguration of
* ATAPI devices).
*/
if (xs->xs_control & XS_CTL_POLL) {
mutex_exit(chan_mtx(chan));
return;
}
cv_broadcast(xs_cv(xs));
mutex_exit(chan_mtx(chan));
goto out;
}
/*
* Catch the extremely common case of I/O completing
* without error; no use in taking a context switch
* if we can handle it in interrupt context.
*/
if (xs->error == XS_NOERROR) {
mutex_exit(chan_mtx(chan));
(void) scsipi_complete(xs);
goto out;
}
/*
* There is an error on this xfer. Put it on the channel's
* completion queue, and wake up the completion thread.
*/
TAILQ_INSERT_TAIL(&chan->chan_complete, xs, channel_q);
cv_broadcast(chan_cv_complete(chan));
mutex_exit(chan_mtx(chan));
out:
/*
* If there are more xfers on the channel's queue, attempt to
* run them.
*/
scsipi_run_queue(chan);
}
/*
* scsipi_complete:
*
* Completion of a scsipi_xfer. This is the guts of scsipi_done().
*
* NOTE: This routine MUST be called with valid thread context
* except for the case where the following two conditions are
* true:
*
* xs->error == XS_NOERROR
* XS_CTL_ASYNC is set in xs->xs_control
*
* The semantics of this routine can be tricky, so here is an
* explanation:
*
* 0 Xfer completed successfully.
*
* ERESTART Xfer had an error, but was restarted.
*
* anything else Xfer had an error, return value is Unix
* errno.
*
* If the return value is anything but ERESTART:
*
* - If XS_CTL_ASYNC is set, `xs' has been freed back to
* the pool.
* - If there is a buf associated with the xfer,
* it has been biodone()'d.
*/
static int
scsipi_complete(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
struct scsipi_channel *chan = periph->periph_channel;
int error;
SDT_PROBE1(scsi, base, xfer, complete, xs);
#ifdef DIAGNOSTIC
if ((xs->xs_control & XS_CTL_ASYNC) != 0 && xs->bp == NULL)
panic("scsipi_complete: XS_CTL_ASYNC but no buf");
#endif
/*
* If command terminated with a CHECK CONDITION, we need to issue a
* REQUEST_SENSE command. Once the REQUEST_SENSE has been processed
* we'll have the real status.
* Must be processed with channel lock held to avoid missing
* a SCSI bus reset for this command.
*/
mutex_enter(chan_mtx(chan));
if (xs->error == XS_BUSY && xs->status == SCSI_CHECK) {
/* request sense for a request sense ? */
if (xs->xs_control & XS_CTL_REQSENSE) {
scsipi_printaddr(periph);
printf("request sense for a request sense ?\n");
/* XXX maybe we should reset the device ? */
/* we've been frozen because xs->error != XS_NOERROR */
scsipi_periph_thaw_locked(periph, 1);
mutex_exit(chan_mtx(chan));
if (xs->resid < xs->datalen) {
printf("we read %d bytes of sense anyway:\n",
xs->datalen - xs->resid);
scsipi_print_sense_data((void *)xs->data, 0);
}
return EINVAL;
}
mutex_exit(chan_mtx(chan)); // XXX allows other commands to queue or run
scsipi_request_sense(xs);
} else
mutex_exit(chan_mtx(chan));
/*
* If it's a user level request, bypass all usual completion
* processing; let the user work it out.
*/
if ((xs->xs_control & XS_CTL_USERCMD) != 0) {
SC_DEBUG(periph, SCSIPI_DB3, ("calling user done()\n"));
mutex_enter(chan_mtx(chan));
if (xs->error != XS_NOERROR)
scsipi_periph_thaw_locked(periph, 1);
mutex_exit(chan_mtx(chan));
scsipi_user_done(xs);
SC_DEBUG(periph, SCSIPI_DB3, ("returned from user done()\n "));
return 0;
}
switch (xs->error) {
case XS_NOERROR:
error = 0;
break;
case XS_SENSE:
case XS_SHORTSENSE:
error = (*chan->chan_bustype->bustype_interpret_sense)(xs);
break;
case XS_RESOURCE_SHORTAGE:
/*
* XXX Should freeze channel's queue.
*/
scsipi_printaddr(periph);
printf("adapter resource shortage\n");
/* FALLTHROUGH */
case XS_BUSY:
if (xs->error == XS_BUSY && xs->status == SCSI_QUEUE_FULL) {
struct scsipi_max_openings mo;
/*
* We set the openings to active - 1, assuming that
* the command that got us here is the first one that
* can't fit into the device's queue. If that's not
* the case, I guess we'll find out soon enough.
*/
mo.mo_target = periph->periph_target;
mo.mo_lun = periph->periph_lun;
if (periph->periph_active < periph->periph_openings)
mo.mo_openings = periph->periph_active - 1;
else
mo.mo_openings = periph->periph_openings - 1;
#ifdef DIAGNOSTIC
if (mo.mo_openings < 0) {
scsipi_printaddr(periph);
printf("QUEUE FULL resulted in < 0 openings\n");
panic("scsipi_done");
}
#endif
if (mo.mo_openings == 0) {
scsipi_printaddr(periph);
printf("QUEUE FULL resulted in 0 openings\n");
mo.mo_openings = 1;
}
scsipi_async_event(chan, ASYNC_EVENT_MAX_OPENINGS, &mo);
error = ERESTART;
} else if (xs->xs_retries != 0) {
xs->xs_retries--;
/*
* Wait one second, and try again.
*/
mutex_enter(chan_mtx(chan));
if ((xs->xs_control & XS_CTL_POLL) ||
(chan->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) {
/* XXX: quite extreme */
kpause("xsbusy", false, hz, chan_mtx(chan));
} else if (!callout_pending(&periph->periph_callout)) {
scsipi_periph_freeze_locked(periph, 1);
callout_reset(&periph->periph_callout,
hz, scsipi_periph_timed_thaw, periph);
}
mutex_exit(chan_mtx(chan));
error = ERESTART;
} else
error = EBUSY;
break;
case XS_REQUEUE:
error = ERESTART;
break;
case XS_SELTIMEOUT:
case XS_TIMEOUT:
/*
* If the device hasn't gone away, honor retry counts.
*
* Note that if we're in the middle of probing it,
* it won't be found because it isn't here yet, so
* we won't honor the retry count in that case.
*/
if (scsipi_lookup_periph(chan, periph->periph_target,
periph->periph_lun) && xs->xs_retries != 0) {
xs->xs_retries--;
error = ERESTART;
} else
error = EIO;
break;
case XS_RESET:
if (xs->xs_control & XS_CTL_REQSENSE) {
/*
* request sense interrupted by reset: signal it
* with EINTR return code.
*/
error = EINTR;
} else {
if (xs->xs_retries != 0) {
xs->xs_retries--;
error = ERESTART;
} else
error = EIO;
}
break;
case XS_DRIVER_STUFFUP:
scsipi_printaddr(periph);
printf("generic HBA error\n");
error = EIO;
break;
default:
scsipi_printaddr(periph);
printf("invalid return code from adapter: %d\n", xs->error);
error = EIO;
break;
}
mutex_enter(chan_mtx(chan));
if (error == ERESTART) {
SDT_PROBE1(scsi, base, xfer, restart, xs);
/*
* If we get here, the periph has been thawed and frozen
* again if we had to issue recovery commands. Alternatively,
* it may have been frozen again and in a timed thaw. In
* any case, we thaw the periph once we re-enqueue the
* command. Once the periph is fully thawed, it will begin
* operation again.
*/
xs->error = XS_NOERROR;
xs->status = SCSI_OK;
xs->xs_status &= ~XS_STS_DONE;
xs->xs_requeuecnt++;
error = scsipi_enqueue(xs);
if (error == 0) {
scsipi_periph_thaw_locked(periph, 1);
mutex_exit(chan_mtx(chan));
return ERESTART;
}
}
/*
* scsipi_done() freezes the queue if not XS_NOERROR.
* Thaw it here.
*/
if (xs->error != XS_NOERROR)
scsipi_periph_thaw_locked(periph, 1);
mutex_exit(chan_mtx(chan));
if (periph->periph_switch->psw_done)
periph->periph_switch->psw_done(xs, error);
mutex_enter(chan_mtx(chan));
if (xs->xs_control & XS_CTL_ASYNC)
scsipi_put_xs(xs);
mutex_exit(chan_mtx(chan));
return error;
}
/*
* Issue a request sense for the given scsipi_xfer. Called when the xfer
* returns with a CHECK_CONDITION status. Must be called in valid thread
* context.
*/
static void
scsipi_request_sense(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
int flags, error;
struct scsi_request_sense cmd;
periph->periph_flags |= PERIPH_SENSE;
/* if command was polling, request sense will too */
flags = xs->xs_control & XS_CTL_POLL;
/* Polling commands can't sleep */
if (flags)
flags |= XS_CTL_NOSLEEP;
flags |= XS_CTL_REQSENSE | XS_CTL_URGENT | XS_CTL_DATA_IN |
XS_CTL_THAW_PERIPH | XS_CTL_FREEZE_PERIPH;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_REQUEST_SENSE;
cmd.length = sizeof(struct scsi_sense_data);
error = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)&xs->sense.scsi_sense, sizeof(struct scsi_sense_data),
0, 1000, NULL, flags);
periph->periph_flags &= ~PERIPH_SENSE;
periph->periph_xscheck = NULL;
switch (error) {
case 0:
/* we have a valid sense */
xs->error = XS_SENSE;
return;
case EINTR:
/* REQUEST_SENSE interrupted by bus reset. */
xs->error = XS_RESET;
return;
case EIO:
/* request sense couldn't be performed */
/*
* XXX this isn't quite right but we don't have anything
* better for now
*/
xs->error = XS_DRIVER_STUFFUP;
return;
default:
/* Notify that request sense failed. */
xs->error = XS_DRIVER_STUFFUP;
scsipi_printaddr(periph);
printf("request sense failed with error %d\n", error);
return;
}
}
/*
* scsipi_enqueue:
*
* Enqueue an xfer on a channel.
*/
static int
scsipi_enqueue(struct scsipi_xfer *xs)
{
struct scsipi_channel *chan = xs->xs_periph->periph_channel;
struct scsipi_xfer *qxs;
SDT_PROBE1(scsi, base, xfer, enqueue, xs);
/*
* If the xfer is to be polled, and there are already jobs on
* the queue, we can't proceed.
*/
KASSERT(mutex_owned(chan_mtx(chan)));
if ((xs->xs_control & XS_CTL_POLL) != 0 &&
TAILQ_FIRST(&chan->chan_queue) != NULL) {
xs->error = XS_DRIVER_STUFFUP;
return EAGAIN;
}
/*
* If we have an URGENT xfer, it's an error recovery command
* and it should just go on the head of the channel's queue.
*/
if (xs->xs_control & XS_CTL_URGENT) {
TAILQ_INSERT_HEAD(&chan->chan_queue, xs, channel_q);
goto out;
}
/*
* If this xfer has already been on the queue before, we
* need to reinsert it in the correct order. That order is:
*
* Immediately before the first xfer for this periph
* with a requeuecnt less than xs->xs_requeuecnt.
*
* Failing that, at the end of the queue. (We'll end up
* there naturally.)
*/
if (xs->xs_requeuecnt != 0) {
for (qxs = TAILQ_FIRST(&chan->chan_queue); qxs != NULL;
qxs = TAILQ_NEXT(qxs, channel_q)) {
if (qxs->xs_periph == xs->xs_periph &&
qxs->xs_requeuecnt < xs->xs_requeuecnt)
break;
}
if (qxs != NULL) {
TAILQ_INSERT_AFTER(&chan->chan_queue, qxs, xs,
channel_q);
goto out;
}
}
TAILQ_INSERT_TAIL(&chan->chan_queue, xs, channel_q);
out:
if (xs->xs_control & XS_CTL_THAW_PERIPH)
scsipi_periph_thaw_locked(xs->xs_periph, 1);
return 0;
}
/*
* scsipi_run_queue:
*
* Start as many xfers as possible running on the channel.
*/
static void
scsipi_run_queue(struct scsipi_channel *chan)
{
struct scsipi_xfer *xs;
struct scsipi_periph *periph;
SDT_PROBE1(scsi, base, queue, batch__start, chan);
for (;;) {
mutex_enter(chan_mtx(chan));
/*
* If the channel is frozen, we can't do any work right
* now.
*/
if (chan->chan_qfreeze != 0) {
mutex_exit(chan_mtx(chan));
break;
}
/*
* Look for work to do, and make sure we can do it.
*/
for (xs = TAILQ_FIRST(&chan->chan_queue); xs != NULL;
xs = TAILQ_NEXT(xs, channel_q)) {
periph = xs->xs_periph;
if ((periph->periph_sent >= periph->periph_openings) || periph->periph_qfreeze != 0 ||
(periph->periph_flags & PERIPH_UNTAG) != 0)
continue;
if ((periph->periph_flags & (PERIPH_RECOVERING | PERIPH_SENSE)) != 0 &&
(xs->xs_control & XS_CTL_URGENT) == 0)
continue;
/*
* We can issue this xfer!
*/
goto got_one;
}
/*
* Can't find any work to do right now.
*/
mutex_exit(chan_mtx(chan));
break;
got_one:
/*
* Have an xfer to run. Allocate a resource from
* the adapter to run it. If we can't allocate that
* resource, we don't dequeue the xfer.
*/
if (scsipi_get_resource(chan) == 0) {
/*
* Adapter is out of resources. If the adapter
* supports it, attempt to grow them.
*/
if (scsipi_grow_resources(chan) == 0) {
/*
* Wasn't able to grow resources,
* nothing more we can do.
*/
if (xs->xs_control & XS_CTL_POLL) {
scsipi_printaddr(xs->xs_periph);
printf("polling command but no "
"adapter resources");
/* We'll panic shortly... */
}
mutex_exit(chan_mtx(chan));
/*
* XXX: We should be able to note that
* XXX: resources are needed here!
*/
break;
}
/*
* scsipi_grow_resources() allocated the resource
* for us.
*/
}
/*
* We have a resource to run this xfer, do it!
*/
TAILQ_REMOVE(&chan->chan_queue, xs, channel_q);
/*
* If the command is to be tagged, allocate a tag ID
* for it.
*/
if (XS_CTL_TAGTYPE(xs) != 0)
scsipi_get_tag(xs);
else
periph->periph_flags |= PERIPH_UNTAG;
periph->periph_sent++;
mutex_exit(chan_mtx(chan));
SDT_PROBE2(scsi, base, queue, run, chan, xs);
scsipi_adapter_request(chan, ADAPTER_REQ_RUN_XFER, xs);
}
SDT_PROBE1(scsi, base, queue, batch__done, chan);
}
/*
* scsipi_execute_xs:
*
* Begin execution of an xfer, waiting for it to complete, if necessary.
*/
int
scsipi_execute_xs(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
struct scsipi_channel *chan = periph->periph_channel;
int oasync, async, poll, error;
KASSERT(!cold);
scsipi_update_timeouts(xs);
(chan->chan_bustype->bustype_cmd)(xs);
xs->xs_status &= ~XS_STS_DONE;
xs->error = XS_NOERROR;
xs->resid = xs->datalen;
xs->status = SCSI_OK;
SDT_PROBE1(scsi, base, xfer, execute, xs);
#ifdef SCSIPI_DEBUG
if (xs->xs_periph->periph_dbflags & SCSIPI_DB3) {
printf("scsipi_execute_xs: ");
show_scsipi_xs(xs);
printf("\n");
}
#endif
/*
* Deal with command tagging:
*
* - If the device's current operating mode doesn't
* include tagged queueing, clear the tag mask.
*
* - If the device's current operating mode *does*
* include tagged queueing, set the tag_type in
* the xfer to the appropriate byte for the tag
* message.
*/
if ((PERIPH_XFER_MODE(periph) & PERIPH_CAP_TQING) == 0 ||
(xs->xs_control & XS_CTL_REQSENSE)) {
xs->xs_control &= ~XS_CTL_TAGMASK;
xs->xs_tag_type = 0;
} else {
/*
* If the request doesn't specify a tag, give Head
* tags to URGENT operations and Simple tags to
* everything else.
*/
if (XS_CTL_TAGTYPE(xs) == 0) {
if (xs->xs_control & XS_CTL_URGENT)
xs->xs_control |= XS_CTL_HEAD_TAG;
else
xs->xs_control |= XS_CTL_SIMPLE_TAG;
}
switch (XS_CTL_TAGTYPE(xs)) {
case XS_CTL_ORDERED_TAG:
xs->xs_tag_type = MSG_ORDERED_Q_TAG;
break;
case XS_CTL_SIMPLE_TAG:
xs->xs_tag_type = MSG_SIMPLE_Q_TAG;
break;
case XS_CTL_HEAD_TAG:
xs->xs_tag_type = MSG_HEAD_OF_Q_TAG;
break;
default:
scsipi_printaddr(periph);
printf("invalid tag mask 0x%08x\n",
XS_CTL_TAGTYPE(xs));
panic("scsipi_execute_xs");
}
}
/* If the adapter wants us to poll, poll. */
if (chan->chan_adapter->adapt_flags & SCSIPI_ADAPT_POLL_ONLY)
xs->xs_control |= XS_CTL_POLL;
/*
* If we don't yet have a completion thread, or we are to poll for
* completion, clear the ASYNC flag.
*/
oasync = (xs->xs_control & XS_CTL_ASYNC);
if (chan->chan_thread == NULL || (xs->xs_control & XS_CTL_POLL) != 0)
xs->xs_control &= ~XS_CTL_ASYNC;
async = (xs->xs_control & XS_CTL_ASYNC);
poll = (xs->xs_control & XS_CTL_POLL);
#ifdef DIAGNOSTIC
if (oasync != 0 && xs->bp == NULL)
panic("scsipi_execute_xs: XS_CTL_ASYNC but no buf");
#endif
/*
* Enqueue the transfer. If we're not polling for completion, this
* should ALWAYS return `no error'.
*/
error = scsipi_enqueue(xs);
if (error) {
if (poll == 0) {
scsipi_printaddr(periph);
printf("not polling, but enqueue failed with %d\n",
error);
panic("scsipi_execute_xs");
}
scsipi_printaddr(periph);
printf("should have flushed queue?\n");
goto free_xs;
}
mutex_exit(chan_mtx(chan));
restarted:
scsipi_run_queue(chan);
mutex_enter(chan_mtx(chan));
/*
* The xfer is enqueued, and possibly running. If it's to be
* completed asynchronously, just return now.
*/
if (async)
return 0;
/*
* Not an asynchronous command; wait for it to complete.
*/
while ((xs->xs_status & XS_STS_DONE) == 0) {
if (poll) {
scsipi_printaddr(periph);
printf("polling command not done\n");
panic("scsipi_execute_xs");
}
cv_wait(xs_cv(xs), chan_mtx(chan));
}
/*
* Command is complete. scsipi_done() has awakened us to perform
* the error handling.
*/
mutex_exit(chan_mtx(chan));
error = scsipi_complete(xs);
if (error == ERESTART)
goto restarted;
/*
* If it was meant to run async and we cleared async ourselves,
* don't return an error here. It has already been handled.
*/
if (oasync)
error = 0;
/*
* Command completed successfully or fatal error occurred. Fall
* into....
*/
mutex_enter(chan_mtx(chan));
free_xs:
scsipi_put_xs(xs);
mutex_exit(chan_mtx(chan));
/*
* Kick the queue, keep it running in case it stopped for some
* reason.
*/
scsipi_run_queue(chan);
mutex_enter(chan_mtx(chan));
return error;
}
/*
* scsipi_completion_thread:
*
* This is the completion thread. We wait for errors on
* asynchronous xfers, and perform the error handling
* function, restarting the command, if necessary.
*/
static void
scsipi_completion_thread(void *arg)
{
struct scsipi_channel *chan = arg;
struct scsipi_xfer *xs;
if (chan->chan_init_cb)
(*chan->chan_init_cb)(chan, chan->chan_init_cb_arg);
mutex_enter(chan_mtx(chan));
chan->chan_flags |= SCSIPI_CHAN_TACTIVE;
for (;;) {
xs = TAILQ_FIRST(&chan->chan_complete);
if (xs == NULL && chan->chan_tflags == 0) {
/* nothing to do; wait */
cv_wait(chan_cv_complete(chan), chan_mtx(chan));
continue;
}
if (chan->chan_tflags & SCSIPI_CHANT_CALLBACK) {
/* call chan_callback from thread context */
chan->chan_tflags &= ~SCSIPI_CHANT_CALLBACK;
chan->chan_callback(chan, chan->chan_callback_arg);
continue;
}
if (chan->chan_tflags & SCSIPI_CHANT_GROWRES) {
/* attempt to get more openings for this channel */
chan->chan_tflags &= ~SCSIPI_CHANT_GROWRES;
mutex_exit(chan_mtx(chan));
scsipi_adapter_request(chan,
ADAPTER_REQ_GROW_RESOURCES, NULL);
scsipi_channel_thaw(chan, 1);
if (chan->chan_tflags & SCSIPI_CHANT_GROWRES)
kpause("scsizzz", FALSE, hz/10, NULL);
mutex_enter(chan_mtx(chan));
continue;
}
if (chan->chan_tflags & SCSIPI_CHANT_KICK) {
/* explicitly run the queues for this channel */
chan->chan_tflags &= ~SCSIPI_CHANT_KICK;
mutex_exit(chan_mtx(chan));
scsipi_run_queue(chan);
mutex_enter(chan_mtx(chan));
continue;
}
if (chan->chan_tflags & SCSIPI_CHANT_SHUTDOWN) {
break;
}
if (xs) {
TAILQ_REMOVE(&chan->chan_complete, xs, channel_q);
mutex_exit(chan_mtx(chan));
/*
* Have an xfer with an error; process it.
*/
(void) scsipi_complete(xs);
/*
* Kick the queue; keep it running if it was stopped
* for some reason.
*/
scsipi_run_queue(chan);
mutex_enter(chan_mtx(chan));
}
}
chan->chan_thread = NULL;
/* In case parent is waiting for us to exit. */
cv_broadcast(chan_cv_thread(chan));
mutex_exit(chan_mtx(chan));
kthread_exit(0);
}
/*
* scsipi_thread_call_callback:
*
* request to call a callback from the completion thread
*/
int
scsipi_thread_call_callback(struct scsipi_channel *chan,
void (*callback)(struct scsipi_channel *, void *), void *arg)
{
mutex_enter(chan_mtx(chan));
if ((chan->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) {
/* kernel thread doesn't exist yet */
mutex_exit(chan_mtx(chan));
return ESRCH;
}
if (chan->chan_tflags & SCSIPI_CHANT_CALLBACK) {
mutex_exit(chan_mtx(chan));
return EBUSY;
}
scsipi_channel_freeze(chan, 1);
chan->chan_callback = callback;
chan->chan_callback_arg = arg;
chan->chan_tflags |= SCSIPI_CHANT_CALLBACK;
cv_broadcast(chan_cv_complete(chan));
mutex_exit(chan_mtx(chan));
return 0;
}
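#if 0
/*
 * Illustrative sketch only (not compiled): deferring blocking work to the
 * completion thread via scsipi_thread_call_callback().  The channel is
 * frozen on the caller's behalf; the callback runs in thread context and
 * is responsible for thawing it again.  The function names are
 * hypothetical.
 */
static void
example_reset_work(struct scsipi_channel *chan, void *arg)
{
	/* thread context: safe to sleep here */
	scsipi_channel_thaw(chan, 1);	/* undo the freeze taken for us */
}

static void
example_defer(struct scsipi_channel *chan)
{
	if (scsipi_thread_call_callback(chan, example_reset_work, NULL) != 0)
		printf("could not schedule completion-thread callback\n");
}
#endif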
/*
* scsipi_async_event:
*
* Handle an asynchronous event from an adapter.
*/
void
scsipi_async_event(struct scsipi_channel *chan, scsipi_async_event_t event,
void *arg)
{
bool lock = chan_running(chan) > 0;
if (lock)
mutex_enter(chan_mtx(chan));
switch (event) {
case ASYNC_EVENT_MAX_OPENINGS:
scsipi_async_event_max_openings(chan,
(struct scsipi_max_openings *)arg);
break;
case ASYNC_EVENT_XFER_MODE:
if (chan->chan_bustype->bustype_async_event_xfer_mode) {
chan->chan_bustype->bustype_async_event_xfer_mode(
chan, arg);
}
break;
case ASYNC_EVENT_RESET:
scsipi_async_event_channel_reset(chan);
break;
}
if (lock)
mutex_exit(chan_mtx(chan));
}
/*
* scsipi_async_event_max_openings:
*
* Update the maximum number of outstanding commands a
* device may have.
*/
static void
scsipi_async_event_max_openings(struct scsipi_channel *chan,
struct scsipi_max_openings *mo)
{
struct scsipi_periph *periph;
int minlun, maxlun;
if (mo->mo_lun == -1) {
/*
* Wildcarded; apply it to all LUNs.
*/
minlun = 0;
maxlun = chan->chan_nluns - 1;
} else
minlun = maxlun = mo->mo_lun;
/* XXX This could really suck with a large LUN space. */
for (; minlun <= maxlun; minlun++) {
periph = scsipi_lookup_periph_locked(chan, mo->mo_target, minlun);
if (periph == NULL)
continue;
if (mo->mo_openings < periph->periph_openings)
periph->periph_openings = mo->mo_openings;
else if (mo->mo_openings > periph->periph_openings &&
(periph->periph_flags & PERIPH_GROW_OPENINGS) != 0)
periph->periph_openings = mo->mo_openings;
}
}
/*
* scsipi_set_xfer_mode:
*
* Set the xfer mode for the specified I_T Nexus.
*/
void
scsipi_set_xfer_mode(struct scsipi_channel *chan, int target, int immed)
{
struct scsipi_xfer_mode xm;
struct scsipi_periph *itperiph;
int lun;
/*
* Go to the minimal xfer mode.
*/
xm.xm_target = target;
xm.xm_mode = 0;
xm.xm_period = 0; /* ignored */
xm.xm_offset = 0; /* ignored */
/*
* Find the first LUN we know about on this I_T Nexus.
*/
for (itperiph = NULL, lun = 0; lun < chan->chan_nluns; lun++) {
itperiph = scsipi_lookup_periph(chan, target, lun);
if (itperiph != NULL)
break;
}
if (itperiph != NULL) {
xm.xm_mode = itperiph->periph_cap;
/*
* Now issue the request to the adapter.
*/
scsipi_adapter_request(chan, ADAPTER_REQ_SET_XFER_MODE, &xm);
/*
* If we want this to happen immediately, issue a dummy
* command, since most adapters can't really negotiate unless
* they're executing a job.
*/
if (immed != 0) {
(void) scsipi_test_unit_ready(itperiph,
XS_CTL_DISCOVERY | XS_CTL_IGNORE_ILLEGAL_REQUEST |
XS_CTL_IGNORE_NOT_READY |
XS_CTL_IGNORE_MEDIA_CHANGE);
}
}
}
/*
* scsipi_channel_reset:
*
* handle scsi bus reset
* called with channel lock held
*/
static void
scsipi_async_event_channel_reset(struct scsipi_channel *chan)
{
struct scsipi_xfer *xs, *xs_next;
struct scsipi_periph *periph;
int target, lun;
/*
* Channel has been reset. Also mark pending REQUEST_SENSE commands
* as reset, since their sense data is no longer available. We
* can't call scsipi_done() from here, as the command has not been
* sent to the adapter yet (this would corrupt accounting).
*/
for (xs = TAILQ_FIRST(&chan->chan_queue); xs != NULL; xs = xs_next) {
xs_next = TAILQ_NEXT(xs, channel_q);
if (xs->xs_control & XS_CTL_REQSENSE) {
TAILQ_REMOVE(&chan->chan_queue, xs, channel_q);
xs->error = XS_RESET;
if ((xs->xs_control & XS_CTL_ASYNC) != 0)
TAILQ_INSERT_TAIL(&chan->chan_complete, xs,
channel_q);
}
}
cv_broadcast(chan_cv_complete(chan));
/* Catch xs with pending sense which may not have a REQSENSE xs yet */
for (target = 0; target < chan->chan_ntargets; target++) {
if (target == chan->chan_id)
continue;
for (lun = 0; lun < chan->chan_nluns; lun++) {
periph = scsipi_lookup_periph_locked(chan, target, lun);
if (periph) {
xs = periph->periph_xscheck;
if (xs)
xs->error = XS_RESET;
}
}
}
}
/*
* scsipi_target_detach:
*
* detach all periphs associated with an I_T nexus
* must be called from valid thread context
*/
int
scsipi_target_detach(struct scsipi_channel *chan, int target, int lun,
int flags)
{
struct scsipi_periph *periph;
device_t tdev;
int ctarget, mintarget, maxtarget;
int clun, minlun, maxlun;
int error = 0;
if (target == -1) {
mintarget = 0;
maxtarget = chan->chan_ntargets;
} else {
if (target == chan->chan_id)
return EINVAL;
if (target < 0 || target >= chan->chan_ntargets)
return EINVAL;
mintarget = target;
maxtarget = target + 1;
}
if (lun == -1) {
minlun = 0;
maxlun = chan->chan_nluns;
} else {
if (lun < 0 || lun >= chan->chan_nluns)
return EINVAL;
minlun = lun;
maxlun = lun + 1;
}
/* for config_detach */
KERNEL_LOCK(1, curlwp);
mutex_enter(chan_mtx(chan));
for (ctarget = mintarget; ctarget < maxtarget; ctarget++) {
if (ctarget == chan->chan_id)
continue;
for (clun = minlun; clun < maxlun; clun++) {
periph = scsipi_lookup_periph_locked(chan, ctarget, clun);
if (periph == NULL)
continue;
tdev = periph->periph_dev;
mutex_exit(chan_mtx(chan));
error = config_detach(tdev, flags);
if (error)
goto out;
mutex_enter(chan_mtx(chan));
KASSERT(scsipi_lookup_periph_locked(chan, ctarget, clun) == NULL);
}
}
mutex_exit(chan_mtx(chan));
out:
KERNEL_UNLOCK_ONE(curlwp);
return error;
}
/*
* scsipi_adapter_addref:
*
* Add a reference to the adapter pointed to by the provided
* link, enabling the adapter if necessary.
*/
int
scsipi_adapter_addref(struct scsipi_adapter *adapt)
{
int error = 0;
if (atomic_inc_uint_nv(&adapt->adapt_refcnt) == 1
&& adapt->adapt_enable != NULL) {
scsipi_adapter_lock(adapt);
error = scsipi_adapter_enable(adapt, 1);
scsipi_adapter_unlock(adapt);
if (error)
atomic_dec_uint(&adapt->adapt_refcnt);
}
return error;
}
/*
* scsipi_adapter_delref:
*
* Delete a reference to the adapter pointed to by the provided
* link, disabling the adapter if possible.
*/
void
scsipi_adapter_delref(struct scsipi_adapter *adapt)
{
membar_release();
if (atomic_dec_uint_nv(&adapt->adapt_refcnt) == 0
&& adapt->adapt_enable != NULL) {
membar_acquire();
scsipi_adapter_lock(adapt);
(void) scsipi_adapter_enable(adapt, 0);
scsipi_adapter_unlock(adapt);
}
}
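#if 0
/*
 * Illustrative sketch only (not compiled): the reference-count discipline
 * for the two routines above.  Bracket any period during which the adapter
 * must stay enabled with addref/delref and propagate the enable error.
 * The function name is hypothetical.
 */
static int
example_use_adapter(struct scsipi_periph *periph)
{
	struct scsipi_adapter *adapt = periph->periph_channel->chan_adapter;
	int error;

	error = scsipi_adapter_addref(adapt);
	if (error != 0)
		return error;
	/* ... issue commands while the reference is held ... */
	scsipi_adapter_delref(adapt);
	return 0;
}
#endif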
static struct scsipi_syncparam {
int ss_factor;
int ss_period; /* ns * 100 */
} scsipi_syncparams[] = {
{ 0x08, 625 }, /* FAST-160 (Ultra320) */
{ 0x09, 1250 }, /* FAST-80 (Ultra160) */
{ 0x0a, 2500 }, /* FAST-40 40MHz (Ultra2) */
{ 0x0b, 3030 }, /* FAST-40 33MHz (Ultra2) */
{ 0x0c, 5000 }, /* FAST-20 (Ultra) */
};
static const int scsipi_nsyncparams =
sizeof(scsipi_syncparams) / sizeof(scsipi_syncparams[0]);
int
scsipi_sync_period_to_factor(int period /* ns * 100 */)
{
int i;
for (i = 0; i < scsipi_nsyncparams; i++) {
if (period <= scsipi_syncparams[i].ss_period)
return scsipi_syncparams[i].ss_factor;
}
return (period / 100) / 4;
}
int
scsipi_sync_factor_to_period(int factor)
{
int i;
for (i = 0; i < scsipi_nsyncparams; i++) {
if (factor == scsipi_syncparams[i].ss_factor)
return scsipi_syncparams[i].ss_period;
}
return (factor * 4) * 100;
}
int
scsipi_sync_factor_to_freq(int factor)
{
int i;
for (i = 0; i < scsipi_nsyncparams; i++) {
if (factor == scsipi_syncparams[i].ss_factor)
return 100000000 / scsipi_syncparams[i].ss_period;
}
return 10000000 / ((factor * 4) * 10);
}
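/*
 * Worked example of the conversions above (illustrative only): sync factor
 * 0x0a maps to a period of 2500, i.e. 25.00 ns, and hence to
 * 100000000 / 2500 = 40000 kHz (FAST-40/Ultra2).  A factor outside the
 * table, say 50, falls back to the generic formulas: period is
 * 50 * 4 * 100 = 20000 (200 ns) and frequency is 10000000 / (50 * 4 * 10)
 * = 5000 kHz.
 */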
static inline void
scsipi_adapter_lock(struct scsipi_adapter *adapt)
{
if ((adapt->adapt_flags & SCSIPI_ADAPT_MPSAFE) == 0)
KERNEL_LOCK(1, NULL);
}
static inline void
scsipi_adapter_unlock(struct scsipi_adapter *adapt)
{
if ((adapt->adapt_flags & SCSIPI_ADAPT_MPSAFE) == 0)
KERNEL_UNLOCK_ONE(NULL);
}
void
scsipi_adapter_minphys(struct scsipi_channel *chan, struct buf *bp)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
scsipi_adapter_lock(adapt);
(adapt->adapt_minphys)(bp);
scsipi_adapter_unlock(chan->chan_adapter);
}
void
scsipi_adapter_request(struct scsipi_channel *chan,
scsipi_adapter_req_t req, void *arg)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
scsipi_adapter_lock(adapt);
SDT_PROBE3(scsi, base, adapter, request__start, chan, req, arg);
(adapt->adapt_request)(chan, req, arg);
SDT_PROBE3(scsi, base, adapter, request__done, chan, req, arg);
scsipi_adapter_unlock(adapt);
}
int
scsipi_adapter_ioctl(struct scsipi_channel *chan, u_long cmd,
void *data, int flag, struct proc *p)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
int error;
if (adapt->adapt_ioctl == NULL)
return ENOTTY;
scsipi_adapter_lock(adapt);
error = (adapt->adapt_ioctl)(chan, cmd, data, flag, p);
scsipi_adapter_unlock(adapt);
return error;
}
int
scsipi_adapter_enable(struct scsipi_adapter *adapt, int enable)
{
int error;
scsipi_adapter_lock(adapt);
error = (adapt->adapt_enable)(adapt->adapt_dev, enable);
scsipi_adapter_unlock(adapt);
return error;
}
#ifdef SCSIPI_DEBUG
/*
* Given a scsipi_xfer, dump the request, in all its glory
*/
void
show_scsipi_xs(struct scsipi_xfer *xs)
{
printf("xs(%p): ", xs);
printf("xs_control(0x%08x)", xs->xs_control);
printf("xs_status(0x%08x)", xs->xs_status);
printf("periph(%p)", xs->xs_periph);
printf("retr(0x%x)", xs->xs_retries);
printf("timo(0x%x)", xs->timeout);
printf("cmd(%p)", xs->cmd);
printf("len(0x%x)", xs->cmdlen);
printf("data(%p)", xs->data);
printf("len(0x%x)", xs->datalen);
printf("res(0x%x)", xs->resid);
printf("err(0x%x)", xs->error);
printf("bp(%p)", xs->bp);
show_scsipi_cmd(xs);
}
void
show_scsipi_cmd(struct scsipi_xfer *xs)
{
u_char *b = (u_char *) xs->cmd;
int i = 0;
scsipi_printaddr(xs->xs_periph);
printf(" command: ");
if ((xs->xs_control & XS_CTL_RESET) == 0) {
while (i < xs->cmdlen) {
if (i)
printf(",");
printf("0x%x", b[i++]);
}
printf("-[%d bytes]\n", xs->datalen);
if (xs->datalen)
show_mem(xs->data, uimin(64, xs->datalen));
} else
printf("-RESET-\n");
}
void
show_mem(u_char *address, int num)
{
int x;
printf("------------------------------");
for (x = 0; x < num; x++) {
if ((x % 16) == 0)
printf("\n%03d: ", x);
printf("%02x ", *address++);
}
printf("\n------------------------------\n");
}
#endif /* SCSIPI_DEBUG */
/* $NetBSD: kern_lock.c,v 1.188 2024/01/14 11:46:05 andvar Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2009, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_lock.c,v 1.188 2024/01/14 11:46:05 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_lockdebug.h"
#endif
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/syslog.h>
#include <sys/atomic.h>
#include <sys/lwp.h>
#include <sys/pserialize.h>
#if defined(DIAGNOSTIC) && !defined(LOCKDEBUG)
#include <sys/ksyms.h>
#endif
#include <machine/lock.h>
#include <dev/lockstat.h>
#define RETURN_ADDRESS (uintptr_t)__builtin_return_address(0)
bool kernel_lock_dodebug;
__cpu_simple_lock_t kernel_lock[CACHE_LINE_SIZE / sizeof(__cpu_simple_lock_t)]
__cacheline_aligned;
void
assert_sleepable(void)
{
const char *reason;
long pctr;
bool idle;
if (__predict_false(panicstr != NULL)) {
return;
}
LOCKDEBUG_BARRIER(kernel_lock, 1);
/*
* Avoid disabling/re-enabling preemption here since this
* routine may be called in delicate situations.
*/
do {
pctr = lwp_pctr();
idle = CURCPU_IDLE_P();
} while (__predict_false(pctr != lwp_pctr()));
reason = NULL;
if (__predict_false(idle) && !cold) {
reason = "idle";
goto panic;
}
if (__predict_false(cpu_intr_p())) {
reason = "interrupt";
goto panic;
}
if (__predict_false(cpu_softintr_p())) {
reason = "softint";
goto panic;
}
if (__predict_false(!pserialize_not_in_read_section())) {
reason = "pserialize";
goto panic;
}
return;
panic: panic("%s: %s caller=%p", __func__, reason, (void *)RETURN_ADDRESS);
}
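/*
 * Illustrative sketch (not part of the original source): a typical
 * caller asserts sleepability before doing something that may block.
 * It assumes the ASSERT_SLEEPABLE() wrapper from <sys/systm.h>, which
 * is expected to end up in assert_sleepable() above.
 */
#if 0
static void *
example_alloc_blocking(size_t len)
{
	ASSERT_SLEEPABLE();		/* must not be in interrupt/softint/pserialize context */
	return kmem_alloc(len, KM_SLEEP);	/* may sleep waiting for memory */
}
#endif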
/*
* Functions for manipulating the kernel_lock. We put them here
* so that they show up in profiles.
*/
#define _KERNEL_LOCK_ABORT(msg) \
LOCKDEBUG_ABORT(__func__, __LINE__, kernel_lock, &_kernel_lock_ops, msg)
#ifdef LOCKDEBUG
#define _KERNEL_LOCK_ASSERT(cond) \
do { \
if (!(cond)) \
_KERNEL_LOCK_ABORT("assertion failed: " #cond); \
} while (/* CONSTCOND */ 0)
#else
#define _KERNEL_LOCK_ASSERT(cond) /* nothing */
#endif
static void _kernel_lock_dump(const volatile void *, lockop_printer_t);
lockops_t _kernel_lock_ops = {
.lo_name = "Kernel lock",
.lo_type = LOCKOPS_SPIN,
.lo_dump = _kernel_lock_dump,
};
#ifdef LOCKDEBUG
#ifdef DDB
#include <ddb/ddb.h>
#endif
static void
kernel_lock_trace_ipi(void *cookie)
{
printf("%s[%d %s]: hogging kernel lock\n", cpu_name(curcpu()),
curlwp->l_lid,
curlwp->l_name ? curlwp->l_name : curproc->p_comm);
#ifdef DDB
db_stacktrace();
#endif
}
#endif
/*
* Initialize the kernel lock.
*/
void
kernel_lock_init(void)
{
__cpu_simple_lock_init(kernel_lock);
kernel_lock_dodebug = LOCKDEBUG_ALLOC(kernel_lock, &_kernel_lock_ops,
RETURN_ADDRESS);
}
CTASSERT(CACHE_LINE_SIZE >= sizeof(__cpu_simple_lock_t));
/*
* Print debugging information about the kernel lock.
*/
static void
_kernel_lock_dump(const volatile void *junk, lockop_printer_t pr)
{
struct cpu_info *ci = curcpu();
(void)junk;
pr("curcpu holds : %18d wanted by: %#018lx\n",
ci->ci_biglock_count, (long)ci->ci_biglock_wanted);
}
/*
* Acquire 'nlocks' holds on the kernel lock.
*
* Although it may not look it, this is one of the most central, intricate
* routines in the kernel, and tons of code elsewhere depends on its exact
* behaviour. If you change something in here, expect it to bite you in the
* rear.
*/
void
_kernel_lock(int nlocks)
{
struct cpu_info *ci;
LOCKSTAT_TIMER(spintime);
LOCKSTAT_FLAG(lsflag);
struct lwp *owant;
#ifdef LOCKDEBUG
static struct cpu_info *kernel_lock_holder;
u_int spins = 0;
u_int starttime = getticks();
#endif
int s;
struct lwp *l = curlwp;
_KERNEL_LOCK_ASSERT(nlocks > 0);
s = splvm();
ci = curcpu();
if (ci->ci_biglock_count != 0) {
_KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));
ci->ci_biglock_count += nlocks;
l->l_blcnt += nlocks;
splx(s);
return;
}
_KERNEL_LOCK_ASSERT(l->l_blcnt == 0);
LOCKDEBUG_WANTLOCK(kernel_lock_dodebug, kernel_lock, RETURN_ADDRESS,
0);
if (__predict_true(__cpu_simple_lock_try(kernel_lock))) {
#ifdef LOCKDEBUG
kernel_lock_holder = curcpu();
#endif
ci->ci_biglock_count = nlocks;
l->l_blcnt = nlocks;
LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
RETURN_ADDRESS, 0);
splx(s);
return;
}
/*
* To remove the ordering constraint between adaptive mutexes
* and kernel_lock we must make it appear as if this thread is
* blocking. For non-interlocked mutex release, a store fence
* is required to ensure that the result of any mutex_exit()
* by the current LWP becomes visible on the bus before the set
* of ci->ci_biglock_wanted becomes visible.
*
* This membar_producer matches the membar_consumer in
* mutex_vector_enter.
*
* That way, if l has just released a mutex, mutex_vector_enter
* can't see this store ci->ci_biglock_wanted := l until it
* will also see the mutex_exit store mtx->mtx_owner := 0 which
* clears the has-waiters bit.
*/
membar_producer();
owant = ci->ci_biglock_wanted;
atomic_store_relaxed(&ci->ci_biglock_wanted, l);
#if defined(DIAGNOSTIC) && !defined(LOCKDEBUG)
l->l_ld_wanted = __builtin_return_address(0);
#endif
/*
* Spin until we acquire the lock. Once we have it, record the
* time spent with lockstat.
*/
LOCKSTAT_ENTER(lsflag);
LOCKSTAT_START_TIMER(lsflag, spintime);
do {
splx(s);
while (__SIMPLELOCK_LOCKED_P(kernel_lock)) {
#ifdef LOCKDEBUG
if (SPINLOCK_SPINOUT(spins) && start_init_exec &&
(getticks() - starttime) > 10*hz) {
ipi_msg_t msg = {
.func = kernel_lock_trace_ipi,
};
kpreempt_disable();
ipi_unicast(&msg, kernel_lock_holder);
ipi_wait(&msg);
kpreempt_enable();
_KERNEL_LOCK_ABORT("spinout");
}
#endif
SPINLOCK_BACKOFF_HOOK;
SPINLOCK_SPIN_HOOK;
}
s = splvm();
} while (!__cpu_simple_lock_try(kernel_lock));
ci->ci_biglock_count = nlocks;
l->l_blcnt = nlocks;
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
RETURN_ADDRESS, 0);
if (owant == NULL) {
LOCKSTAT_EVENT_RA(lsflag, kernel_lock,
LB_KERNEL_LOCK | LB_SPIN, 1, spintime, RETURN_ADDRESS);
}
LOCKSTAT_EXIT(lsflag);
splx(s);
/*
* Now that we have kernel_lock, reset ci_biglock_wanted. This
* store must be visible on other CPUs before a mutex_exit() on
* this CPU can test the has-waiters bit.
*
* This membar_enter matches the membar_enter in
* mutex_vector_enter. (Yes, not membar_exit -- the legacy
* naming is confusing, but store-before-load usually pairs
* with store-before-load, in the extremely rare cases where it
* is used at all.)
*
* That way, mutex_vector_enter can't see this store
* ci->ci_biglock_wanted := owant until it has set the
* has-waiters bit.
*/
(void)atomic_swap_ptr(&ci->ci_biglock_wanted, owant);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
membar_enter();
#endif
#ifdef LOCKDEBUG
kernel_lock_holder = curcpu();
#endif
}
/*
* Release 'nlocks' holds on the kernel lock. If 'nlocks' is zero, release
* all holds.
*/
void
_kernel_unlock(int nlocks, int *countp)
{
struct cpu_info *ci;
u_int olocks;
int s;
struct lwp *l = curlwp;
_KERNEL_LOCK_ASSERT(nlocks < 2);
olocks = l->l_blcnt;
if (olocks == 0) {
_KERNEL_LOCK_ASSERT(nlocks <= 0);
if (countp != NULL)
*countp = 0;
return;
}
_KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));
if (nlocks == 0)
nlocks = olocks;
else if (nlocks == -1) {
nlocks = 1;
_KERNEL_LOCK_ASSERT(olocks == 1);
}
s = splvm();
ci = curcpu();
_KERNEL_LOCK_ASSERT(ci->ci_biglock_count >= l->l_blcnt);
if (ci->ci_biglock_count == nlocks) {
LOCKDEBUG_UNLOCKED(kernel_lock_dodebug, kernel_lock,
RETURN_ADDRESS, 0);
ci->ci_biglock_count = 0;
__cpu_simple_unlock(kernel_lock);
l->l_blcnt -= nlocks;
splx(s);
if (l->l_dopreempt)
kpreempt(0);
} else {
ci->ci_biglock_count -= nlocks;
l->l_blcnt -= nlocks;
splx(s);
}
if (countp != NULL)
*countp = olocks;
}
bool
_kernel_locked_p(void)
{
return __SIMPLELOCK_LOCKED_P(kernel_lock);
}
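/*
 * Illustrative sketch (not part of the original source): the usual
 * pattern for code that needs a temporary hold on the big lock.  It
 * assumes the KERNEL_LOCK()/KERNEL_UNLOCK_ONE() wrappers from
 * <sys/systm.h>, which end up in _kernel_lock()/_kernel_unlock()
 * above.  Holds nest: a second KERNEL_LOCK() on the same CPU only
 * bumps ci_biglock_count.
 */
#if 0
static void
example_with_biglock(void)
{
	KERNEL_LOCK(1, NULL);		/* take one hold, spinning if another CPU owns it */

	/* ... call code that is not yet MP-safe ... */

	KERNEL_UNLOCK_ONE(NULL);	/* drop exactly the hold taken above */
}
#endif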
/* $NetBSD: kern_proc.c,v 1.274 2023/10/05 19:41:07 ad Exp $ */
/*-
* Copyright (c) 1999, 2006, 2007, 2008, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_proc.c 8.7 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_proc.c,v 1.274 2023/10/05 19:41:07 ad Exp $");
#ifdef _KERNEL_OPT
#include "opt_kstack.h"
#include "opt_maxuprc.h"
#include "opt_dtrace.h"
#include "opt_compat_netbsd32.h"
#include "opt_kaslr.h"
#endif
#if defined(__HAVE_COMPAT_NETBSD32) && !defined(COMPAT_NETBSD32) \
&& !defined(_RUMPKERNEL)
#define COMPAT_NETBSD32
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/buf.h>
#include <sys/acct.h>
#include <sys/wait.h>
#include <sys/file.h>
#include <ufs/ufs/quota.h>
#include <sys/uio.h>
#include <sys/pool.h>
#include <sys/pset.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/signalvar.h>
#include <sys/ras.h>
#include <sys/filedesc.h>
#include <sys/syscall_stats.h>
#include <sys/kauth.h>
#include <sys/sleepq.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/dtrace_bsd.h>
#include <sys/sysctl.h>
#include <sys/exec.h>
#include <sys/cpu.h>
#include <sys/compat_stub.h>
#include <sys/futex.h>
#include <sys/pserialize.h>
#include <uvm/uvm_extern.h>
/*
* Process lists.
*/
struct proclist allproc __cacheline_aligned;
struct proclist zombproc __cacheline_aligned;
kmutex_t proc_lock __cacheline_aligned;
static pserialize_t proc_psz;
/*
* pid to lwp/proc lookup is done by indexing the pid_table array.
* Since pid numbers are only allocated when an empty slot
* has been found, there is no need to search any lists ever.
* (an orphaned pgrp will lock the slot, a session will lock
* the pgrp with the same number.)
* If the table is too small it is reallocated with twice the
* previous size and the entries 'unzipped' into the two halves.
* A linked list of free entries is passed through the pt_lwp
* field of 'free' items - set odd to be an invalid ptr. Two
* additional bits are also used to indicate if the slot is
* currently occupied by a proc or lwp, and if the PID is
* hidden from certain kinds of lookups. We thus require a
* minimum alignment for proc and lwp structures (LWPs are
* at least 32-byte aligned).
*/
struct pid_table {
uintptr_t pt_slot;
struct pgrp *pt_pgrp;
pid_t pt_pid;
};
#define PT_F_FREE ((uintptr_t)__BIT(0))
#define PT_F_LWP 0 /* pseudo-flag */
#define PT_F_PROC ((uintptr_t)__BIT(1))
#define PT_F_TYPEBITS (PT_F_FREE|PT_F_PROC)
#define PT_F_ALLBITS (PT_F_FREE|PT_F_PROC)
#define PT_VALID(s) (((s) & PT_F_FREE) == 0)
#define PT_RESERVED(s) ((s) == 0)
#define PT_NEXT(s) ((u_int)(s) >> 1)
#define PT_SET_FREE(pid) (((pid) << 1) | PT_F_FREE)
#define PT_SET_LWP(l) ((uintptr_t)(l))
#define PT_SET_PROC(p) (((uintptr_t)(p)) | PT_F_PROC)
#define PT_SET_RESERVED 0
#define PT_GET_LWP(s) ((struct lwp *)((s) & ~PT_F_ALLBITS))
#define PT_GET_PROC(s) ((struct proc *)((s) & ~PT_F_ALLBITS))
#define PT_GET_TYPE(s) ((s) & PT_F_TYPEBITS)
#define PT_IS_LWP(s) (PT_GET_TYPE(s) == PT_F_LWP && (s) != 0)
#define PT_IS_PROC(s) (PT_GET_TYPE(s) == PT_F_PROC)
#define MIN_PROC_ALIGNMENT (PT_F_ALLBITS + 1)
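/*
 * Illustrative sketch (not part of the original source): how the tag
 * bits in pt_slot are used.  Because proc and lwp structures are at
 * least MIN_PROC_ALIGNMENT-aligned, the two low bits of the pointer
 * are free to carry the FREE/PROC type information.
 */
#if 0
static void
example_slot_encoding(struct lwp *l, struct proc *p)
{
	uintptr_t slot;

	slot = PT_SET_LWP(l);		/* low bits 00: slot holds an LWP */
	KASSERT(PT_IS_LWP(slot) && PT_GET_LWP(slot) == l);

	slot = PT_SET_PROC(p);		/* PT_F_PROC set: slot holds a proc */
	KASSERT(PT_IS_PROC(slot) && PT_GET_PROC(slot) == p);

	slot = PT_SET_FREE(246);	/* free slot: next free index, tagged */
	KASSERT(!PT_VALID(slot) && PT_NEXT(slot) == 246);
}
#endif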
/*
* Table of process IDs (PIDs).
*/
static struct pid_table *pid_table __read_mostly;
#define INITIAL_PID_TABLE_SIZE (1 << 5)
/* Table mask, threshold for growing and number of allocated PIDs. */
static u_int pid_tbl_mask __read_mostly;
static u_int pid_alloc_lim __read_mostly;
static u_int pid_alloc_cnt __cacheline_aligned;
/* Next free, last free and maximum PIDs. */
static u_int next_free_pt __cacheline_aligned;
static u_int last_free_pt __cacheline_aligned;
static pid_t pid_max __read_mostly;
/* Components of the first process -- never freed. */
struct session session0 = {
.s_count = 1,
.s_sid = 0,
};
struct pgrp pgrp0 = {
.pg_members = LIST_HEAD_INITIALIZER(&pgrp0.pg_members),
.pg_session = &session0,
};
filedesc_t filedesc0;
struct cwdinfo cwdi0 = {
.cwdi_cmask = CMASK,
.cwdi_refcnt = 1,
};
struct plimit limit0;
struct pstats pstat0;
struct vmspace vmspace0;
struct sigacts sigacts0;
struct proc proc0 = {
.p_lwps = LIST_HEAD_INITIALIZER(&proc0.p_lwps),
.p_sigwaiters = LIST_HEAD_INITIALIZER(&proc0.p_sigwaiters),
.p_nlwps = 1,
.p_nrlwps = 1,
.p_pgrp = &pgrp0,
.p_comm = "system",
/*
* Set P_NOCLDWAIT so that kernel threads are reparented to init(8)
* when they exit. init(8) can easily wait them out for us.
*/
.p_flag = PK_SYSTEM | PK_NOCLDWAIT,
.p_stat = SACTIVE,
.p_nice = NZERO,
.p_emul = &emul_netbsd,
.p_cwdi = &cwdi0,
.p_limit = &limit0,
.p_fd = &filedesc0,
.p_vmspace = &vmspace0,
.p_stats = &pstat0,
.p_sigacts = &sigacts0,
#ifdef PROC0_MD_INITIALIZERS
PROC0_MD_INITIALIZERS
#endif
};
kauth_cred_t cred0;
static const int nofile = NOFILE;
static const int maxuprc = MAXUPRC;
static int sysctl_doeproc(SYSCTLFN_PROTO);
static int sysctl_kern_proc_args(SYSCTLFN_PROTO);
static int sysctl_security_expose_address(SYSCTLFN_PROTO);
#ifdef KASLR
static int kern_expose_address = 0;
#else
static int kern_expose_address = 1;
#endif
/*
* The process list descriptors, used during pid allocation and
* by sysctl. No locking on this data structure is needed since
* it is completely static.
*/
const struct proclist_desc proclists[] = {
{ &allproc },
{ &zombproc },
{ NULL },
};
static struct pgrp * pg_remove(pid_t);
static void pg_delete(pid_t);
static void orphanpg(struct pgrp *);
static specificdata_domain_t proc_specificdata_domain;
static pool_cache_t proc_cache;
static kauth_listener_t proc_listener;
static void fill_proc(const struct proc *, struct proc *, bool);
static int fill_pathname(struct lwp *, pid_t, void *, size_t *);
static int fill_cwd(struct lwp *, pid_t, void *, size_t *);
static int
proc_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result;
result = KAUTH_RESULT_DEFER;
p = arg0;
switch (action) {
case KAUTH_PROCESS_CANSEE: {
enum kauth_process_req req;
req = (enum kauth_process_req)(uintptr_t)arg1;
switch (req) {
case KAUTH_REQ_PROCESS_CANSEE_ARGS:
case KAUTH_REQ_PROCESS_CANSEE_ENTRY:
case KAUTH_REQ_PROCESS_CANSEE_OPENFILES:
case KAUTH_REQ_PROCESS_CANSEE_EPROC:
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_PROCESS_CANSEE_ENV:
if (kauth_cred_getuid(cred) !=
kauth_cred_getuid(p->p_cred) || kauth_cred_getuid(cred) !=
kauth_cred_getsvuid(p->p_cred))
break;
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_PROCESS_CANSEE_KPTR:
if (!kern_expose_address)
break;
if (kern_expose_address == 1 && !(p->p_flag & PK_KMEM))
break;
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
}
case KAUTH_PROCESS_FORK: {
int lnprocs = (int)(unsigned long)arg2;
/*
* Don't allow a nonprivileged user to use the last few
* processes. The variable lnprocs is the current number of
* processes, maxproc is the limit.
*/
if (__predict_false((lnprocs >= maxproc - 5)))
break;
result = KAUTH_RESULT_ALLOW;
break;
}
case KAUTH_PROCESS_CORENAME:
case KAUTH_PROCESS_STOPFLAG:
if (proc_uidmatch(cred, p->p_cred) == 0)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return result;
}
static int
proc_ctor(void *arg __unused, void *obj, int flags __unused)
{
struct proc *p = obj;
memset(p, 0, sizeof(*p));
klist_init(&p->p_klist);
/*
* There is no need for a proc_dtor() to do a klist_fini(),
* since knote_proc_exit() ensures that p->p_klist is empty
* when a process exits.
*/
return 0;
}
static pid_t proc_alloc_pid_slot(struct proc *, uintptr_t);
/*
* Initialize global process hashing structures.
*/
void
procinit(void)
{
const struct proclist_desc *pd;
u_int i;
#define LINK_EMPTY ((PID_MAX + INITIAL_PID_TABLE_SIZE) & ~(INITIAL_PID_TABLE_SIZE - 1))
for (pd = proclists; pd->pd_list != NULL; pd++)
LIST_INIT(pd->pd_list);
mutex_init(&proc_lock, MUTEX_DEFAULT, IPL_NONE);
proc_psz = pserialize_create();
pid_table = kmem_alloc(INITIAL_PID_TABLE_SIZE
* sizeof(struct pid_table), KM_SLEEP);
pid_tbl_mask = INITIAL_PID_TABLE_SIZE - 1;
pid_max = PID_MAX;
/* Set free list running through table...
Preset 'use count' above PID_MAX so we allocate pid 1 next. */
for (i = 0; i <= pid_tbl_mask; i++) {
pid_table[i].pt_slot = PT_SET_FREE(LINK_EMPTY + i + 1);
pid_table[i].pt_pgrp = 0;
pid_table[i].pt_pid = 0;
}
/* slot 0 is just grabbed */
next_free_pt = 1;
/* Need to fix last entry. */
last_free_pt = pid_tbl_mask;
pid_table[last_free_pt].pt_slot = PT_SET_FREE(LINK_EMPTY);
/* point at which we grow table - to avoid reusing pids too often */
pid_alloc_lim = pid_tbl_mask - 1;
#undef LINK_EMPTY
/* Reserve PID 1 for init(8). */ /* XXX slightly gross */
mutex_enter(&proc_lock);
if (proc_alloc_pid_slot(&proc0, PT_SET_RESERVED) != 1)
panic("failed to reserve PID 1 for init(8)");
mutex_exit(&proc_lock);
proc_specificdata_domain = specificdata_domain_create();
KASSERT(proc_specificdata_domain != NULL);
size_t proc_alignment = coherency_unit;
if (proc_alignment < MIN_PROC_ALIGNMENT)
proc_alignment = MIN_PROC_ALIGNMENT;
proc_cache = pool_cache_init(sizeof(struct proc), proc_alignment, 0, 0,
"procpl", NULL, IPL_NONE, proc_ctor, NULL, NULL);
proc_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
proc_listener_cb, NULL);
}
void
procinit_sysctl(void)
{
static struct sysctllog *clog;
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "expose_address",
SYSCTL_DESCR("Enable exposing kernel addresses"),
sysctl_security_expose_address, 0,
&kern_expose_address, 0, CTL_KERN, CTL_CREATE, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "proc",
SYSCTL_DESCR("System-wide process information"),
sysctl_doeproc, 0, NULL, 0,
CTL_KERN, KERN_PROC, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "proc2",
SYSCTL_DESCR("Machine-independent process information"),
sysctl_doeproc, 0, NULL, 0,
CTL_KERN, KERN_PROC2, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "proc_args",
SYSCTL_DESCR("Process argument information"),
sysctl_kern_proc_args, 0, NULL, 0,
CTL_KERN, KERN_PROC_ARGS, CTL_EOL);
/*
"nodes" under these:
KERN_PROC_ALL
KERN_PROC_PID pid
KERN_PROC_PGRP pgrp
KERN_PROC_SESSION sess
KERN_PROC_TTY tty
KERN_PROC_UID uid
KERN_PROC_RUID uid
KERN_PROC_GID gid
KERN_PROC_RGID gid
all in all, probably not worth the effort...
*/
}
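/*
 * Illustrative sketch (not part of the original source, and not
 * kernel code): how a userland program would consume the kern.proc2
 * node created above via sysctl(3).  The MIB layout is
 * { CTL_KERN, KERN_PROC2, op, arg, elem_size, elem_count }.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>

static int
example_fetch_proc2(pid_t pid, struct kinfo_proc2 *kp)
{
	int mib[6] = { CTL_KERN, KERN_PROC2, KERN_PROC_PID, pid,
	    sizeof(*kp), 1 };
	size_t len = sizeof(*kp);

	/* Returns -1 with errno set on failure, e.g. ESRCH. */
	return sysctl(mib, 6, kp, &len, NULL, 0);
}
#endif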
/*
* Initialize process 0.
*/
void
proc0_init(void)
{
struct proc *p;
struct pgrp *pg;
struct rlimit *rlim;
rlim_t lim;
int i;
p = &proc0;
pg = &pgrp0;
mutex_init(&p->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
mutex_init(&p->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
p->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
rw_init(&p->p_reflock);
cv_init(&p->p_waitcv, "wait");
cv_init(&p->p_lwpcv, "lwpwait");
LIST_INSERT_HEAD(&p->p_lwps, &lwp0, l_sibling);
KASSERT(lwp0.l_lid == 0);
pid_table[lwp0.l_lid].pt_slot = PT_SET_LWP(&lwp0);
LIST_INSERT_HEAD(&allproc, p, p_list);
pid_table[lwp0.l_lid].pt_pgrp = pg;
LIST_INSERT_HEAD(&pg->pg_members, p, p_pglist);
#ifdef __HAVE_SYSCALL_INTERN
(*p->p_emul->e_syscall_intern)(p);
#endif
/* Create credentials. */
cred0 = kauth_cred_alloc();
p->p_cred = cred0;
/* Create the CWD info. */
rw_init(&cwdi0.cwdi_lock);
/* Create the limits structures. */
mutex_init(&limit0.pl_lock, MUTEX_DEFAULT, IPL_NONE);
rlim = limit0.pl_rlimit;
for (i = 0; i < __arraycount(limit0.pl_rlimit); i++) {
rlim[i].rlim_cur = RLIM_INFINITY;
rlim[i].rlim_max = RLIM_INFINITY;
}
rlim[RLIMIT_NOFILE].rlim_max = maxfiles;
rlim[RLIMIT_NOFILE].rlim_cur = maxfiles < nofile ? maxfiles : nofile;
rlim[RLIMIT_NPROC].rlim_max = maxproc;
rlim[RLIMIT_NPROC].rlim_cur = maxproc < maxuprc ? maxproc : maxuprc;
lim = MIN(VM_MAXUSER_ADDRESS, ctob((rlim_t)uvm_availmem(false)));
rlim[RLIMIT_RSS].rlim_max = lim;
rlim[RLIMIT_MEMLOCK].rlim_max = lim;
rlim[RLIMIT_MEMLOCK].rlim_cur = lim / 3;
rlim[RLIMIT_NTHR].rlim_max = maxlwp;
rlim[RLIMIT_NTHR].rlim_cur = maxlwp / 2;
/* Note that default core name has zero length. */
limit0.pl_corename = defcorename;
limit0.pl_cnlen = 0;
limit0.pl_refcnt = 1;
limit0.pl_writeable = false;
limit0.pl_sv_limit = NULL;
/* Configure virtual memory system, set vm rlimits. */
uvm_init_limits(p);
/* Initialize file descriptor table for proc0. */
fd_init(&filedesc0);
/*
* Initialize proc0's vmspace, which uses the kernel pmap.
* All kernel processes (which never have user space mappings)
* share proc0's vmspace, and thus, the kernel pmap.
*/
uvmspace_init(&vmspace0, pmap_kernel(), round_page(VM_MIN_ADDRESS),
trunc_page(VM_MAXUSER_ADDRESS),
#ifdef __USE_TOPDOWN_VM
true
#else
false
#endif
);
/* Initialize signal state for proc0. XXX IPL_SCHED */
mutex_init(&p->p_sigacts->sa_mutex, MUTEX_DEFAULT, IPL_SCHED);
siginit(p);
proc_initspecific(p);
kdtrace_proc_ctor(NULL, p);
}
/*
* Session reference counting.
*/
void
proc_sesshold(struct session *ss)
{
KASSERT(mutex_owned(&proc_lock));
ss->s_count++;
}
void
proc_sessrele(struct session *ss)
{
struct pgrp *pg;
KASSERT(mutex_owned(&proc_lock));
KASSERT(ss->s_count > 0);
/*
* We keep the pgrp with the same id as the session in order to
* stop a process being given the same pid. Since the pgrp holds
* a reference to the session, it must be a 'zombie' pgrp by now.
*/
if (--ss->s_count == 0) {
pg = pg_remove(ss->s_sid);
} else {
pg = NULL;
ss = NULL;
}
mutex_exit(&proc_lock);
if (pg)
kmem_free(pg, sizeof(struct pgrp));
if (ss)
kmem_free(ss, sizeof(struct session));
}
/*
* Check that the specified process group is in the session of the
* specified process.
* Treats -ve ids as process ids.
* Used to validate TIOCSPGRP requests.
*/
int
pgid_in_session(struct proc *p, pid_t pg_id)
{
struct pgrp *pgrp;
struct session *session;
int error;
if (pg_id == INT_MIN)
return EINVAL;
mutex_enter(&proc_lock);
if (pg_id < 0) {
struct proc *p1 = proc_find(-pg_id);
if (p1 == NULL) {
error = EINVAL;
goto fail;
}
pgrp = p1->p_pgrp;
} else {
pgrp = pgrp_find(pg_id);
if (pgrp == NULL) {
error = EINVAL;
goto fail;
}
}
session = pgrp->pg_session;
error = (session != p->p_pgrp->pg_session) ? EPERM : 0;
fail:
mutex_exit(&proc_lock);
return error;
}
/*
* p_inferior: is p an inferior of q?
*/
static inline bool
p_inferior(struct proc *p, struct proc *q)
{
KASSERT(mutex_owned(&proc_lock));
for (; p != q; p = p->p_pptr)
if (p->p_pid == 0)
return false;
return true;
}
/*
* proc_find_lwp: locate an lwp in said proc by the ID.
*
* => Must be called with p::p_lock held.
* => LSIDL lwps are not returned because they are only partially
* constructed while occupying the slot.
* => Callers need to be careful about lwp::l_stat of the returned
* lwp.
*/
struct lwp *
proc_find_lwp(proc_t *p, pid_t pid)
{
struct pid_table *pt;
unsigned pt_mask;
struct lwp *l = NULL;
uintptr_t slot;
int s;
KASSERT(mutex_owned(p->p_lock));
/*
* Look in the pid_table. This is done unlocked inside a
* pserialize read section covering pid_table's memory
* allocation only, so take care to read things in the correct
* order:
*
* 1. First read the table mask -- this only ever increases, in
* expand_pid_table, so a stale value is safely
* conservative.
*
* 2. Next read the pid table -- this is always set _before_
* the mask increases, so if we see a new table and stale
* mask, the mask is still valid for the table.
*/
s = pserialize_read_enter();
pt_mask = atomic_load_acquire(&pid_tbl_mask);
pt = &atomic_load_consume(&pid_table)[pid & pt_mask];
slot = atomic_load_consume(&pt->pt_slot);
if (__predict_false(!PT_IS_LWP(slot))) {
pserialize_read_exit(s);
return NULL;
}
/*
* Check to see if the LWP is from the correct process. We won't
* see entries in pid_table from a prior process that also used "p",
* by virtue of the fact that allocating "p" means all prior updates
* to dependent data structures are visible to this thread.
*/
l = PT_GET_LWP(slot);
if (__predict_false(atomic_load_relaxed(&l->l_proc) != p)) {
pserialize_read_exit(s);
return NULL;
}
/*
* We now know that p->p_lock holds this LWP stable.
*
* If the status is not LSIDL, it means the LWP is intended to be
* findable by LID and l_lid cannot change behind us.
*
* No need to acquire the LWP's lock to check for LSIDL, as
* p->p_lock must be held to transition in and out of LSIDL.
* Any other observed state is of no particular interest.
*/
pserialize_read_exit(s);
return l->l_stat != LSIDL && l->l_lid == pid ? l : NULL;
}
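/*
 * Illustrative sketch (not part of the original source): looking up an
 * LWP by ID in a process whose p_lock the caller already holds, per
 * the contract described above.
 */
#if 0
static bool
example_lwp_exists(struct proc *p, pid_t lid)
{
	struct lwp *l;

	KASSERT(mutex_owned(p->p_lock));
	l = proc_find_lwp(p, lid);

	/* l (if found) stays stable for as long as p_lock is held. */
	return l != NULL;
}
#endif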
/*
* proc_find_lwp_unlocked: locate an lwp in said proc by the ID.
*
* => Called in a pserialize read section with no locks held.
* => LSIDL lwps are not returned because they are only partially
* constructed while occupying the slot.
* => Callers need to be careful about lwp::l_stat of the returned
* lwp.
* => If an LWP is found, it's returned locked.
*/
struct lwp *
proc_find_lwp_unlocked(proc_t *p, pid_t pid)
{
struct pid_table *pt;
unsigned pt_mask;
struct lwp *l = NULL;
uintptr_t slot;
KASSERT(pserialize_in_read_section());
/*
* Look in the pid_table. This is done unlocked inside a
* pserialize read section covering pid_table's memory
* allocation only, so take care to read things in the correct
* order:
*
* 1. First read the table mask -- this only ever increases, in
* expand_pid_table, so a stale value is safely
* conservative.
*
* 2. Next read the pid table -- this is always set _before_
* the mask increases, so if we see a new table and stale
* mask, the mask is still valid for the table.
*/
pt_mask = atomic_load_acquire(&pid_tbl_mask);
pt = &atomic_load_consume(&pid_table)[pid & pt_mask];
slot = atomic_load_consume(&pt->pt_slot);
if (__predict_false(!PT_IS_LWP(slot))) {
return NULL;
}
/*
* Lock the LWP we found to get it stable. If it's embryonic or
* reaped (LSIDL) then none of the other fields can safely be
* checked.
*/
l = PT_GET_LWP(slot);
lwp_lock(l);
if (__predict_false(l->l_stat == LSIDL)) {
lwp_unlock(l);
return NULL;
}
/*
* l_proc and l_lid are now known stable because the LWP is not
* LSIDL, so check those fields too to make sure we found the
* right thing.
*/
if (__predict_false(l->l_proc != p || l->l_lid != pid)) {
lwp_unlock(l);
return NULL;
}
/* Everything checks out, return it locked. */
return l;
}
/*
* proc_find_lwp_acquire_proc: locate an lwp and acquire a lock
* on its containing proc.
*
* => Similar to proc_find_lwp(), but does not require you to have
* the proc a priori.
* => Also returns proc * to caller, with p::p_lock held.
* => Same caveats apply.
*/
struct lwp *
proc_find_lwp_acquire_proc(pid_t pid, struct proc **pp)
{
struct pid_table *pt;
struct proc *p = NULL;
struct lwp *l = NULL;
uintptr_t slot;
KASSERT(pp != NULL);
mutex_enter(&proc_lock);
pt = &pid_table[pid & pid_tbl_mask];
slot = pt->pt_slot;
if (__predict_true(PT_IS_LWP(slot) && pt->pt_pid == pid)) {
l = PT_GET_LWP(slot);
p = l->l_proc;
mutex_enter(p->p_lock);
if (__predict_false(l->l_stat == LSIDL)) {
mutex_exit(p->p_lock);
l = NULL;
p = NULL;
}
}
mutex_exit(&proc_lock);
KASSERT(p == NULL || mutex_owned(p->p_lock));
*pp = p;
return l;
}
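/*
 * Illustrative sketch (not part of the original source): using
 * proc_find_lwp_acquire_proc() when the caller has only an LWP ID and
 * no proc pointer.  On success the containing proc is returned with
 * p_lock held, which the caller must release.
 */
#if 0
static int
example_with_lwp(pid_t lid)
{
	struct proc *p;
	struct lwp *l;

	l = proc_find_lwp_acquire_proc(lid, &p);
	if (l == NULL)
		return ESRCH;

	/* ... operate on l while p->p_lock keeps it stable ... */

	mutex_exit(p->p_lock);
	return 0;
}
#endif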
/*
* proc_find_raw_pid_table_locked: locate a process by the ID.
*
* => Must be called with proc_lock held.
*/
static proc_t *
proc_find_raw_pid_table_locked(pid_t pid, bool any_lwpid)
{
struct pid_table *pt;
proc_t *p = NULL;
uintptr_t slot;
/* No - used by DDB. KASSERT(mutex_owned(&proc_lock)); */
pt = &pid_table[pid & pid_tbl_mask];
slot = pt->pt_slot;
if (__predict_true(PT_IS_LWP(slot) && pt->pt_pid == pid)) {
/*
* When looking up processes, require a direct match
* on the PID assigned to the proc, not just one of
* its LWPs.
*
* N.B. We require lwp::l_proc of LSIDL LWPs to be
* valid here.
*/
p = PT_GET_LWP(slot)->l_proc;
if (__predict_false(p->p_pid != pid && !any_lwpid))
p = NULL;
} else if (PT_IS_PROC(slot) && pt->pt_pid == pid) {
p = PT_GET_PROC(slot);
}
return p;
}
proc_t *
proc_find_raw(pid_t pid)
{
return proc_find_raw_pid_table_locked(pid, false);
}
static proc_t *
proc_find_internal(pid_t pid, bool any_lwpid)
{
proc_t *p;
KASSERT(mutex_owned(&proc_lock));
p = proc_find_raw_pid_table_locked(pid, any_lwpid);
if (__predict_false(p == NULL)) {
return NULL;
}
/*
* Only allow live processes to be found by PID.
* XXX: p_stat might change, since proc unlocked.
*/
if (__predict_true(p->p_stat == SACTIVE || p->p_stat == SSTOP)) {
return p;
}
return NULL;
}
proc_t *
proc_find(pid_t pid)
{
return proc_find_internal(pid, false);
}
proc_t *
proc_find_lwpid(pid_t pid)
{
return proc_find_internal(pid, true);
}
/*
* pgrp_find: locate a process group by the ID.
*
* => Must be called with proc_lock held.
*/
struct pgrp *
pgrp_find(pid_t pgid)
{
struct pgrp *pg;
KASSERT(mutex_owned(&proc_lock));
pg = pid_table[pgid & pid_tbl_mask].pt_pgrp;
/*
* Cannot look up a process group that only exists because the
* session has not died yet (traditional).
*/
if (pg == NULL || pg->pg_id != pgid || LIST_EMPTY(&pg->pg_members)) {
return NULL;
}
return pg;
}
static void
expand_pid_table(void)
{
size_t pt_size, tsz;
struct pid_table *n_pt, *new_pt;
uintptr_t slot;
struct pgrp *pgrp;
pid_t pid, rpid;
u_int i;
uint new_pt_mask;
KASSERT(mutex_owned(&proc_lock));
/* Unlock the pid_table briefly to allocate memory. */
pt_size = pid_tbl_mask + 1;
mutex_exit(&proc_lock);
tsz = pt_size * 2 * sizeof(struct pid_table);
new_pt = kmem_alloc(tsz, KM_SLEEP);
new_pt_mask = pt_size * 2 - 1;
/* XXX For now. The practical limit is much lower anyway. */
KASSERT(new_pt_mask <= FUTEX_TID_MASK);
mutex_enter(&proc_lock);
if (pt_size != pid_tbl_mask + 1) {
/* Another process beat us to it... */
mutex_exit(&proc_lock);
kmem_free(new_pt, tsz);
goto out;
}
/*
* Copy entries from old table into new one.
* If 'pid' is 'odd' we need to place in the upper half,
* even pid's to the lower half.
* Free items stay in the low half so we don't have to
* fixup the reference to them.
* We stuff free items on the front of the freelist
* because we can't write to unmodified entries.
* Processing the table backwards maintains a semblance
* of issuing pid numbers that increase with time.
*/
i = pt_size - 1;
n_pt = new_pt + i;
for (; ; i--, n_pt--) {
slot = pid_table[i].pt_slot;
pgrp = pid_table[i].pt_pgrp;
if (!PT_VALID(slot)) {
/* Up 'use count' so that link is valid */
pid = (PT_NEXT(slot) + pt_size) & ~pt_size;
rpid = 0;
slot = PT_SET_FREE(pid);
if (pgrp)
pid = pgrp->pg_id;
} else {
pid = pid_table[i].pt_pid;
rpid = pid;
}
/* Save entry in appropriate half of table */
n_pt[pid & pt_size].pt_slot = slot;
n_pt[pid & pt_size].pt_pgrp = pgrp;
n_pt[pid & pt_size].pt_pid = rpid;
/* Put other piece on start of free list */
pid = (pid ^ pt_size) & ~pid_tbl_mask;
n_pt[pid & pt_size].pt_slot =
PT_SET_FREE((pid & ~pt_size) | next_free_pt);
n_pt[pid & pt_size].pt_pgrp = 0;
n_pt[pid & pt_size].pt_pid = 0;
next_free_pt = i | (pid & pt_size);
if (i == 0)
break;
}
/* Save old table size and switch tables */
tsz = pt_size * sizeof(struct pid_table);
n_pt = pid_table;
atomic_store_release(&pid_table, new_pt);
KASSERT(new_pt_mask >= pid_tbl_mask);
atomic_store_release(&pid_tbl_mask, new_pt_mask);
/*
* pid_max starts as PID_MAX (= 30000), once we have 16384
* allocated pids we need it to be larger!
*/
if (pid_tbl_mask > PID_MAX) {
pid_max = pid_tbl_mask * 2 + 1;
pid_alloc_lim |= pid_alloc_lim << 1;
} else
pid_alloc_lim <<= 1; /* doubles number of free slots... */
mutex_exit(&proc_lock);
/*
* Make sure that unlocked access to the old pid_table is complete
* and then free it.
*/
pserialize_perform(proc_psz);
kmem_free(n_pt, tsz);
out: /* Return with proc_lock held again. */
mutex_enter(&proc_lock);
}
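/*
 * Worked example (not part of the original source) of the "unzip"
 * above: with an old table of size 4, index 2 serves pids 2, 6, 10,
 * 14, ...  After doubling to size 8, pids with bit 2 clear (2, 10,
 * 18, ...) stay at index 2, while pids with bit 2 set (6, 14, ...)
 * move to index 6 = 2 + 4.  Free entries remain in the lower half so
 * the existing free-list links stay valid.
 */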
struct proc *
proc_alloc(void)
{
struct proc *p;
p = pool_cache_get(proc_cache, PR_WAITOK);
p->p_stat = SIDL; /* protect against others */
proc_initspecific(p);
kdtrace_proc_ctor(NULL, p);
/*
* Allocate a placeholder in the pid_table. When we create the
* first LWP for this process, it will take ownership of the
* slot.
*/
if (__predict_false(proc_alloc_pid(p) == -1)) {
/* Allocating the PID failed; unwind. */
proc_finispecific(p);
proc_free_mem(p);
p = NULL;
}
return p;
}
/*
* proc_alloc_pid_slot: allocate PID and record the occupant so that
* proc_find_raw() can find it by the PID.
*/
static pid_t __noinline
proc_alloc_pid_slot(struct proc *p, uintptr_t slot)
{
struct pid_table *pt;
pid_t pid;
int nxt;
KASSERT(mutex_owned(&proc_lock));
for (;; expand_pid_table()) {
if (__predict_false(pid_alloc_cnt >= pid_alloc_lim)) {
/* ensure pids cycle through 2000+ values */
continue;
}
/*
* The first user process *must* be given PID 1.
* it has already been reserved for us. This
* will be coming in from the proc_alloc() call
* above, and the entry will be usurped later when
* the first user LWP is created.
* XXX this is slightly gross.
*/
if (__predict_false(PT_RESERVED(pid_table[1].pt_slot) &&
p != &proc0)) {
KASSERT(PT_IS_PROC(slot));
pt = &pid_table[1];
pt->pt_slot = slot;
return 1;
}
pt = &pid_table[next_free_pt];
#ifdef DIAGNOSTIC
if (__predict_false(PT_VALID(pt->pt_slot) || pt->pt_pgrp))
panic("proc_alloc: slot busy");
#endif
nxt = PT_NEXT(pt->pt_slot);
if (nxt & pid_tbl_mask)
break;
/* Table full - expand (NB last entry not used....) */
}
/* pid is 'saved use count' + 'size' + entry */
pid = (nxt & ~pid_tbl_mask) + pid_tbl_mask + 1 + next_free_pt;
if ((uint)pid > (uint)pid_max)
pid &= pid_tbl_mask;
next_free_pt = nxt & pid_tbl_mask;
/* XXX For now. The practical limit is much lower anyway. */
KASSERT(pid <= FUTEX_TID_MASK);
/* Grab table slot */
pt->pt_slot = slot;
KASSERT(pt->pt_pid == 0);
pt->pt_pid = pid;
pid_alloc_cnt++;
return pid;
}
pid_t
proc_alloc_pid(struct proc *p)
{
pid_t pid;
KASSERT((((uintptr_t)p) & PT_F_ALLBITS) == 0);
KASSERT(p->p_stat == SIDL);
mutex_enter(&proc_lock);
pid = proc_alloc_pid_slot(p, PT_SET_PROC(p));
if (pid != -1)
p->p_pid = pid;
mutex_exit(&proc_lock);
return pid;
}
pid_t
proc_alloc_lwpid(struct proc *p, struct lwp *l)
{
struct pid_table *pt;
pid_t pid;
KASSERT((((uintptr_t)l) & PT_F_ALLBITS) == 0);
KASSERT(l->l_proc == p);
KASSERT(l->l_stat == LSIDL);
/*
* For unlocked lookup in proc_find_lwp(), make sure l->l_proc
* is globally visible before the LWP becomes visible via the
* pid_table.
*/
#ifndef __HAVE_ATOMIC_AS_MEMBAR
membar_producer();
#endif
/*
* If the slot for p->p_pid currently points to the proc,
* then we should usurp this ID for the LWP. This happens
* at least once per process (for the first LWP), and can
* happen again if the first LWP for a process exits and
* before the process creates another.
*/
mutex_enter(&proc_lock);
pid = p->p_pid;
pt = &pid_table[pid & pid_tbl_mask];
KASSERT(pt->pt_pid == pid);
if (PT_IS_PROC(pt->pt_slot)) {
KASSERT(PT_GET_PROC(pt->pt_slot) == p);
l->l_lid = pid;
pt->pt_slot = PT_SET_LWP(l);
} else {
/* Need to allocate a new slot. */
pid = proc_alloc_pid_slot(p, PT_SET_LWP(l));
if (pid != -1)
l->l_lid = pid;
}
mutex_exit(&proc_lock);
return pid;
}
static void __noinline
proc_free_pid_internal(pid_t pid, uintptr_t type __diagused)
{
struct pid_table *pt;
KASSERT(mutex_owned(&proc_lock));
pt = &pid_table[pid & pid_tbl_mask];
KASSERT(PT_GET_TYPE(pt->pt_slot) == type);
KASSERT(pt->pt_pid == pid);
/* save pid use count in slot */
pt->pt_slot = PT_SET_FREE(pid & ~pid_tbl_mask);
pt->pt_pid = 0;
if (pt->pt_pgrp == NULL) {
/* link last freed entry onto ours */
pid &= pid_tbl_mask;
pt = &pid_table[last_free_pt];
pt->pt_slot = PT_SET_FREE(PT_NEXT(pt->pt_slot) | pid);
pt->pt_pid = 0;
last_free_pt = pid;
pid_alloc_cnt--;
}
}
/*
* Free a process id - called from proc_free (in kern_exit.c)
*
* Called with the proc_lock held.
*/
void
proc_free_pid(pid_t pid)
{
KASSERT(mutex_owned(&proc_lock));
proc_free_pid_internal(pid, PT_F_PROC);
}
/*
* Free a process id used by an LWP. If this was the process's
* first LWP, we convert the slot to point to the process; the
* entry will get cleaned up later when the process finishes exiting.
*
* If not, then it's the same as proc_free_pid().
*/
void
proc_free_lwpid(struct proc *p, pid_t pid)
{
KASSERT(mutex_owned(&proc_lock));
if (__predict_true(p->p_pid == pid)) {
struct pid_table *pt;
pt = &pid_table[pid & pid_tbl_mask];
KASSERT(pt->pt_pid == pid);
KASSERT(PT_IS_LWP(pt->pt_slot));
KASSERT(PT_GET_LWP(pt->pt_slot)->l_proc == p);
pt->pt_slot = PT_SET_PROC(p);
return;
}
proc_free_pid_internal(pid, PT_F_LWP);
}
void
proc_free_mem(struct proc *p)
{
kdtrace_proc_dtor(NULL, p);
pool_cache_put(proc_cache, p);
}
/*
* proc_enterpgrp: move p to a new or existing process group (and session).
*
* If we are creating a new pgrp, the pgid should equal
* the calling process' pid.
* It is only valid to enter a process group that is in the session
* of the process.
* Also, mksess should only be set if we are creating a process group.
*
* Only called from sys_setsid, sys_setpgid and posix_spawn/spawn_return.
*/
int
proc_enterpgrp(struct proc *curp, pid_t pid, pid_t pgid, bool mksess)
{
struct pgrp *new_pgrp, *pgrp;
struct session *sess;
struct proc *p;
int rval;
pid_t pg_id = NO_PGID;
/* Allocate data areas we might need before doing any validity checks */
sess = mksess ? kmem_alloc(sizeof(*sess), KM_SLEEP) : NULL;
new_pgrp = kmem_alloc(sizeof(*new_pgrp), KM_SLEEP);
mutex_enter(&proc_lock);
rval = EPERM; /* most common error (to save typing) */
/* Check pgrp exists or can be created */
pgrp = pid_table[pgid & pid_tbl_mask].pt_pgrp;
if (pgrp != NULL && pgrp->pg_id != pgid)
goto done;
/* Can only set another process under restricted circumstances. */
if (pid != curp->p_pid) {
/* Must exist and be one of our children... */
p = proc_find_internal(pid, false);
if (p == NULL || !p_inferior(p, curp)) {
rval = ESRCH;
goto done;
}
/* ... in the same session... */
if (sess != NULL || p->p_session != curp->p_session)
goto done;
/* ... existing pgid must be in same session ... */
if (pgrp != NULL && pgrp->pg_session != p->p_session)
goto done;
/* ... and not done an exec. */
if (p->p_flag & PK_EXEC) {
rval = EACCES;
goto done;
}
} else {
/* ... setsid() cannot re-enter a pgrp */
if (mksess && (curp->p_pgid == curp->p_pid || pgrp_find(curp->p_pid)))
goto done;
p = curp;
}
/* Changing the process group/session of a session
leader is definitely off limits. */
if (SESS_LEADER(p)) {
if (sess == NULL && p->p_pgrp == pgrp)
/* unless it's a definite noop */
rval = 0;
goto done;
}
/* Can only create a process group with id of process */
if (pgrp == NULL && pgid != pid)
goto done;
/* Can only create a session if creating pgrp */
if (sess != NULL && pgrp != NULL)
goto done;
/* Check we allocated memory for a pgrp... */
if (pgrp == NULL && new_pgrp == NULL)
goto done;
/* Don't attach to 'zombie' pgrp */
if (pgrp != NULL && LIST_EMPTY(&pgrp->pg_members))
goto done;
/* Expect to succeed now */
rval = 0;
if (pgrp == p->p_pgrp)
/* nothing to do */
goto done;
/* Ok all setup, link up required structures */
if (pgrp == NULL) {
pgrp = new_pgrp;
new_pgrp = NULL;
if (sess != NULL) {
sess->s_sid = p->p_pid;
sess->s_leader = p;
sess->s_count = 1;
sess->s_ttyvp = NULL;
sess->s_ttyp = NULL;
sess->s_flags = p->p_session->s_flags & ~S_LOGIN_SET;
memcpy(sess->s_login, p->p_session->s_login,
sizeof(sess->s_login));
p->p_lflag &= ~PL_CONTROLT;
} else {
sess = p->p_pgrp->pg_session;
proc_sesshold(sess);
}
pgrp->pg_session = sess;
sess = NULL;
pgrp->pg_id = pgid;
LIST_INIT(&pgrp->pg_members);
#ifdef DIAGNOSTIC
if (__predict_false(pid_table[pgid & pid_tbl_mask].pt_pgrp))
panic("enterpgrp: pgrp table slot in use");
if (__predict_false(mksess && p != curp))
panic("enterpgrp: mksession and p != curproc");
#endif
pid_table[pgid & pid_tbl_mask].pt_pgrp = pgrp;
pgrp->pg_jobc = 0;
}
/*
* Adjust eligibility of affected pgrps to participate in job control.
* Increment eligibility counts before decrementing, otherwise we
* could reach 0 spuriously during the first call.
*/
fixjobc(p, pgrp, 1);
fixjobc(p, p->p_pgrp, 0);
/* Interlock with ttread(). */
mutex_spin_enter(&tty_lock);
/* Move process to requested group. */
LIST_REMOVE(p, p_pglist);
if (LIST_EMPTY(&p->p_pgrp->pg_members))
/* defer delete until we've dumped the lock */
pg_id = p->p_pgrp->pg_id;
p->p_pgrp = pgrp;
LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
/* Done with the swap; we can release the tty mutex. */
mutex_spin_exit(&tty_lock);
done:
if (pg_id != NO_PGID) {
/* Releases proc_lock. */
pg_delete(pg_id);
} else {
mutex_exit(&proc_lock);
}
if (sess != NULL)
kmem_free(sess, sizeof(*sess));
if (new_pgrp != NULL)
kmem_free(new_pgrp, sizeof(*new_pgrp));
#ifdef DEBUG_PGRP
if (__predict_false(rval))
printf("enterpgrp(%d,%d,%d), curproc %d, rval %d\n",
pid, pgid, mksess, curp->p_pid, rval);
#endif
return rval;
}
/*
* proc_leavepgrp: remove a process from its process group.
* => must be called with the proc_lock held, which will be released;
*/
void
proc_leavepgrp(struct proc *p)
{
struct pgrp *pgrp;
KASSERT(mutex_owned(&proc_lock));
/* Interlock with ttread() */
mutex_spin_enter(&tty_lock);
pgrp = p->p_pgrp;
LIST_REMOVE(p, p_pglist);
p->p_pgrp = NULL;
mutex_spin_exit(&tty_lock);
if (LIST_EMPTY(&pgrp->pg_members)) {
/* Releases proc_lock. */
pg_delete(pgrp->pg_id);
} else {
mutex_exit(&proc_lock);
}
}
/*
* pg_remove: remove a process group from the table.
* => must be called with the proc_lock held;
* => returns process group to free;
*/
static struct pgrp *
pg_remove(pid_t pg_id)
{
struct pgrp *pgrp;
struct pid_table *pt;
KASSERT(mutex_owned(&proc_lock));
pt = &pid_table[pg_id & pid_tbl_mask];
pgrp = pt->pt_pgrp;
KASSERT(pgrp != NULL);
KASSERT(pgrp->pg_id == pg_id);
KASSERT(LIST_EMPTY(&pgrp->pg_members));
pt->pt_pgrp = NULL;
if (!PT_VALID(pt->pt_slot)) {
/* Orphaned pgrp, put slot onto free list. */
KASSERT((PT_NEXT(pt->pt_slot) & pid_tbl_mask) == 0);
pg_id &= pid_tbl_mask;
pt = &pid_table[last_free_pt];
pt->pt_slot = PT_SET_FREE(PT_NEXT(pt->pt_slot) | pg_id);
KASSERT(pt->pt_pid == 0);
last_free_pt = pg_id;
pid_alloc_cnt--;
}
return pgrp;
}
/*
* pg_delete: delete and free a process group.
* => must be called with the proc_lock held, which will be released.
*/
static void
pg_delete(pid_t pg_id)
{
struct pgrp *pg;
struct tty *ttyp;
struct session *ss;
KASSERT(mutex_owned(&proc_lock));
pg = pid_table[pg_id & pid_tbl_mask].pt_pgrp;
if (pg == NULL || pg->pg_id != pg_id || !LIST_EMPTY(&pg->pg_members)) {
mutex_exit(&proc_lock);
return;
}
ss = pg->pg_session;
/* Remove reference (if any) from tty to this process group */
mutex_spin_enter(&tty_lock);
ttyp = ss->s_ttyp;
if (ttyp != NULL && ttyp->t_pgrp == pg) {
ttyp->t_pgrp = NULL;
KASSERT(ttyp->t_session == ss);
}
mutex_spin_exit(&tty_lock);
/*
* The leading process group in a session is freed by proc_sessrele(),
* if last reference. It will also release the locks.
*/
pg = (ss->s_sid != pg->pg_id) ? pg_remove(pg_id) : NULL;
proc_sessrele(ss);
if (pg != NULL) {
/* Free it, if was not done above. */
kmem_free(pg, sizeof(struct pgrp));
}
}
/*
* Adjust pgrp jobc counters when specified process changes process group.
* We count the number of processes in each process group that "qualify"
* the group for terminal job control (those with a parent in a different
* process group of the same session). If that count reaches zero, the
* process group becomes orphaned. Check both the specified process'
* process group and that of its children.
* entering == 0 => p is leaving specified group.
* entering == 1 => p is entering specified group.
*
* Call with proc_lock held.
*/
void
fixjobc(struct proc *p, struct pgrp *pgrp, int entering)
{
struct pgrp *hispgrp;
struct session *mysession = pgrp->pg_session;
struct proc *child;
KASSERT(mutex_owned(&proc_lock));
/*
* Check p's parent to see whether p qualifies its own process
* group; if so, adjust count for p's process group.
*/
hispgrp = p->p_pptr->p_pgrp;
if (hispgrp != pgrp && hispgrp->pg_session == mysession) {
if (entering) {
pgrp->pg_jobc++;
p->p_lflag &= ~PL_ORPHANPG;
} else {
/* KASSERT(pgrp->pg_jobc > 0); */
if (--pgrp->pg_jobc == 0)
orphanpg(pgrp);
}
}
/*
* Check this process' children to see whether they qualify
* their process groups; if so, adjust counts for children's
* process groups.
*/
LIST_FOREACH(child, &p->p_children, p_sibling) {
hispgrp = child->p_pgrp;
if (hispgrp != pgrp && hispgrp->pg_session == mysession && !P_ZOMBIE(child)) {
if (entering) {
child->p_lflag &= ~PL_ORPHANPG;
hispgrp->pg_jobc++;
} else {
KASSERT(hispgrp->pg_jobc > 0);
if (--hispgrp->pg_jobc == 0)
orphanpg(hispgrp);
}
}
}
}
/*
* A process group has become orphaned;
* if there are any stopped processes in the group,
* hang up all processes in that group.
*
* Call with proc_lock held.
*/
static void
orphanpg(struct pgrp *pg)
{
struct proc *p;
KASSERT(mutex_owned(&proc_lock));
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
if (p->p_stat == SSTOP) {
p->p_lflag |= PL_ORPHANPG;
psignal(p, SIGHUP);
psignal(p, SIGCONT);
}
}
}
#ifdef DDB
#include <ddb/db_output.h>
void pidtbl_dump(void);
void
pidtbl_dump(void)
{
struct pid_table *pt;
struct proc *p;
struct pgrp *pgrp;
uintptr_t slot;
int id;
db_printf("pid table %p size %x, next %x, last %x\n",
pid_table, pid_tbl_mask+1,
next_free_pt, last_free_pt);
for (pt = pid_table, id = 0; id <= pid_tbl_mask; id++, pt++) {
slot = pt->pt_slot;
if (!PT_VALID(slot) && !pt->pt_pgrp)
continue;
if (PT_IS_LWP(slot)) {
p = PT_GET_LWP(slot)->l_proc;
} else if (PT_IS_PROC(slot)) {
p = PT_GET_PROC(slot);
} else {
p = NULL;
}
db_printf(" id %x: ", id);
if (p != NULL)
db_printf("slotpid %d proc %p id %d (0x%x) %s\n",
pt->pt_pid, p, p->p_pid, p->p_pid, p->p_comm);
else
db_printf("next %x use %x\n",
PT_NEXT(slot) & pid_tbl_mask,
PT_NEXT(slot) & ~pid_tbl_mask);
if ((pgrp = pt->pt_pgrp)) {
db_printf("\tsession %p, sid %d, count %d, login %s\n",
pgrp->pg_session, pgrp->pg_session->s_sid,
pgrp->pg_session->s_count,
pgrp->pg_session->s_login);
db_printf("\tpgrp %p, pg_id %d, pg_jobc %d, members %p\n",
pgrp, pgrp->pg_id, pgrp->pg_jobc,
LIST_FIRST(&pgrp->pg_members));
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
db_printf("\t\tpid %d addr %p pgrp %p %s\n",
p->p_pid, p, p->p_pgrp, p->p_comm);
}
}
}
}
#endif /* DDB */
#ifdef KSTACK_CHECK_MAGIC
#define KSTACK_MAGIC 0xdeadbeaf
/* XXX should be per process basis? */
static int kstackleftmin = KSTACK_SIZE;
static int kstackleftthres = KSTACK_SIZE / 8;
void
kstack_setup_magic(const struct lwp *l)
{
uint32_t *ip;
uint32_t const *end;
KASSERT(l != NULL);
KASSERT(l != &lwp0);
/*
* Fill the whole stack with the magic number so that any later
* modification of it can be detected.
*/
ip = (uint32_t *)KSTACK_LOWEST_ADDR(l);
end = (uint32_t *)((char *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE);
for (; ip < end; ip++) {
*ip = KSTACK_MAGIC;
}
}
void
kstack_check_magic(const struct lwp *l)
{
uint32_t const *ip, *end;
int stackleft;
KASSERT(l != NULL);
/* don't check proc0 */ /*XXX*/
if (l == &lwp0)
return;
#ifdef __MACHINE_STACK_GROWS_UP
/* stack grows upwards (eg. hppa) */
ip = (uint32_t *)((void *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE);
end = (uint32_t *)KSTACK_LOWEST_ADDR(l);
for (ip--; ip >= end; ip--)
if (*ip != KSTACK_MAGIC)
break;
stackleft = (void *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE - (void *)ip;
#else /* __MACHINE_STACK_GROWS_UP */
/* stack grows downwards (eg. i386) */
ip = (uint32_t *)KSTACK_LOWEST_ADDR(l);
end = (uint32_t *)((char *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE);
for (; ip < end; ip++)
if (*ip != KSTACK_MAGIC)
break;
stackleft = ((const char *)ip) - (const char *)KSTACK_LOWEST_ADDR(l);
#endif /* __MACHINE_STACK_GROWS_UP */
if (kstackleftmin > stackleft) {
kstackleftmin = stackleft;
if (stackleft < kstackleftthres)
printf("warning: kernel stack left %d bytes"
"(pid %u:lid %u)\n", stackleft,
(u_int)l->l_proc->p_pid, (u_int)l->l_lid);
}
if (stackleft <= 0) {
panic("magic on the top of kernel stack changed for "
"pid %u, lid %u: maybe kernel stack overflow",
(u_int)l->l_proc->p_pid, (u_int)l->l_lid);
}
}
#endif /* KSTACK_CHECK_MAGIC */
int
proclist_foreach_call(struct proclist *list,
int (*callback)(struct proc *, void *arg), void *arg)
{
struct proc marker;
struct proc *p;
int ret = 0;
marker.p_flag = PK_MARKER;
mutex_enter(&proc_lock);
for (p = LIST_FIRST(list); ret == 0 && p != NULL;) {
if (p->p_flag & PK_MARKER) {
p = LIST_NEXT(p, p_list);
continue;
}
LIST_INSERT_AFTER(p, &marker, p_list);
ret = (*callback)(p, arg);
KASSERT(mutex_owned(&proc_lock));
p = LIST_NEXT(&marker, p_list);
LIST_REMOVE(&marker, p_list);
}
mutex_exit(&proc_lock);
return ret;
}
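/*
 * Illustrative sketch (not part of the original source): a caller of
 * proclist_foreach_call().  The callback runs with proc_lock held and
 * a non-zero return value stops the walk early.
 */
#if 0
static int
example_count_cb(struct proc *p, void *arg)
{
	int *countp = arg;

	(*countp)++;
	return 0;			/* keep iterating */
}

static int
example_count_procs(void)
{
	int count = 0;

	(void)proclist_foreach_call(&allproc, example_count_cb, &count);
	return count;
}
#endif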
int
proc_vmspace_getref(struct proc *p, struct vmspace **vm)
{
/* XXXCDC: how should locking work here? */
/* curproc exception is for coredump. */
if ((p != curproc && (p->p_sflag & PS_WEXIT) != 0) ||
(p->p_vmspace->vm_refcnt < 1)) {
return EFAULT;
}
uvmspace_addref(p->p_vmspace);
*vm = p->p_vmspace;
return 0;
}
/*
* Acquire a write lock on the process credential.
*/
void
proc_crmod_enter(void)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
kauth_cred_t oc;
/* Reset what needs to be reset in plimit. */
if (p->p_limit->pl_corename != defcorename) {
lim_setcorename(p, defcorename, 0);
}
mutex_enter(p->p_lock);
/* Ensure the LWP cached credentials are up to date. */
if ((oc = l->l_cred) != p->p_cred) {
l->l_cred = kauth_cred_hold(p->p_cred);
kauth_cred_free(oc);
}
}
/*
* Set in a new process credential, and drop the write lock. The credential
* must have a reference already. Optionally, free a no-longer required
* credential.
*/
void
proc_crmod_leave(kauth_cred_t scred, kauth_cred_t fcred, bool sugid)
{
struct lwp *l = curlwp, *l2;
struct proc *p = l->l_proc;
kauth_cred_t oc;
KASSERT(mutex_owned(p->p_lock));
/* Is there a new credential to set in? */
if (scred != NULL) {
p->p_cred = scred;
LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
if (l2 != l) {
lwp_lock(l2);
l2->l_flag |= LW_CACHECRED;
lwp_need_userret(l2);
lwp_unlock(l2);
}
}
/* Ensure the LWP cached credentials are up to date. */
if ((oc = l->l_cred) != scred) {
l->l_cred = kauth_cred_hold(scred);
}
} else
oc = NULL; /* XXXgcc */
if (sugid) {
/*
* Mark process as having changed credentials, stops
* tracing etc.
*/
p->p_flag |= PK_SUGID;
}
mutex_exit(p->p_lock);
/* If there is a credential to be released, free it now. */
if (fcred != NULL) {
KASSERT(scred != NULL);
kauth_cred_free(fcred);
if (oc != scred)
kauth_cred_free(oc);
}
}
/*
* proc_specific_key_create --
* Create a key for subsystem proc-specific data.
*/
int
proc_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
{
return (specificdata_key_create(proc_specificdata_domain, keyp, dtor));
}
/*
* proc_specific_key_delete --
* Delete a key for subsystem proc-specific data.
*/
void
proc_specific_key_delete(specificdata_key_t key)
{
specificdata_key_delete(proc_specificdata_domain, key);
}
/*
* proc_initspecific --
* Initialize a proc's specificdata container.
*/
void
proc_initspecific(struct proc *p)
{
int error __diagused;
error = specificdata_init(proc_specificdata_domain, &p->p_specdataref);
KASSERT(error == 0);
}
/*
* proc_finispecific --
* Finalize a proc's specificdata container.
*/
void
proc_finispecific(struct proc *p)
{
specificdata_fini(proc_specificdata_domain, &p->p_specdataref);
}
/*
* proc_getspecific --
* Return proc-specific data corresponding to the specified key.
*/
void *
proc_getspecific(struct proc *p, specificdata_key_t key)
{
return (specificdata_getspecific(proc_specificdata_domain,
&p->p_specdataref, key));
}
/*
* proc_setspecific --
* Set proc-specific data corresponding to the specified key.
*/
void
proc_setspecific(struct proc *p, specificdata_key_t key, void *data)
{
specificdata_setspecific(proc_specificdata_domain,
&p->p_specdataref, key, data);
}
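/*
 * Illustrative sketch (not part of the original source): a subsystem
 * attaching private per-process data with the key routines above.
 * The key, destructor and data layout are made up for the example.
 */
#if 0
static specificdata_key_t example_key;

static void
example_dtor(void *data)
{
	kmem_free(data, sizeof(int));
}

static void
example_attach(struct proc *p)
{
	int *cookie;

	if (proc_specific_key_create(&example_key, example_dtor) != 0)
		return;

	cookie = kmem_alloc(sizeof(*cookie), KM_SLEEP);
	*cookie = 42;
	proc_setspecific(p, example_key, cookie);
	KASSERT(proc_getspecific(p, example_key) == cookie);
}
#endif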
int
proc_uidmatch(kauth_cred_t cred, kauth_cred_t target)
{
int r = 0;
if (kauth_cred_getuid(cred) != kauth_cred_getuid(target) ||
kauth_cred_getuid(cred) != kauth_cred_getsvuid(target)) {
/*
* suid proc of ours or proc not ours
*/
r = EPERM;
} else if (kauth_cred_getgid(target) != kauth_cred_getsvgid(target)) {
/*
* sgid proc has sgid back to us temporarily
*/
r = EPERM;
} else {
/*
* our rgid must be in target's group list (ie,
* sub-processes started by a sgid process)
*/
int ismember = 0;
if (kauth_cred_ismember_gid(cred,
kauth_cred_getgid(target), &ismember) != 0 ||
!ismember)
r = EPERM;
}
return (r);
}
/*
* sysctl stuff
*/
#define KERN_PROCSLOP (5 * sizeof(struct kinfo_proc))
static const u_int sysctl_flagmap[] = {
PK_ADVLOCK, P_ADVLOCK,
PK_EXEC, P_EXEC,
PK_NOCLDWAIT, P_NOCLDWAIT,
PK_32, P_32,
PK_CLDSIGIGN, P_CLDSIGIGN,
PK_SUGID, P_SUGID,
0
};
static const u_int sysctl_sflagmap[] = {
PS_NOCLDSTOP, P_NOCLDSTOP,
PS_WEXIT, P_WEXIT,
PS_STOPFORK, P_STOPFORK,
PS_STOPEXEC, P_STOPEXEC,
PS_STOPEXIT, P_STOPEXIT,
0
};
static const u_int sysctl_slflagmap[] = {
PSL_TRACED, P_TRACED,
PSL_CHTRACED, P_CHTRACED,
PSL_SYSCALL, P_SYSCALL,
0
};
static const u_int sysctl_lflagmap[] = {
PL_CONTROLT, P_CONTROLT,
PL_PPWAIT, P_PPWAIT,
0
};
static const u_int sysctl_stflagmap[] = {
PST_PROFIL, P_PROFIL,
0
};
/* used by kern_lwp also */
const u_int sysctl_lwpflagmap[] = {
LW_SINTR, L_SINTR,
LW_SYSTEM, L_SYSTEM,
0
};
/*
* Find the most ``active'' lwp of a process and return it for ps display
* purposes
*/
static struct lwp *
proc_active_lwp(struct proc *p)
{
static const int ostat[] = {
0,
2, /* LSIDL */
6, /* LSRUN */
5, /* LSSLEEP */
4, /* LSSTOP */
0, /* LSZOMB */
1, /* LSDEAD */
7, /* LSONPROC */
3 /* LSSUSPENDED */
};
struct lwp *l, *lp = NULL;
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
KASSERT(l->l_stat >= 0);
KASSERT(l->l_stat < __arraycount(ostat));
if (lp == NULL ||
ostat[l->l_stat] > ostat[lp->l_stat] ||
(ostat[l->l_stat] == ostat[lp->l_stat] &&
l->l_cpticks > lp->l_cpticks)) {
lp = l;
continue;
}
}
return lp;
}
static int
sysctl_doeproc(SYSCTLFN_ARGS)
{
union {
struct kinfo_proc kproc;
struct kinfo_proc2 kproc2;
} *kbuf;
struct proc *p, *next, *marker;
char *where, *dp;
int type, op, arg, error;
u_int elem_size, kelem_size, elem_count;
size_t buflen, needed;
bool match, zombie, mmmbrains;
const bool allowaddr = get_expose_address(curproc);
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
dp = where = oldp;
buflen = where != NULL ? *oldlenp : 0;
error = 0;
needed = 0;
type = rnode->sysctl_num;
if (type == KERN_PROC) {
if (namelen == 0)
return EINVAL;
switch (op = name[0]) {
case KERN_PROC_ALL:
if (namelen != 1)
return EINVAL;
arg = 0;
break;
default:
if (namelen != 2)
return EINVAL;
arg = name[1];
break;
}
elem_count = 0; /* Hush little compiler, don't you cry */
kelem_size = elem_size = sizeof(kbuf->kproc);
} else {
if (namelen != 4)
return EINVAL;
op = name[0];
arg = name[1];
elem_size = name[2];
elem_count = name[3];
kelem_size = sizeof(kbuf->kproc2);
}
sysctl_unlock();
kbuf = kmem_zalloc(sizeof(*kbuf), KM_SLEEP);
marker = kmem_alloc(sizeof(*marker), KM_SLEEP);
marker->p_flag = PK_MARKER;
mutex_enter(&proc_lock);
/*
* Start with zombies to prevent reporting processes twice, in case they
* are dying and being moved from the list of alive processes to zombies.
*/
mmmbrains = true;
for (p = LIST_FIRST(&zombproc);; p = next) {
if (p == NULL) {
if (mmmbrains) {
p = LIST_FIRST(&allproc);
mmmbrains = false;
}
if (p == NULL)
break;
}
next = LIST_NEXT(p, p_list);
if ((p->p_flag & PK_MARKER) != 0)
continue;
/*
* Skip embryonic processes.
*/
if (p->p_stat == SIDL)
continue;
mutex_enter(p->p_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_EPROC), NULL, NULL);
if (error != 0) {
mutex_exit(p->p_lock);
continue;
}
/*
* Handling all the operations in one switch, at the cost of some
* algorithmic complexity, is deliberate.  The win from splitting this
* function into several near-identical copies would be negligible on
* practical systems, while the maintenance burden and code growth
* would not.
*/
switch (op) {
case KERN_PROC_PID:
match = (p->p_pid == (pid_t)arg);
break;
case KERN_PROC_PGRP:
match = (p->p_pgrp->pg_id == (pid_t)arg);
break;
case KERN_PROC_SESSION:
match = (p->p_session->s_sid == (pid_t)arg);
break;
case KERN_PROC_TTY:
match = true;
if (arg == (int) KERN_PROC_TTY_REVOKE) {
if ((p->p_lflag & PL_CONTROLT) == 0 ||
p->p_session->s_ttyp == NULL ||
p->p_session->s_ttyvp != NULL) {
match = false;
}
} else if ((p->p_lflag & PL_CONTROLT) == 0 ||
p->p_session->s_ttyp == NULL) {
if ((dev_t)arg != KERN_PROC_TTY_NODEV) {
match = false;
}
} else if (p->p_session->s_ttyp->t_dev != (dev_t)arg) {
match = false;
}
break;
case KERN_PROC_UID:
match = (kauth_cred_geteuid(p->p_cred) == (uid_t)arg);
break;
case KERN_PROC_RUID:
match = (kauth_cred_getuid(p->p_cred) == (uid_t)arg);
break;
case KERN_PROC_GID:
match = (kauth_cred_getegid(p->p_cred) == (uid_t)arg);
break;
case KERN_PROC_RGID:
match = (kauth_cred_getgid(p->p_cred) == (uid_t)arg);
break;
case KERN_PROC_ALL:
match = true;
/* allow everything */
break;
default:
error = EINVAL;
mutex_exit(p->p_lock);
goto cleanup;
}
if (!match) {
mutex_exit(p->p_lock);
continue;
}
/*
* Grab a hold on the process.
*/
if (mmmbrains) {
zombie = true;
} else {
zombie = !rw_tryenter(&p->p_reflock, RW_READER);
}
if (zombie) {
LIST_INSERT_AFTER(p, marker, p_list);
}
if (buflen >= elem_size &&
(type == KERN_PROC || elem_count > 0)) {
ruspace(p); /* Update process vm resource use */
if (type == KERN_PROC) {
fill_proc(p, &kbuf->kproc.kp_proc, allowaddr);
fill_eproc(p, &kbuf->kproc.kp_eproc, zombie,
allowaddr);
} else {
fill_kproc2(p, &kbuf->kproc2, zombie,
allowaddr);
elem_count--;
}
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
/*
* Copy out elem_size, but not larger than kelem_size
*/
error = sysctl_copyout(l, kbuf, dp,
uimin(kelem_size, elem_size));
mutex_enter(&proc_lock);
if (error) {
goto bah;
}
dp += elem_size;
buflen -= elem_size;
} else {
mutex_exit(p->p_lock);
}
needed += elem_size;
/*
* Release reference to process.
*/
if (zombie) {
next = LIST_NEXT(marker, p_list);
LIST_REMOVE(marker, p_list);
} else {
rw_exit(&p->p_reflock);
next = LIST_NEXT(p, p_list);
}
/*
* Short-circuit break quickly!
*/
if (op == KERN_PROC_PID)
break;
}
mutex_exit(&proc_lock);
if (where != NULL) {
*oldlenp = dp - where;
if (needed > *oldlenp) {
error = ENOMEM;
goto out;
}
} else {
needed += KERN_PROCSLOP;
*oldlenp = needed;
}
kmem_free(kbuf, sizeof(*kbuf));
kmem_free(marker, sizeof(*marker));
sysctl_relock();
return 0;
bah:
if (zombie)
LIST_REMOVE(marker, p_list);
else
rw_exit(&p->p_reflock);
cleanup:
mutex_exit(&proc_lock);
out:
kmem_free(kbuf, sizeof(*kbuf));
kmem_free(marker, sizeof(*marker));
sysctl_relock();
return error;
}
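/*
 * Userland usage sketch (hedged, illustrative only): the kinfo_proc2 variant
 * handled above is driven by a six-integer name vector, with the caller
 * supplying elem_size and elem_count.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kinfo_proc2 kp;
	size_t len = sizeof(kp);
	int mib[6] = {
		CTL_KERN, KERN_PROC2, KERN_PROC_PID, (int)getpid(),
		(int)sizeof(kp), 1
	};

	/* Ask for this process' own kinfo_proc2 record. */
	if (sysctl(mib, 6, &kp, &len, NULL, 0) == -1)
		return 1;
	printf("pid %d comm %s\n", kp.p_pid, kp.p_comm);
	return 0;
}
#endif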
int
copyin_psstrings(struct proc *p, struct ps_strings *arginfo)
{
#if !defined(_RUMPKERNEL)
int retval;
if (p->p_flag & PK_32) {
MODULE_HOOK_CALL(kern_proc32_copyin_hook, (p, arginfo),
enosys(), retval);
return retval;
}
#endif /* !defined(_RUMPKERNEL) */
return copyin_proc(p, (void *)p->p_psstrp, arginfo, sizeof(*arginfo));
}
static int
copy_procargs_sysctl_cb(void *cookie_, const void *src, size_t off, size_t len)
{
void **cookie = cookie_;
struct lwp *l = cookie[0];
char *dst = cookie[1];
return sysctl_copyout(l, src, dst + off, len);
}
/*
* sysctl helper routine for kern.proc_args pseudo-subtree.
*/
static int
sysctl_kern_proc_args(SYSCTLFN_ARGS)
{
struct ps_strings pss;
struct proc *p;
pid_t pid;
int type, error;
void *cookie[2];
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
if (newp != NULL || namelen != 2)
return (EINVAL);
pid = name[0];
type = name[1];
switch (type) {
case KERN_PROC_PATHNAME:
sysctl_unlock();
error = fill_pathname(l, pid, oldp, oldlenp);
sysctl_relock();
return error;
case KERN_PROC_CWD:
sysctl_unlock();
error = fill_cwd(l, pid, oldp, oldlenp);
sysctl_relock();
return error;
case KERN_PROC_ARGV:
case KERN_PROC_NARGV:
case KERN_PROC_ENV:
case KERN_PROC_NENV:
/* ok */
break;
default:
return (EINVAL);
}
sysctl_unlock();
/* check pid */
mutex_enter(&proc_lock);
if ((p = proc_find(pid)) == NULL) {
error = EINVAL;
goto out_locked;
}
mutex_enter(p->p_lock);
/* Check permission. */
if (type == KERN_PROC_ARGV || type == KERN_PROC_NARGV)
error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE,
p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ARGS), NULL, NULL);
else if (type == KERN_PROC_ENV || type == KERN_PROC_NENV)
error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE,
p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENV), NULL, NULL);
else
error = EINVAL; /* XXXGCC */
if (error) {
mutex_exit(p->p_lock);
goto out_locked;
}
if (oldp == NULL) {
if (type == KERN_PROC_NARGV || type == KERN_PROC_NENV)
*oldlenp = sizeof (int);
else
*oldlenp = ARG_MAX; /* XXX XXX XXX */
error = 0;
mutex_exit(p->p_lock);
goto out_locked;
}
/*
* Zombies don't have a stack, so we can't read their psstrings.
* System processes also don't have a user stack.
*/
if (P_ZOMBIE(p) || (p->p_flag & PK_SYSTEM) != 0) {
error = EINVAL;
mutex_exit(p->p_lock);
goto out_locked;
}
error = rw_tryenter(&p->p_reflock, RW_READER) ? 0 : EBUSY;
mutex_exit(p->p_lock);
if (error) {
goto out_locked;
}
mutex_exit(&proc_lock);
if (type == KERN_PROC_NARGV || type == KERN_PROC_NENV) {
int value;
if ((error = copyin_psstrings(p, &pss)) == 0) {
if (type == KERN_PROC_NARGV)
value = pss.ps_nargvstr;
else
value = pss.ps_nenvstr;
error = sysctl_copyout(l, &value, oldp, sizeof(value));
*oldlenp = sizeof(value);
}
} else {
cookie[0] = l;
cookie[1] = oldp;
error = copy_procargs(p, type, oldlenp,
copy_procargs_sysctl_cb, cookie);
}
rw_exit(&p->p_reflock);
sysctl_relock();
return error;
out_locked:
mutex_exit(&proc_lock);
sysctl_relock();
return error;
}
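/*
 * Userland usage sketch (hedged, illustrative only): kern.proc_args takes a
 * pid and a request type as the two name components checked above, e.g. the
 * argument count of the calling process:
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int mib[4] = { CTL_KERN, KERN_PROC_ARGS, (int)getpid(),
	    KERN_PROC_NARGV };
	int nargv;
	size_t len = sizeof(nargv);

	/* KERN_PROC_NARGV returns a single int, per the code above. */
	if (sysctl(mib, 4, &nargv, &len, NULL, 0) == -1)
		return 1;
	printf("argc = %d\n", nargv);
	return 0;
}
#endif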
int
copy_procargs(struct proc *p, int oid, size_t *limit,
int (*cb)(void *, const void *, size_t, size_t), void *cookie)
{
struct ps_strings pss;
size_t len, i, loaded, entry_len;
struct uio auio;
struct iovec aiov;
int error, argvlen;
char *arg;
char **argv;
vaddr_t user_argv;
struct vmspace *vmspace;
/*
* Allocate a temporary buffer to hold the argument vector and
* the arguments themselves.
*/
arg = kmem_alloc(PAGE_SIZE, KM_SLEEP);
argv = kmem_alloc(PAGE_SIZE, KM_SLEEP);
/*
* Lock the process down in memory.
*/
vmspace = p->p_vmspace;
uvmspace_addref(vmspace);
/*
* Read in the ps_strings structure.
*/
if ((error = copyin_psstrings(p, &pss)) != 0)
goto done;
/*
* Now read the address of the argument vector.
*/
switch (oid) {
case KERN_PROC_ARGV:
user_argv = (uintptr_t)pss.ps_argvstr;
argvlen = pss.ps_nargvstr;
break;
case KERN_PROC_ENV:
user_argv = (uintptr_t)pss.ps_envstr;
argvlen = pss.ps_nenvstr;
break;
default:
error = EINVAL;
goto done;
}
if (argvlen < 0) {
error = EIO;
goto done;
}
/*
* Now copy each string.
*/
len = 0; /* bytes written to user buffer */
loaded = 0; /* bytes from argv already processed */
i = 0; /* To make compiler happy */
entry_len = PROC_PTRSZ(p);
for (; argvlen; --argvlen) {
int finished = 0;
vaddr_t base;
size_t xlen;
int j;
if (loaded == 0) {
size_t rem = entry_len * argvlen;
loaded = MIN(rem, PAGE_SIZE);
error = copyin_vmspace(vmspace,
(const void *)user_argv, argv, loaded);
if (error)
break;
user_argv += loaded;
i = 0;
}
#if !defined(_RUMPKERNEL)
if (p->p_flag & PK_32)
MODULE_HOOK_CALL(kern_proc32_base_hook,
(argv, i++), 0, base);
else
#endif /* !defined(_RUMPKERNEL) */
base = (vaddr_t)argv[i++];
loaded -= entry_len;
/*
* The program has messed around with its arguments,
* possibly deleting some, and replacing them with
* NULL's. Treat this as the last argument and not
* a failure.
*/
if (base == 0)
break;
while (!finished) {
xlen = PAGE_SIZE - (base & PAGE_MASK);
aiov.iov_base = arg;
aiov.iov_len = PAGE_SIZE;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = base;
auio.uio_resid = xlen;
auio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&auio);
error = uvm_io(&vmspace->vm_map, &auio, 0);
if (error)
goto done;
/* Look for the end of the string */
for (j = 0; j < xlen; j++) {
if (arg[j] == '\0') {
xlen = j + 1;
finished = 1;
break;
}
}
/* Check for user buffer overflow */
if (len + xlen > *limit) {
finished = 1;
if (len > *limit)
xlen = 0;
else
xlen = *limit - len;
}
/* Copyout the page */
error = (*cb)(cookie, arg, len, xlen);
if (error)
goto done;
len += xlen;
base += xlen;
}
}
*limit = len;
done:
kmem_free(argv, PAGE_SIZE);
kmem_free(arg, PAGE_SIZE);
uvmspace_free(vmspace);
return error;
}
/*
* Fill in a proc structure for the specified process.
*/
static void
fill_proc(const struct proc *psrc, struct proc *p, bool allowaddr)
{
COND_SET_STRUCT(p->p_list, psrc->p_list, allowaddr);
memset(&p->p_auxlock, 0, sizeof(p->p_auxlock));
COND_SET_STRUCT(p->p_lock, psrc->p_lock, allowaddr);
memset(&p->p_stmutex, 0, sizeof(p->p_stmutex));
memset(&p->p_reflock, 0, sizeof(p->p_reflock));
COND_SET_STRUCT(p->p_waitcv, psrc->p_waitcv, allowaddr);
COND_SET_STRUCT(p->p_lwpcv, psrc->p_lwpcv, allowaddr);
COND_SET_PTR(p->p_cred, psrc->p_cred, allowaddr);
COND_SET_PTR(p->p_fd, psrc->p_fd, allowaddr);
COND_SET_PTR(p->p_cwdi, psrc->p_cwdi, allowaddr);
COND_SET_PTR(p->p_stats, psrc->p_stats, allowaddr);
COND_SET_PTR(p->p_limit, psrc->p_limit, allowaddr);
COND_SET_PTR(p->p_vmspace, psrc->p_vmspace, allowaddr);
COND_SET_PTR(p->p_sigacts, psrc->p_sigacts, allowaddr);
COND_SET_PTR(p->p_aio, psrc->p_aio, allowaddr);
p->p_mqueue_cnt = psrc->p_mqueue_cnt;
memset(&p->p_specdataref, 0, sizeof(p->p_specdataref));
p->p_exitsig = psrc->p_exitsig;
p->p_flag = psrc->p_flag;
p->p_sflag = psrc->p_sflag;
p->p_slflag = psrc->p_slflag;
p->p_lflag = psrc->p_lflag;
p->p_stflag = psrc->p_stflag;
p->p_stat = psrc->p_stat;
p->p_trace_enabled = psrc->p_trace_enabled;
p->p_pid = psrc->p_pid;
COND_SET_STRUCT(p->p_pglist, psrc->p_pglist, allowaddr);
COND_SET_PTR(p->p_pptr, psrc->p_pptr, allowaddr);
COND_SET_STRUCT(p->p_sibling, psrc->p_sibling, allowaddr);
COND_SET_STRUCT(p->p_children, psrc->p_children, allowaddr);
COND_SET_STRUCT(p->p_lwps, psrc->p_lwps, allowaddr);
COND_SET_PTR(p->p_raslist, psrc->p_raslist, allowaddr);
p->p_nlwps = psrc->p_nlwps;
p->p_nzlwps = psrc->p_nzlwps;
p->p_nrlwps = psrc->p_nrlwps;
p->p_nlwpwait = psrc->p_nlwpwait;
p->p_ndlwps = psrc->p_ndlwps;
p->p_nstopchild = psrc->p_nstopchild;
p->p_waited = psrc->p_waited;
COND_SET_PTR(p->p_zomblwp, psrc->p_zomblwp, allowaddr);
COND_SET_PTR(p->p_vforklwp, psrc->p_vforklwp, allowaddr);
COND_SET_PTR(p->p_sched_info, psrc->p_sched_info, allowaddr);
p->p_estcpu = psrc->p_estcpu;
p->p_estcpu_inherited = psrc->p_estcpu_inherited;
p->p_forktime = psrc->p_forktime;
p->p_pctcpu = psrc->p_pctcpu;
COND_SET_PTR(p->p_opptr, psrc->p_opptr, allowaddr);
COND_SET_PTR(p->p_timers, psrc->p_timers, allowaddr);
p->p_rtime = psrc->p_rtime;
p->p_uticks = psrc->p_uticks;
p->p_sticks = psrc->p_sticks;
p->p_iticks = psrc->p_iticks;
p->p_xutime = psrc->p_xutime;
p->p_xstime = psrc->p_xstime;
p->p_traceflag = psrc->p_traceflag;
COND_SET_PTR(p->p_tracep, psrc->p_tracep, allowaddr);
COND_SET_PTR(p->p_textvp, psrc->p_textvp, allowaddr);
COND_SET_PTR(p->p_emul, psrc->p_emul, allowaddr);
COND_SET_PTR(p->p_emuldata, psrc->p_emuldata, allowaddr);
COND_SET_CPTR(p->p_execsw, psrc->p_execsw, allowaddr);
COND_SET_STRUCT(p->p_klist, psrc->p_klist, allowaddr);
COND_SET_STRUCT(p->p_sigwaiters, psrc->p_sigwaiters, allowaddr);
COND_SET_STRUCT(p->p_sigpend.sp_info, psrc->p_sigpend.sp_info,
allowaddr);
p->p_sigpend.sp_set = psrc->p_sigpend.sp_set;
COND_SET_PTR(p->p_lwpctl, psrc->p_lwpctl, allowaddr);
p->p_ppid = psrc->p_ppid;
p->p_oppid = psrc->p_oppid;
COND_SET_PTR(p->p_path, psrc->p_path, allowaddr);
p->p_sigctx = psrc->p_sigctx;
p->p_nice = psrc->p_nice;
memcpy(p->p_comm, psrc->p_comm, sizeof(p->p_comm));
COND_SET_PTR(p->p_pgrp, psrc->p_pgrp, allowaddr);
COND_SET_VALUE(p->p_psstrp, psrc->p_psstrp, allowaddr);
p->p_pax = psrc->p_pax;
p->p_xexit = psrc->p_xexit;
p->p_xsig = psrc->p_xsig;
p->p_acflag = psrc->p_acflag;
COND_SET_STRUCT(p->p_md, psrc->p_md, allowaddr);
p->p_stackbase = psrc->p_stackbase;
COND_SET_PTR(p->p_dtrace, psrc->p_dtrace, allowaddr);
}
/*
* Fill in an eproc structure for the specified process.
*/
void
fill_eproc(struct proc *p, struct eproc *ep, bool zombie, bool allowaddr)
{
struct tty *tp;
struct lwp *l;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
COND_SET_PTR(ep->e_paddr, p, allowaddr);
COND_SET_PTR(ep->e_sess, p->p_session, allowaddr);
if (p->p_cred) {
kauth_cred_topcred(p->p_cred, &ep->e_pcred);
kauth_cred_toucred(p->p_cred, &ep->e_ucred);
}
if (p->p_stat != SIDL && !P_ZOMBIE(p) && !zombie) {
struct vmspace *vm = p->p_vmspace;
ep->e_vm.vm_rssize = vm_resident_count(vm);
ep->e_vm.vm_tsize = vm->vm_tsize;
ep->e_vm.vm_dsize = vm->vm_dsize;
ep->e_vm.vm_ssize = vm->vm_ssize;
ep->e_vm.vm_map.size = vm->vm_map.size;
/* Pick the primary (first) LWP */
l = proc_active_lwp(p);
KASSERT(l != NULL);
lwp_lock(l);
if (l->l_wchan)
strncpy(ep->e_wmesg, l->l_wmesg, WMESGLEN);
lwp_unlock(l);
}
ep->e_ppid = p->p_ppid;
if (p->p_pgrp && p->p_session) {
ep->e_pgid = p->p_pgrp->pg_id;
ep->e_jobc = p->p_pgrp->pg_jobc;
ep->e_sid = p->p_session->s_sid;
if ((p->p_lflag & PL_CONTROLT) &&
(tp = p->p_session->s_ttyp)) {
ep->e_tdev = tp->t_dev;
ep->e_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
COND_SET_PTR(ep->e_tsess, tp->t_session, allowaddr);
} else
ep->e_tdev = (uint32_t)NODEV;
ep->e_flag = p->p_session->s_ttyvp ? EPROC_CTTY : 0;
if (SESS_LEADER(p))
ep->e_flag |= EPROC_SLEADER;
strncpy(ep->e_login, p->p_session->s_login, MAXLOGNAME);
}
ep->e_xsize = ep->e_xrssize = 0;
ep->e_xccount = ep->e_xswrss = 0;
}
/*
* Fill in a kinfo_proc2 structure for the specified process.
*/
void
fill_kproc2(struct proc *p, struct kinfo_proc2 *ki, bool zombie, bool allowaddr)
{
struct tty *tp;
struct lwp *l;
struct timeval ut, st, rt;
sigset_t ss1, ss2;
struct rusage ru;
struct vmspace *vm;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
sigemptyset(&ss1);
sigemptyset(&ss2);
COND_SET_VALUE(ki->p_paddr, PTRTOUINT64(p), allowaddr);
COND_SET_VALUE(ki->p_fd, PTRTOUINT64(p->p_fd), allowaddr);
COND_SET_VALUE(ki->p_cwdi, PTRTOUINT64(p->p_cwdi), allowaddr);
COND_SET_VALUE(ki->p_stats, PTRTOUINT64(p->p_stats), allowaddr);
COND_SET_VALUE(ki->p_limit, PTRTOUINT64(p->p_limit), allowaddr);
COND_SET_VALUE(ki->p_vmspace, PTRTOUINT64(p->p_vmspace), allowaddr);
COND_SET_VALUE(ki->p_sigacts, PTRTOUINT64(p->p_sigacts), allowaddr);
COND_SET_VALUE(ki->p_sess, PTRTOUINT64(p->p_session), allowaddr);
ki->p_tsess = 0; /* may be changed if controlling tty below */
COND_SET_VALUE(ki->p_ru, PTRTOUINT64(&p->p_stats->p_ru), allowaddr);
ki->p_eflag = 0;
ki->p_exitsig = p->p_exitsig;
ki->p_flag = L_INMEM; /* Process never swapped out */
ki->p_flag |= sysctl_map_flags(sysctl_flagmap, p->p_flag);
ki->p_flag |= sysctl_map_flags(sysctl_sflagmap, p->p_sflag);
ki->p_flag |= sysctl_map_flags(sysctl_slflagmap, p->p_slflag);
ki->p_flag |= sysctl_map_flags(sysctl_lflagmap, p->p_lflag);
ki->p_flag |= sysctl_map_flags(sysctl_stflagmap, p->p_stflag);
ki->p_pid = p->p_pid;
ki->p_ppid = p->p_ppid;
ki->p_uid = kauth_cred_geteuid(p->p_cred);
ki->p_ruid = kauth_cred_getuid(p->p_cred);
ki->p_gid = kauth_cred_getegid(p->p_cred);
ki->p_rgid = kauth_cred_getgid(p->p_cred);
ki->p_svuid = kauth_cred_getsvuid(p->p_cred);
ki->p_svgid = kauth_cred_getsvgid(p->p_cred);
ki->p_ngroups = kauth_cred_ngroups(p->p_cred);
kauth_cred_getgroups(p->p_cred, ki->p_groups,
uimin(ki->p_ngroups, sizeof(ki->p_groups) / sizeof(ki->p_groups[0])),
UIO_SYSSPACE);
ki->p_uticks = p->p_uticks;
ki->p_sticks = p->p_sticks;
ki->p_iticks = p->p_iticks;
ki->p_tpgid = NO_PGID; /* may be changed if controlling tty below */
COND_SET_VALUE(ki->p_tracep, PTRTOUINT64(p->p_tracep), allowaddr);
ki->p_traceflag = p->p_traceflag;
memcpy(&ki->p_sigignore, &p->p_sigctx.ps_sigignore,sizeof(ki_sigset_t));
memcpy(&ki->p_sigcatch, &p->p_sigctx.ps_sigcatch, sizeof(ki_sigset_t));
ki->p_cpticks = 0;
ki->p_pctcpu = p->p_pctcpu;
ki->p_estcpu = 0;
ki->p_stat = p->p_stat; /* Will likely be overridden by LWP status */
ki->p_realstat = p->p_stat;
ki->p_nice = p->p_nice;
ki->p_xstat = P_WAITSTATUS(p);
ki->p_acflag = p->p_acflag;
strncpy(ki->p_comm, p->p_comm,
uimin(sizeof(ki->p_comm), sizeof(p->p_comm)));
strncpy(ki->p_ename, p->p_emul->e_name, sizeof(ki->p_ename));
ki->p_nlwps = p->p_nlwps;
ki->p_realflag = ki->p_flag;
if (p->p_stat != SIDL && !P_ZOMBIE(p) && !zombie) {
vm = p->p_vmspace;
ki->p_vm_rssize = vm_resident_count(vm);
ki->p_vm_tsize = vm->vm_tsize;
ki->p_vm_dsize = vm->vm_dsize;
ki->p_vm_ssize = vm->vm_ssize;
ki->p_vm_vsize = atop(vm->vm_map.size);
/*
* Since the stack is initially mapped mostly with
* PROT_NONE and grown as needed, adjust the "mapped size"
* to skip the unused stack portion.
*/
ki->p_vm_msize =
atop(vm->vm_map.size) - vm->vm_issize + vm->vm_ssize;
/* Pick the primary (first) LWP */
l = proc_active_lwp(p);
KASSERT(l != NULL);
lwp_lock(l);
ki->p_nrlwps = p->p_nrlwps;
ki->p_forw = 0;
ki->p_back = 0;
COND_SET_VALUE(ki->p_addr, PTRTOUINT64(l->l_addr), allowaddr);
ki->p_stat = l->l_stat;
ki->p_flag |= sysctl_map_flags(sysctl_lwpflagmap, l->l_flag);
ki->p_swtime = l->l_swtime;
ki->p_slptime = l->l_slptime;
if (l->l_stat == LSONPROC)
ki->p_schedflags = l->l_cpu->ci_schedstate.spc_flags;
else
ki->p_schedflags = 0;
ki->p_priority = lwp_eprio(l);
ki->p_usrpri = l->l_priority;
if (l->l_wchan)
strncpy(ki->p_wmesg, l->l_wmesg, sizeof(ki->p_wmesg));
COND_SET_VALUE(ki->p_wchan, PTRTOUINT64(l->l_wchan), allowaddr);
ki->p_cpuid = cpu_index(l->l_cpu);
lwp_unlock(l);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
/* This is hardly correct, but... */
sigplusset(&l->l_sigpend.sp_set, &ss1);
sigplusset(&l->l_sigmask, &ss2);
ki->p_cpticks += l->l_cpticks;
ki->p_pctcpu += l->l_pctcpu;
ki->p_estcpu += l->l_estcpu;
}
}
sigplusset(&p->p_sigpend.sp_set, &ss1);
memcpy(&ki->p_siglist, &ss1, sizeof(ki_sigset_t));
memcpy(&ki->p_sigmask, &ss2, sizeof(ki_sigset_t));
if (p->p_session != NULL) {
ki->p_sid = p->p_session->s_sid;
ki->p__pgid = p->p_pgrp->pg_id;
if (p->p_session->s_ttyvp)
ki->p_eflag |= EPROC_CTTY;
if (SESS_LEADER(p))
ki->p_eflag |= EPROC_SLEADER;
strncpy(ki->p_login, p->p_session->s_login,
uimin(sizeof ki->p_login - 1, sizeof p->p_session->s_login));
ki->p_jobc = p->p_pgrp->pg_jobc;
if ((p->p_lflag & PL_CONTROLT) && (tp = p->p_session->s_ttyp)) {
ki->p_tdev = tp->t_dev;
ki->p_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
COND_SET_VALUE(ki->p_tsess, PTRTOUINT64(tp->t_session),
allowaddr);
} else {
ki->p_tdev = (int32_t)NODEV;
}
}
if (!P_ZOMBIE(p) && !zombie) {
ki->p_uvalid = 1;
ki->p_ustart_sec = p->p_stats->p_start.tv_sec;
ki->p_ustart_usec = p->p_stats->p_start.tv_usec;
calcru(p, &ut, &st, NULL, &rt);
ki->p_rtime_sec = rt.tv_sec;
ki->p_rtime_usec = rt.tv_usec;
ki->p_uutime_sec = ut.tv_sec;
ki->p_uutime_usec = ut.tv_usec;
ki->p_ustime_sec = st.tv_sec;
ki->p_ustime_usec = st.tv_usec;
memcpy(&ru, &p->p_stats->p_ru, sizeof(ru));
rulwps(p, &ru);
ki->p_uru_nvcsw = ru.ru_nvcsw;
ki->p_uru_nivcsw = ru.ru_nivcsw;
ki->p_uru_maxrss = ru.ru_maxrss;
ki->p_uru_ixrss = ru.ru_ixrss;
ki->p_uru_idrss = ru.ru_idrss;
ki->p_uru_isrss = ru.ru_isrss;
ki->p_uru_minflt = ru.ru_minflt;
ki->p_uru_majflt = ru.ru_majflt;
ki->p_uru_nswap = ru.ru_nswap;
ki->p_uru_inblock = ru.ru_inblock;
ki->p_uru_oublock = ru.ru_oublock;
ki->p_uru_msgsnd = ru.ru_msgsnd;
ki->p_uru_msgrcv = ru.ru_msgrcv;
ki->p_uru_nsignals = ru.ru_nsignals;
timeradd(&p->p_stats->p_cru.ru_utime,
&p->p_stats->p_cru.ru_stime, &ut);
ki->p_uctime_sec = ut.tv_sec;
ki->p_uctime_usec = ut.tv_usec;
}
}
int
proc_find_locked(struct lwp *l, struct proc **p, pid_t pid)
{
int error;
mutex_enter(&proc_lock);
if (pid == -1)
*p = l->l_proc;
else
*p = proc_find(pid);
if (*p == NULL) {
if (pid != -1)
mutex_exit(&proc_lock);
return ESRCH;
}
if (pid != -1)
mutex_enter((*p)->p_lock);
mutex_exit(&proc_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_CANSEE, *p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
if (error) {
if (pid != -1)
mutex_exit((*p)->p_lock);
}
return error;
}
static int
fill_pathname(struct lwp *l, pid_t pid, void *oldp, size_t *oldlenp)
{
int error;
struct proc *p;
if ((error = proc_find_locked(l, &p, pid)) != 0)
return error;
if (p->p_path == NULL) {
if (pid != -1)
mutex_exit(p->p_lock);
return ENOENT;
}
size_t len = strlen(p->p_path) + 1;
if (oldp != NULL) {
size_t copylen = uimin(len, *oldlenp);
error = sysctl_copyout(l, p->p_path, oldp, copylen);
if (error == 0 && *oldlenp < len)
error = ENOSPC;
}
*oldlenp = len;
if (pid != -1)
mutex_exit(p->p_lock);
return error;
}
static int
fill_cwd(struct lwp *l, pid_t pid, void *oldp, size_t *oldlenp)
{
int error;
struct proc *p;
char *path;
char *bp, *bend;
struct cwdinfo *cwdi;
struct vnode *vp;
size_t len, lenused;
if ((error = proc_find_locked(l, &p, pid)) != 0)
return error;
len = MAXPATHLEN * 4;
path = kmem_alloc(len, KM_SLEEP);
bp = &path[len];
bend = bp;
*(--bp) = '\0';
cwdi = p->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
vp = cwdi->cwdi_cdir;
error = getcwd_common(vp, NULL, &bp, path, len/2, 0, l);
rw_exit(&cwdi->cwdi_lock);
if (error)
goto out;
lenused = bend - bp;
if (oldp != NULL) {
size_t copylen = uimin(lenused, *oldlenp);
error = sysctl_copyout(l, bp, oldp, copylen);
if (error == 0 && *oldlenp < lenused)
error = ENOSPC;
}
*oldlenp = lenused;
out:
if (pid != -1)
mutex_exit(p->p_lock);
kmem_free(path, len);
return error;
}
int
proc_getauxv(struct proc *p, void **buf, size_t *len)
{
struct ps_strings pss;
int error;
void *uauxv, *kauxv;
size_t size;
if ((error = copyin_psstrings(p, &pss)) != 0)
return error;
if (pss.ps_envstr == NULL)
return EIO;
size = p->p_execsw->es_arglen;
if (size == 0)
return EIO;
size_t ptrsz = PROC_PTRSZ(p);
uauxv = (void *)((char *)pss.ps_envstr + (pss.ps_nenvstr + 1) * ptrsz);
kauxv = kmem_alloc(size, KM_SLEEP);
error = copyin_proc(p, uauxv, kauxv, size);
if (error) {
kmem_free(kauxv, size);
return error;
}
*buf = kauxv;
*len = size;
return 0;
}
static int
sysctl_security_expose_address(SYSCTLFN_ARGS)
{
int expose_address, error;
struct sysctlnode node;
node = *rnode;
node.sysctl_data = &expose_address;
expose_address = *(int *)rnode->sysctl_data;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_KERNADDR,
0, NULL, NULL, NULL))
return EPERM;
switch (expose_address) {
case 0:
case 1:
case 2:
break;
default:
return EINVAL;
}
*(int *)rnode->sysctl_data = expose_address;
return 0;
}
bool
get_expose_address(struct proc *p)
{
/* allow only if sysctl variable is set or privileged */
return kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE,
p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_KPTR), NULL, NULL) == 0;
}
/* $NetBSD: subr_iostat.c,v 1.25 2019/05/22 08:47:02 hannken Exp $ */
/* NetBSD: subr_disk.c,v 1.69 2005/05/29 22:24:15 christos Exp */
/*-
* Copyright (c) 1996, 1997, 1999, 2000, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_iostat.c,v 1.25 2019/05/22 08:47:02 hannken Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/iostat.h>
#include <sys/sysctl.h>
#include <sys/rwlock.h>
/*
* Function prototypes for sysctl nodes
*/
static int sysctl_hw_disknames(SYSCTLFN_PROTO);
static int sysctl_hw_iostatnames(SYSCTLFN_PROTO);
static int sysctl_hw_iostats(SYSCTLFN_PROTO);
static int
iostati_getnames(int disk_only, char *oldp, size_t *oldlenp, const void *newp,
u_int namelen);
/*
* A global list of all drives attached to the system. May grow or
* shrink over time.
*/
struct iostatlist_head iostatlist = TAILQ_HEAD_INITIALIZER(iostatlist);
int iostat_count; /* number of drives in global drivelist */
krwlock_t iostatlist_lock;
static void sysctl_io_stats_setup(struct sysctllog **);
/*
* Initialise the iostat subsystem.
*/
void
iostat_init(void)
{
rw_init(&iostatlist_lock);
sysctl_io_stats_setup(NULL);
}
/*
* Searches the iostatlist for the iostat corresponding to the
* name provided.
*/
struct io_stats *
iostat_find(const char *name)
{
struct io_stats *iostatp;
KASSERT(name != NULL);
rw_enter(&iostatlist_lock, RW_READER);
TAILQ_FOREACH(iostatp, &iostatlist, io_link) {
if (strcmp(iostatp->io_name, name) == 0) {
break;
}
}
rw_exit(&iostatlist_lock);
return iostatp;
}
/*
* Allocate and initialise memory for the i/o statistics.
*/
struct io_stats *
iostat_alloc(int32_t type, void *parent, const char *name)
{
struct io_stats *stats;
stats = kmem_zalloc(sizeof(*stats), KM_SLEEP);
stats->io_type = type;
stats->io_parent = parent;
(void)strlcpy(stats->io_name, name, sizeof(stats->io_name));
/*
* Set the attached timestamp.
*/
getmicrouptime(&stats->io_attachtime);
/*
* Link into the drivelist.
*/
rw_enter(&iostatlist_lock, RW_WRITER);
TAILQ_INSERT_TAIL(&iostatlist, stats, io_link);
iostat_count++;
rw_exit(&iostatlist_lock);
return stats;
}
/*
* Remove i/o from stats collection.
*/
void
iostat_free(struct io_stats *stats)
{
/*
* Remove from the iostat list.
*/
if (iostat_count == 0)
panic("iostat_free: iostat_count == 0");
rw_enter(&iostatlist_lock, RW_WRITER);
TAILQ_REMOVE(&iostatlist, stats, io_link);
iostat_count--;
rw_exit(&iostatlist_lock);
kmem_free(stats, sizeof(*stats));
}
/*
* Rename i/o stats.
*/
void
iostat_rename(struct io_stats *stats, const char *name)
{
rw_enter(&iostatlist_lock, RW_WRITER);
(void)strlcpy(stats->io_name, name, sizeof(stats->io_name));
rw_exit(&iostatlist_lock);
}
/*
* multiply timeval by unsigned integer and add to result
*/
static void
timermac(struct timeval *a, uint64_t count, struct timeval *res)
{
struct timeval part = *a;
while (count) {
if (count & 1)
timeradd(res, &part, res);
timeradd(&part, &part, &part);
count >>= 1;
}
}
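/*
 * Worked example: with count == 5 (binary 101) the loop above adds 1*a on
 * the first iteration, skips the second (bit clear), and adds 4*a on the
 * third, so res gains 5*a in total; a shift-and-add multiply on timevals.
 */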
/*
* Increment the iostat wait counter.
* Accumulate wait time and timesum.
*
* Wait time is spent in the device bufq.
*/
void
iostat_wait(struct io_stats *stats)
{
struct timeval dv_time, diff_time;
int32_t count;
KASSERT(stats->io_wait >= 0);
getmicrouptime(&dv_time);
timersub(&dv_time, &stats->io_waitstamp, &diff_time);
count = stats->io_wait++;
if (count != 0) {
timermac(&diff_time, count, &stats->io_waitsum);
timeradd(&stats->io_waittime, &diff_time, &stats->io_waittime);
}
stats->io_waitstamp = dv_time;
}
/*
* Decrement the iostat wait counter.
* Increment the iostat busy counter.
* Accumulate wait and busy times and timesums.
*
* Busy time is spent being processed by the device.
*
* Old devices do not yet measure wait time, so skip
* processing it if the counter is still zero.
*/
void
iostat_busy(struct io_stats *stats)
{
struct timeval dv_time, diff_time;
int32_t count;
KASSERT(stats->io_wait >= 0); /* > 0 when iostat_wait is used */
KASSERT(stats->io_busy >= 0);
getmicrouptime(&dv_time);
timersub(&dv_time, &stats->io_waitstamp, &diff_time);
if (stats->io_wait != 0) {
count = stats->io_wait--;
timermac(&diff_time, count, &stats->io_waitsum);
timeradd(&stats->io_waittime, &diff_time, &stats->io_waittime);
}
stats->io_waitstamp = dv_time;
timersub(&dv_time, &stats->io_busystamp, &diff_time);
count = stats->io_busy++;
if (count != 0) {
timermac(&diff_time, count, &stats->io_busysum);
timeradd(&stats->io_busytime, &diff_time, &stats->io_busytime);
}
stats->io_busystamp = dv_time;
}
/*
* Decrement the iostat busy counter, increment the byte count.
* Accumulate busy time and timesum.
*/
void
iostat_unbusy(struct io_stats *stats, long bcount, int read)
{
struct timeval dv_time, diff_time;
int32_t count;
KASSERT(stats->io_busy > 0);
getmicrouptime(&dv_time);
stats->io_timestamp = dv_time;
/* any op */
timersub(&dv_time, &stats->io_busystamp, &diff_time);
count = stats->io_busy--;
timermac(&diff_time, count, &stats->io_busysum);
timeradd(&stats->io_busytime, &diff_time, &stats->io_busytime);
stats->io_busystamp = dv_time;
if (bcount > 0) {
if (read) {
stats->io_rbytes += bcount;
stats->io_rxfer++;
} else {
stats->io_wbytes += bcount;
stats->io_wxfer++;
}
}
}
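/*
 * Hedged usage sketch for a block driver: allocate a handle at attach time
 * and bracket each transfer with the hooks above.  The softc field names and
 * buffer fields here are illustrative only.
 */
#if 0
	/* At attach: */
	sc->sc_stats = iostat_alloc(IOSTAT_DISK, sc, device_xname(sc->sc_dev));

	/* When a buffer is queued on the bufq: */
	iostat_wait(sc->sc_stats);

	/* When the transfer actually starts on the hardware: */
	iostat_busy(sc->sc_stats);

	/* On completion, with bp the finished buffer: */
	iostat_unbusy(sc->sc_stats, bp->b_bcount - bp->b_resid,
	    (bp->b_flags & B_READ) != 0);

	/* At detach: */
	iostat_free(sc->sc_stats);
#endif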
/*
* Return non-zero if a device has an I/O request in flight.
*/
bool
iostat_isbusy(struct io_stats *stats)
{
return stats->io_busy != 0;
}
/*
* Increment the seek counter. This does look almost redundant but it
* abstracts the stats gathering.
*/
void
iostat_seek(struct io_stats *stats)
{
stats->io_seek++;
}
static int
sysctl_hw_disknames(SYSCTLFN_ARGS)
{
return iostati_getnames(1, oldp, oldlenp, newp, namelen);
}
static int
sysctl_hw_iostatnames(SYSCTLFN_ARGS)
{
return iostati_getnames(0, oldp, oldlenp, newp, namelen);
}
static int
iostati_getnames(int disk_only, char *oldp, size_t *oldlenp, const void *newp,
u_int namelen)
{
char bf[IOSTATNAMELEN + 1];
char *where = oldp;
struct io_stats *stats;
size_t needed, left, slen;
int error, first;
if (newp != NULL)
return (EPERM);
if (namelen != 0)
return (EINVAL);
first = 1;
error = 0;
needed = 0;
left = *oldlenp;
rw_enter(&iostatlist_lock, RW_READER);
for (stats = TAILQ_FIRST(&iostatlist); stats != NULL;
stats = TAILQ_NEXT(stats, io_link)) {
if ((disk_only == 1) && (stats->io_type != IOSTAT_DISK))
continue;
if (where == NULL)
needed += strlen(stats->io_name) + 1;
else {
memset(bf, 0, sizeof(bf));
if (first) {
strncpy(bf, stats->io_name, sizeof(bf));
first = 0;
} else {
bf[0] = ' ';
strncpy(bf + 1, stats->io_name,
sizeof(bf) - 1);
}
bf[IOSTATNAMELEN] = '\0';
slen = strlen(bf);
if (left < slen + 1)
break;
/* +1 to copy out the trailing NUL byte */
error = copyout(bf, where, slen + 1);
if (error)
break;
where += slen;
needed += slen;
left -= slen;
}
}
rw_exit(&iostatlist_lock);
*oldlenp = needed;
return (error);
}
static int
sysctl_hw_iostats(SYSCTLFN_ARGS)
{
struct io_sysctl sdrive;
struct io_stats *stats;
char *where = oldp;
size_t tocopy, left;
int error;
if (newp != NULL)
return (EPERM);
/*
* The original hw.diskstats call was broken and did not require
* the userland to pass in its size of struct disk_sysctl. This
* was fixed after NetBSD 1.6 was released.
*/
if (namelen == 0)
tocopy = offsetof(struct io_sysctl, busy);
else
tocopy = name[0];
if (where == NULL) {
*oldlenp = iostat_count * tocopy;
return (0);
}
error = 0;
left = *oldlenp;
memset(&sdrive, 0, sizeof(sdrive));
*oldlenp = 0;
rw_enter(&iostatlist_lock, RW_READER);
TAILQ_FOREACH(stats, &iostatlist, io_link) {
if (left < tocopy)
break;
strncpy(sdrive.name, stats->io_name, sizeof(sdrive.name));
sdrive.attachtime_sec = stats->io_attachtime.tv_sec;
sdrive.attachtime_usec = stats->io_attachtime.tv_usec;
sdrive.timestamp_sec = stats->io_busystamp.tv_sec;
sdrive.timestamp_usec = stats->io_busystamp.tv_usec;
sdrive.time_sec = stats->io_busytime.tv_sec;
sdrive.time_usec = stats->io_busytime.tv_usec;
sdrive.seek = stats->io_seek;
sdrive.rxfer = stats->io_rxfer;
sdrive.wxfer = stats->io_wxfer;
sdrive.xfer = stats->io_rxfer + stats->io_wxfer;
sdrive.rbytes = stats->io_rbytes;
sdrive.wbytes = stats->io_wbytes;
sdrive.bytes = stats->io_rbytes + stats->io_wbytes;
sdrive.wait_sec = stats->io_waittime.tv_sec;
sdrive.wait_usec = stats->io_waittime.tv_usec;
sdrive.time_sec = stats->io_busytime.tv_sec;
sdrive.time_usec = stats->io_busytime.tv_usec;
sdrive.waitsum_sec = stats->io_waitsum.tv_sec;
sdrive.waitsum_usec = stats->io_waitsum.tv_usec;
sdrive.busysum_sec = stats->io_busysum.tv_sec;
sdrive.busysum_usec = stats->io_busysum.tv_usec;
sdrive.busy = stats->io_busy;
error = copyout(&sdrive, where, uimin(tocopy, sizeof(sdrive)));
if (error)
break;
where += tocopy;
*oldlenp += tocopy;
left -= tocopy;
}
rw_exit(&iostatlist_lock);
return (error);
}
static void
sysctl_io_stats_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "disknames",
SYSCTL_DESCR("List of disk drives present"),
sysctl_hw_disknames, 0, NULL, 0,
CTL_HW, HW_DISKNAMES, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "iostatnames",
SYSCTL_DESCR("I/O stats are being collected for these"
" devices"),
sysctl_hw_iostatnames, 0, NULL, 0,
CTL_HW, HW_IOSTATNAMES, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "iostats",
SYSCTL_DESCR("Statistics on device I/O operations"),
sysctl_hw_iostats, 0, NULL, 0,
CTL_HW, HW_IOSTATS, CTL_EOL);
}
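/*
 * Userland usage sketch (hedged, illustrative only): the hw.disknames node
 * created above returns a space-separated list of names; size the buffer
 * first, then fetch.
 */
#if 0
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t len;
	char *names;

	if (sysctlbyname("hw.disknames", NULL, &len, NULL, 0) == -1)
		return 1;
	if ((names = malloc(len)) == NULL)
		return 1;
	if (sysctlbyname("hw.disknames", names, &len, NULL, 0) == -1)
		return 1;
	printf("%s\n", names);
	free(names);
	return 0;
}
#endif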
/* $NetBSD: raw_usrreq.c,v 1.65 2022/09/02 23:48:11 thorpej Exp $ */
/*
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_usrreq.c 8.1 (Berkeley) 6/10/93
*/
/*
* Raw protocol interface.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: raw_usrreq.c,v 1.65 2022/09/02 23:48:11 thorpej Exp $");
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/route.h>
#include <net/raw_cb.h>
static inline int
equal(const struct sockaddr *a1, const struct sockaddr *a2)
{
return memcmp(a1, a2, a1->sa_len) == 0;
}
/*
* raw_input: find the socket associated with the packet and move it over.
* If nothing exists for this packet, drop it.
*/
void
raw_input(struct mbuf *m0, struct sockproto *proto, struct sockaddr *src,
struct sockaddr *dst, struct rawcbhead *rawcbhead)
{
struct rawcb *rp;
struct mbuf *m = m0;
struct socket *last;
last = NULL;
LIST_FOREACH(rp, rawcbhead, rcb_list) {
if (rp->rcb_proto.sp_family != proto->sp_family)
continue;
if (rp->rcb_proto.sp_protocol &&
rp->rcb_proto.sp_protocol != proto->sp_protocol)
continue;
/*
* We assume the lower level routines have
* placed the address in a canonical format
* suitable for a structure comparison.
*
* Note that if the lengths are not the same
* the comparison will fail at the first byte.
*/
if (rp->rcb_laddr && !equal(rp->rcb_laddr, dst))
continue;
if (rp->rcb_faddr && !equal(rp->rcb_faddr, src))
continue;
/* Run any filtering that may have been installed. */
if (rp->rcb_filter != NULL && rp->rcb_filter(m, proto, rp) != 0)
continue;
if (last != NULL) {
struct mbuf *n;
if ((n = m_copypacket(m, M_DONTWAIT)) == NULL ||
sbappendaddr(&last->so_rcv, src, n, NULL) == 0)
{
if (n != NULL)
m_freem(n);
soroverflow(last);
} else
sorwakeup(last);
}
last = rp->rcb_socket;
}
if (last != NULL) {
if (sbappendaddr(&last->so_rcv, src, m, NULL) == 0) {
m_freem(m);
soroverflow(last);
} else
sorwakeup(last);
} else {
m_freem(m);
}
}
void *
raw_ctlinput(int cmd, const struct sockaddr *arg, void *d)
{
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
return NULL;
/* INCOMPLETE */
}
void
raw_setsockaddr(struct rawcb *rp, struct sockaddr *nam)
{
memcpy(nam, rp->rcb_laddr, rp->rcb_laddr->sa_len);
}
void
raw_setpeeraddr(struct rawcb *rp, struct sockaddr *nam)
{
memcpy(nam, rp->rcb_faddr, rp->rcb_faddr->sa_len);
}
int
raw_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct lwp *l,
int (*output)(struct mbuf *, struct socket *))
{
struct rawcb *rp = sotorawcb(so);
int error = 0;
KASSERT(rp != NULL);
/*
* Ship a packet out. The appropriate raw output
* routine handles any massaging necessary.
*/
if (control && control->m_len) {
m_freem(control);
m_freem(m);
return EINVAL;
}
if (nam) {
if ((so->so_state & SS_ISCONNECTED) != 0) {
error = EISCONN;
goto die;
}
error = (*so->so_proto->pr_usrreqs->pr_connect)(so, nam, l);
if (error) {
die:
m_freem(m);
return error;
}
} else {
if ((so->so_state & SS_ISCONNECTED) == 0) {
error = ENOTCONN;
goto die;
}
}
error = (*output)(m, so);
if (nam)
raw_disconnect(rp);
return error;
}
int
raw_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
struct mbuf *control, struct lwp *l)
{
KASSERT(req != PRU_ATTACH);
KASSERT(req != PRU_DETACH);
KASSERT(req != PRU_ACCEPT);
KASSERT(req != PRU_BIND);
KASSERT(req != PRU_LISTEN);
KASSERT(req != PRU_CONNECT);
KASSERT(req != PRU_CONNECT2);
KASSERT(req != PRU_DISCONNECT);
KASSERT(req != PRU_SHUTDOWN);
KASSERT(req != PRU_ABORT);
KASSERT(req != PRU_CONTROL);
KASSERT(req != PRU_SENSE);
KASSERT(req != PRU_PEERADDR);
KASSERT(req != PRU_SOCKADDR);
KASSERT(req != PRU_RCVD);
KASSERT(req != PRU_RCVOOB);
KASSERT(req != PRU_SEND);
KASSERT(req != PRU_SENDOOB);
KASSERT(req != PRU_PURGEIF);
if (sotorawcb(so) == NULL)
return EINVAL;
panic("raw_usrreq");
return 0;
}
/* $NetBSD: ufs_bmap.c,v 1.54 2022/11/17 06:40:40 chs Exp $ */
/*
* Copyright (c) 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.8 (Berkeley) 8/11/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_bmap.c,v 1.54 2022/11/17 06:40:40 chs Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/trace.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
static bool
ufs_issequential(const struct ufsmount *ump, daddr_t daddr0, daddr_t daddr1)
{
/* For UFS, blocks in a hole are not 'contiguous'. */
if (daddr0 == 0)
return false;
return (daddr0 + ump->um_seqinc == daddr1);
}
/*
* Bmap converts the logical block number of a file to its physical block
* number on the disk. The conversion is done by using the logical block
* number to index into the array of block pointers described by the dinode.
*/
int
ufs_bmap(void *v)
{
struct vop_bmap_args /* {
struct vnode *a_vp;
daddr_t a_bn;
struct vnode **a_vpp;
daddr_t *a_bnp;
int *a_runp;
} */ *ap = v;
int error;
/*
* Check for underlying vnode requests and ensure that logical
* to physical mapping is requested.
*/
if (ap->a_vpp != NULL)
*ap->a_vpp = VTOI(ap->a_vp)->i_devvp;
if (ap->a_bnp == NULL)
return (0);
error = ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL,
ap->a_runp, ufs_issequential);
return error;
}
/*
* Indirect blocks are now on the vnode for the file. They are given negative
* logical block numbers. Indirect blocks are addressed by the negative
* address of the first data block to which they point. Double indirect blocks
* are addressed by one less than the address of the first indirect block to
* which they point. Triple indirect blocks are addressed by one less than
* the address of the first double indirect block to which they point.
*
* ufs_bmaparray does the bmap conversion, and if requested returns the
* array of logical blocks which must be traversed to get to a block.
* Each entry contains the offset into that block that gets you to the
* next block and the disk address of the block (if it is assigned).
*/
int
ufs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap,
int *nump, int *runp, ufs_issequential_callback_t is_sequential)
{
struct inode *ip;
struct buf *bp, *cbp;
struct ufsmount *ump;
struct mount *mp;
struct indir a[UFS_NIADDR + 1], *xap;
daddr_t daddr;
daddr_t metalbn;
int error, maxrun = 0, num;
ip = VTOI(vp);
mp = vp->v_mount;
ump = ip->i_ump;
KASSERTMSG(((ap == NULL) == (nump == NULL)),
"ufs_bmaparray: invalid arguments: ap = %p, nump = %p", ap, nump);
if (runp) {
/*
* XXX
* If MAXBSIZE is the largest transfer the disks can handle,
* we probably want maxrun to be 1 block less so that we
* don't create a block larger than the device can handle.
*/
*runp = 0;
maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1;
}
if (bn >= 0 && bn < UFS_NDADDR) {
if (nump != NULL)
*nump = 0;
if (ump->um_fstype == UFS1)
daddr = ufs_rw32(ip->i_ffs1_db[bn],
UFS_MPNEEDSWAP(ump));
else
daddr = ufs_rw64(ip->i_ffs2_db[bn],
UFS_MPNEEDSWAP(ump));
*bnp = blkptrtodb(ump, daddr);
/*
* Since this is FFS independent code, we are out of
* scope for the definitions of BLK_NOCOPY and
* BLK_SNAP, but we do know that they will fall in
* the range 1..um_seqinc, so we use that test and
* return a request for a zeroed out buffer if attempts
* are made to read a BLK_NOCOPY or BLK_SNAP block.
*/
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT && daddr > 0 &&
daddr < ump->um_seqinc) {
*bnp = -1;
} else if (*bnp == 0) {
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))
== SF_SNAPSHOT) {
*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
} else {
*bnp = -1;
}
} else if (runp) {
if (ump->um_fstype == UFS1) {
for (++bn; bn < UFS_NDADDR && *runp < maxrun &&
is_sequential(ump,
ufs_rw32(ip->i_ffs1_db[bn - 1],
UFS_MPNEEDSWAP(ump)),
ufs_rw32(ip->i_ffs1_db[bn],
UFS_MPNEEDSWAP(ump)));
++bn, ++*runp);
} else {
for (++bn; bn < UFS_NDADDR && *runp < maxrun &&
is_sequential(ump,
ufs_rw64(ip->i_ffs2_db[bn - 1],
UFS_MPNEEDSWAP(ump)),
ufs_rw64(ip->i_ffs2_db[bn],
UFS_MPNEEDSWAP(ump)));
++bn, ++*runp);
}
}
return (0);
} else if (bn < 0 && bn >= -UFS_NXADDR) {
KASSERT(ump->um_fstype == UFS2 && (ump->um_flags & UFS_EA) != 0);
daddr = ufs_rw64(ip->i_ffs2_extb[-1 - bn], UFS_MPNEEDSWAP(ump));
*bnp = blkptrtodb(ump, daddr);
if (*bnp == 0)
*bnp = -1;
return 0;
}
xap = ap == NULL ? a : ap;
if (!nump)
nump = #
if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0)
return (error);
num = *nump;
/* Get disk address out of indirect block array */
if (ump->um_fstype == UFS1)
daddr = ufs_rw32(ip->i_ffs1_ib[xap->in_off],
UFS_MPNEEDSWAP(ump));
else
daddr = ufs_rw64(ip->i_ffs2_ib[xap->in_off],
UFS_MPNEEDSWAP(ump));
for (bp = NULL, ++xap; --num; ++xap) {
/*
* Exit the loop if there is no disk address assigned yet and
* the indirect block isn't in the cache, or if we were
* looking for an indirect block and we've found it.
*/
metalbn = xap->in_lbn;
if (metalbn == bn)
break;
if (daddr == 0) {
mutex_enter(&bufcache_lock);
cbp = incore(vp, metalbn);
mutex_exit(&bufcache_lock);
if (cbp == NULL)
break;
}
/*
* If we get here, we've either got the block in the cache
* or we have a disk address for it, go fetch it.
*/
if (bp)
brelse(bp, 0);
xap->in_exists = 1;
bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0);
if (bp == NULL) {
/*
* getblk() above returns NULL only if we are the
* pagedaemon.  See the implementation of getblk()
* for details.
*/
return (ENOMEM);
}
if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
trace(TR_BREADHIT, pack(vp, size), metalbn);
} else {
KASSERTMSG((daddr != 0),
"ufs_bmaparray: indirect block not in cache");
trace(TR_BREADMISS, pack(vp, size), metalbn);
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
VOP_STRATEGY(vp, bp);
curlwp->l_ru.ru_inblock++; /* XXX */
if ((error = biowait(bp)) != 0) {
brelse(bp, 0);
return (error);
}
}
if (ump->um_fstype == UFS1) {
daddr = ufs_rw32(((u_int32_t *)bp->b_data)[xap->in_off],
UFS_MPNEEDSWAP(ump));
if (num == 1 && daddr && runp) {
for (bn = xap->in_off + 1;
bn < MNINDIR(ump) && *runp < maxrun &&
is_sequential(ump,
ufs_rw32(((int32_t *)bp->b_data)[bn-1],
UFS_MPNEEDSWAP(ump)),
ufs_rw32(((int32_t *)bp->b_data)[bn],
UFS_MPNEEDSWAP(ump)));
++bn, ++*runp);
}
} else {
daddr = ufs_rw64(((u_int64_t *)bp->b_data)[xap->in_off],
UFS_MPNEEDSWAP(ump));
if (num == 1 && daddr && runp) {
for (bn = xap->in_off + 1;
bn < MNINDIR(ump) && *runp < maxrun &&
is_sequential(ump,
ufs_rw64(((int64_t *)bp->b_data)[bn-1],
UFS_MPNEEDSWAP(ump)),
ufs_rw64(((int64_t *)bp->b_data)[bn],
UFS_MPNEEDSWAP(ump)));
++bn, ++*runp);
}
}
}
if (bp)
brelse(bp, 0);
/*
* Since this is FFS independent code, we are out of scope for the
* definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
* will fall in the range 1..um_seqinc, so we use that test and
* return a request for a zeroed out buffer if attempts are made
* to read a BLK_NOCOPY or BLK_SNAP block.
*/
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT &&
daddr > 0 && daddr < ump->um_seqinc) {
*bnp = -1;
return (0);
}
*bnp = blkptrtodb(ump, daddr);
if (*bnp == 0) {
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))
== SF_SNAPSHOT) {
*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
} else {
*bnp = -1;
}
}
return (0);
}
/*
* Create an array of logical block number/offset pairs which represent the
* path of indirect blocks required to access a data block. The first "pair"
* contains the logical block number of the appropriate single, double or
* triple indirect block and the offset into the inode indirect block array.
* Note, the logical block number of the inode single/double/triple indirect
* block appears twice in the array, once with the offset into the i_ffs1_ib and
* once with the offset into the page itself.
*/
int
ufs_getlbns(struct vnode *vp, daddr_t bn, struct indir *ap, int *nump)
{
daddr_t metalbn, realbn;
struct ufsmount *ump;
int64_t blockcnt;
int lbc;
int i, numlevels, off;
ump = VFSTOUFS(vp->v_mount);
if (nump)
*nump = 0;
numlevels = 0;
realbn = bn;
if (bn < 0)
bn = -bn;
KASSERT(bn >= UFS_NDADDR);
/*
* Determine the number of levels of indirection. After this loop
* is done, blockcnt indicates the number of data blocks possible
* at the given level of indirection, and UFS_NIADDR - i is the number
* of levels of indirection needed to locate the requested block.
*/
bn -= UFS_NDADDR;
for (lbc = 0, i = UFS_NIADDR;; i--, bn -= blockcnt) {
if (i == 0)
return (EFBIG);
lbc += ump->um_lognindir;
blockcnt = (int64_t)1 << lbc;
if (bn < blockcnt)
break;
}
/* Calculate the address of the first meta-block. */
metalbn = -((realbn >= 0 ? realbn : -realbn) - bn + UFS_NIADDR - i);
/*
* At each iteration, off is the offset into the bap array which is
* an array of disk addresses at the current level of indirection.
* The logical block number and the offset in that block are stored
* into the argument array.
*/
ap->in_lbn = metalbn;
ap->in_off = off = UFS_NIADDR - i;
ap->in_exists = 0;
ap++;
for (++numlevels; i <= UFS_NIADDR; i++) {
/* If searching for a meta-data block, quit when found. */
if (metalbn == realbn)
break;
lbc -= ump->um_lognindir;
off = (bn >> lbc) & (MNINDIR(ump) - 1);
++numlevels;
ap->in_lbn = metalbn;
ap->in_off = off;
ap->in_exists = 0;
++ap;
metalbn -= -1 + ((int64_t)off << lbc);
}
if (nump)
*nump = numlevels;
return (0);
}
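/*
 * Worked example (hedged, assuming UFS_NDADDR == 12 and MNINDIR(ump) ==
 * 4096): for bn == 12, the first singly-indirected data block, the loops
 * above find one level of indirection and return numlevels == 2 with
 * entries { -12, 0 } (the single indirect block, offset 0 into i_ffs1_ib)
 * and { -12, 0 } (the same block, offset 0 within its contents), matching
 * the note above that the inode-level indirect block appears twice.
 */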
/* $NetBSD: null_vfsops.c,v 1.101 2023/02/06 10:32:58 hannken Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp
* from: @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92
* @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95
*/
/*
* Null file-system: VFS operations.
*
* See null_vnops.c for a description.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: null_vfsops.c,v 1.101 2023/02/06 10:32:58 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/module.h>
#include <miscfs/nullfs/null.h>
#include <miscfs/genfs/layer_extern.h>
MODULE(MODULE_CLASS_VFS, null, "layerfs");
VFS_PROTOS(nullfs);
int
nullfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct vnode *lowerrootvp, *vp;
struct null_args *args = data;
struct null_mount *nmp;
struct layer_mount *lmp;
struct pathbuf *pb;
struct nameidata nd;
int error;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof(*args))
return EINVAL;
if (mp->mnt_flag & MNT_GETARGS) {
lmp = MOUNTTOLAYERMOUNT(mp);
if (lmp == NULL)
return EIO;
args->la.target = NULL;
*data_len = sizeof(*args);
return 0;
}
/* Update is not supported. */
if (mp->mnt_flag & MNT_UPDATE)
return EOPNOTSUPP;
/* Find the lower vnode and lock it. */
error = pathbuf_copyin(args->la.target, &pb);
if (error) {
return error;
}
NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, pb);
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
return error;
}
lowerrootvp = nd.ni_vp;
pathbuf_destroy(pb);
/* Create the mount point. */
nmp = kmem_zalloc(sizeof(struct null_mount), KM_SLEEP);
mp->mnt_data = nmp;
mp->mnt_iflag |= lowerrootvp->v_mount->mnt_iflag & IMNT_MPSAFE;
mp->mnt_iflag |= lowerrootvp->v_mount->mnt_iflag & IMNT_SHRLOOKUP;
/*
* Make sure that the mount point is sufficiently initialized
* that the node create call will work.
*/
vfs_getnewfsid(mp);
error = vfs_set_lowermount(mp, lowerrootvp->v_mount);
if (error) {
vput(lowerrootvp);
kmem_free(nmp, sizeof(struct null_mount));
return error;
}
nmp->nullm_size = sizeof(struct null_node);
nmp->nullm_tag = VT_NULL;
nmp->nullm_bypass = layer_bypass;
nmp->nullm_vnodeop_p = null_vnodeop_p;
/* Setup a null node for root vnode. */
VOP_UNLOCK(lowerrootvp);
error = layer_node_create(mp, lowerrootvp, &vp);
if (error) {
vrele(lowerrootvp);
kmem_free(nmp, sizeof(struct null_mount));
return error;
}
/*
* Keep a held reference to the root vnode. It will be released on
* umount. Note: nullfs is MP-safe.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_ROOT;
nmp->nullm_rootvp = vp;
VOP_UNLOCK(vp);
error = set_statvfs_info(path, UIO_USERSPACE, args->la.target,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, curlwp);
if (error)
return error;
if (mp->mnt_lower->mnt_flag & MNT_LOCAL)
mp->mnt_flag |= MNT_LOCAL;
return 0;
}
int
nullfs_unmount(struct mount *mp, int mntflags)
{
struct null_mount *nmp = MOUNTTONULLMOUNT(mp);
struct vnode *null_rootvp = nmp->nullm_rootvp;
int error, flags = 0;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if (vrefcnt(null_rootvp) > 1 && (mntflags & MNT_FORCE) == 0)
return EBUSY;
if ((error = vflush(mp, null_rootvp, flags)) != 0)
return error;
/* Eliminate all activity and release the vnode. */
vgone(null_rootvp);
/* Finally, destroy the mount point structures. */
kmem_free(mp->mnt_data, sizeof(struct null_mount));
mp->mnt_data = NULL;
return 0;
}
extern const struct vnodeopv_desc null_vnodeop_opv_desc;
const struct vnodeopv_desc * const nullfs_vnodeopv_descs[] = {
&null_vnodeop_opv_desc,
NULL,
};
struct vfsops nullfs_vfsops = {
.vfs_name = MOUNT_NULL,
.vfs_min_mount_data = sizeof (struct null_args),
.vfs_mount = nullfs_mount,
.vfs_start = layerfs_start,
.vfs_unmount = nullfs_unmount,
.vfs_root = layerfs_root,
.vfs_quotactl = layerfs_quotactl,
.vfs_statvfs = layerfs_statvfs,
.vfs_sync = layerfs_sync,
.vfs_loadvnode = layerfs_loadvnode,
.vfs_vget = layerfs_vget,
.vfs_fhtovp = layerfs_fhtovp,
.vfs_vptofh = layerfs_vptofh,
.vfs_init = layerfs_init,
.vfs_done = layerfs_done,
.vfs_snapshot = layerfs_snapshot,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = layerfs_suspendctl,
.vfs_renamelock_enter = layerfs_renamelock_enter,
.vfs_renamelock_exit = layerfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = nullfs_vnodeopv_descs
};
SYSCTL_SETUP(nullfs_sysctl_setup, "nullfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "null",
SYSCTL_DESCR("Loopback file system"),
NULL, 0, NULL, 0,
CTL_VFS, 9, CTL_EOL);
/*
* XXX the "9" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "9" is the order as taken from sys/mount.h
*/
}
static int
null_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&nullfs_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&nullfs_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return error;
}
/* $NetBSD: subr_pserialize.c,v 1.24 2023/10/04 20:28:06 ad Exp $ */
/*-
* Copyright (c) 2010, 2011, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Passive serialization.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_pserialize.c,v 1.24 2023/10/04 20:28:06 ad Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/evcnt.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pserialize.h>
#include <sys/xcall.h>
struct pserialize {
char psz_dummy;
};
static kmutex_t psz_lock __cacheline_aligned;
static struct evcnt psz_ev_excl __cacheline_aligned =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pserialize", "exclusive access");
EVCNT_ATTACH_STATIC(psz_ev_excl);
/*
* pserialize_init:
*
* Initialize passive serialization structures.
*/
void
pserialize_init(void)
{
mutex_init(&psz_lock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* pserialize_create:
*
* Create and initialize a passive serialization object.
*/
pserialize_t
pserialize_create(void)
{
pserialize_t psz;
psz = kmem_zalloc(sizeof(*psz), KM_SLEEP);
return psz;
}
/*
* pserialize_destroy:
*
* Destroy a passive serialization object.
*/
void
pserialize_destroy(pserialize_t psz)
{
kmem_free(psz, sizeof(*psz));
}
/*
* pserialize_perform:
*
* Perform the write side of passive serialization.
*/
void
pserialize_perform(pserialize_t psz)
{
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
if (__predict_false(panicstr != NULL)) {
return;
}
if (__predict_false(mp_online == false)) {
psz_ev_excl.ev_count++;
return;
}
/*
* Broadcast a NOP to all CPUs and wait until all of them complete.
*/
xc_barrier(XC_HIGHPRI);
mutex_enter(&psz_lock);
psz_ev_excl.ev_count++;
mutex_exit(&psz_lock);
}
int
pserialize_read_enter(void)
{
int s;
s = splsoftserial();
curcpu()->ci_psz_read_depth++;
__insn_barrier();
return s;
}
void
pserialize_read_exit(int s)
{
KASSERT(__predict_false(cold) || kpreempt_disabled());
__insn_barrier();
if (__predict_false(curcpu()->ci_psz_read_depth-- == 0))
panic("mismatching pserialize_read_exit()"); splx(s);
}
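/*
 * Illustrative sketch of the usual pserialize(9) pairing; the list,
 * lock and helper names are hypothetical, not taken from this file.
 * Readers wrap a lockless lookup in a read section; the updater removes
 * the element under its lock, calls pserialize_perform() to wait out
 * current readers, and only then frees the memory.
 *
 *	reader:
 *		s = pserialize_read_enter();
 *		e = lookup_lockless(&list, key);
 *		if (e != NULL)
 *			use(e);
 *		pserialize_read_exit(s);
 *
 *	updater:
 *		mutex_enter(&list_lock);
 *		remove_lockless(&list, e);
 *		pserialize_perform(psz);
 *		mutex_exit(&list_lock);
 *		kmem_free(e, sizeof(*e));
 */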
/*
* pserialize_in_read_section:
*
* True if the caller is in a pserialize read section. To be used
* only for diagnostic assertions where we want to guarantee the
* condition like:
*
* KASSERT(pserialize_in_read_section());
*/
bool
pserialize_in_read_section(void)
{
return kpreempt_disabled() && curcpu()->ci_psz_read_depth > 0;
}
/*
* pserialize_not_in_read_section:
*
* True if the caller is not in a pserialize read section. To be
* used only for diagnostic assertions where we want to guarantee
* the condition like:
*
* KASSERT(pserialize_not_in_read_section());
*/
bool
pserialize_not_in_read_section(void)
{
bool notin;
long pctr;
pctr = lwp_pctr();
notin = __predict_true(curcpu()->ci_psz_read_depth == 0);
/*
* If we had a context switch, we're definitely not in a
* pserialize read section because pserialize read sections
* block preemption.
*/
if (__predict_false(pctr != lwp_pctr()))
notin = true;
return notin;
}
/* $NetBSD: strnlen.c,v 1.2 2014/01/09 11:25:11 apb Exp $ */
/*-
* Copyright (c) 2009 David Schultz <das@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
__RCSID("$NetBSD: strnlen.c,v 1.2 2014/01/09 11:25:11 apb Exp $");
#endif /* LIBC_SCCS and not lint */
/* FreeBSD: src/lib/libc/string/strnlen.c,v 1.1 2009/02/28 06:00:58 das Exp */
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <string.h>
#else
#include <lib/libkern/libkern.h>
#endif
#if !HAVE_STRNLEN
size_t
strnlen(const char *s, size_t maxlen)
{
size_t len;
for (len = 0; len < maxlen; len++, s++) {
if (!*s)
break;
}
return (len);
}
#endif /* !HAVE_STRNLEN */
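/*
 * Illustrative sketch (hypothetical buffer): because the scan is bounded
 * by maxlen, strnlen() is safe on fields that may lack a terminating NUL,
 * e.g. printing a fixed-width record field:
 *
 *	char name[16];	(possibly not NUL-terminated)
 *
 *	printf("%.*s\n", (int)strnlen(name, sizeof(name)), name);
 */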
/* $NetBSD: prop_stack.c,v 1.3 2019/05/08 02:25:50 thorpej Exp $ */
/*-
* Copyright (c) 2007 Joerg Sonnenberger <joerg@NetBSD.org>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "prop_object_impl.h"
#include "prop_stack.h"
void
_prop_stack_init(prop_stack_t stack)
{
stack->used_intern_elems = 0;
SLIST_INIT(&stack->extern_elems);
}
bool
_prop_stack_push(prop_stack_t stack, prop_object_t obj, void *data1,
void *data2, void *data3)
{
struct _prop_stack_extern_elem *eelem;
struct _prop_stack_intern_elem *ielem;
if (stack->used_intern_elems == PROP_STACK_INTERN_ELEMS) {
eelem = _PROP_MALLOC(sizeof(*eelem), M_TEMP);
if (eelem == NULL)
return false;
eelem->object = obj;
eelem->object_data[0] = data1;
eelem->object_data[1] = data2;
eelem->object_data[2] = data3;
SLIST_INSERT_HEAD(&stack->extern_elems, eelem, stack_link);
return true;
}
_PROP_ASSERT(stack->used_intern_elems < PROP_STACK_INTERN_ELEMS);
_PROP_ASSERT(SLIST_EMPTY(&stack->extern_elems));
ielem = &stack->intern_elems[stack->used_intern_elems];
ielem->object = obj;
ielem->object_data[0] = data1;
ielem->object_data[1] = data2;
ielem->object_data[2] = data3;
++stack->used_intern_elems;
return true;
}
bool
_prop_stack_pop(prop_stack_t stack, prop_object_t *obj, void **data1,
void **data2, void **data3)
{
struct _prop_stack_extern_elem *eelem;
struct _prop_stack_intern_elem *ielem;
if (stack->used_intern_elems == 0)
return false;
if ((eelem = SLIST_FIRST(&stack->extern_elems)) != NULL) {
_PROP_ASSERT(stack->used_intern_elems == PROP_STACK_INTERN_ELEMS);
SLIST_REMOVE_HEAD(&stack->extern_elems, stack_link);
if (obj)
*obj = eelem->object;
if (data1)
*data1 = eelem->object_data[0];
if (data2)
*data2 = eelem->object_data[1];
if (data3)
*data3 = eelem->object_data[2];
_PROP_FREE(eelem, M_TEMP);
return true;
}
--stack->used_intern_elems;
ielem = &stack->intern_elems[stack->used_intern_elems];
if (obj)
*obj = ielem->object;
if (data1)
*data1 = ielem->object_data[0];
if (data2)
*data2 = ielem->object_data[1];
if (data3)
*data3 = ielem->object_data[2];
return true;
}
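/*
 * Illustrative sketch (hypothetical caller): the stack turns recursive
 * traversal of nested property objects into iteration.  A caller pushes
 * the current object together with up to three words of cursor state
 * before descending, and pops them to resume once the child is done.
 * The variable names below are assumptions about the caller, not code
 * from this file.
 *
 *	struct _prop_stack stack;
 *	prop_object_t obj;
 *	void *cursor;
 *
 *	_prop_stack_init(&stack);
 *	if (!_prop_stack_push(&stack, parent, cursor, NULL, NULL))
 *		return false;
 *	...
 *	while (_prop_stack_pop(&stack, &obj, &cursor, NULL, NULL)) {
 *		resume iteration of 'obj' at 'cursor'
 *	}
 */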
/* $NetBSD: sys_generic.c,v 1.134 2022/07/10 23:12:12 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
*/
/*
* System calls relating to files.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.134 2022/07/10 23:12:12 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/poll.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/ktrace.h>
#include <sys/atomic.h>
#include <sys/disklabel.h>
/*
* Read system call.
*/
/* ARGSUSED */
int
sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(void *) buf;
syscallarg(size_t) nbyte;
} */
file_t *fp;
int fd;
fd = SCARG(uap, fd);
if ((fp = fd_getfile(fd)) == NULL)
return (EBADF);
if ((fp->f_flag & FREAD) == 0) {
fd_putfile(fd);
return (EBADF);
}
/* dofileread() will unuse the descriptor for us */
return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
&fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
int
dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
off_t *offset, int flags, register_t *retval)
{
struct iovec aiov;
struct uio auio;
size_t cnt;
int error;
lwp_t *l;
l = curlwp;
aiov.iov_base = (void *)buf;
aiov.iov_len = nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = nbyte;
auio.uio_rw = UIO_READ;
auio.uio_vmspace = l->l_proc->p_vmspace;
/*
* Reads return ssize_t because -1 is returned on error. Therefore
* we must restrict the length to SSIZE_MAX to avoid garbage return
* values.
*/
if (auio.uio_resid > SSIZE_MAX) {
error = EINVAL;
goto out;
}
cnt = auio.uio_resid;
error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
if (error) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
}
cnt -= auio.uio_resid;
ktrgenio(fd, UIO_READ, buf, cnt, error);
*retval = cnt;
out:
fd_putfile(fd);
return (error);
}
/*
* Scatter read system call.
*/
int
sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const struct iovec *) iovp;
syscallarg(int) iovcnt;
} */
return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
}
int
do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
off_t *offset, int flags, register_t *retval)
{
struct uio auio;
struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
int i, error;
size_t cnt;
u_int iovlen;
struct file *fp;
struct iovec *ktriov = NULL;
if (iovcnt == 0)
return EINVAL;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if ((fp->f_flag & FREAD) == 0) {
fd_putfile(fd);
return EBADF;
}
if (offset == NULL)
offset = &fp->f_offset;
else {
/*
* Caller must not specify &fp->f_offset -- we can't
* safely dereference it for the call to fo_seek
* without holding some underlying object lock.
*/
KASSERT(offset != &fp->f_offset);
if (fp->f_ops->fo_seek == NULL) {
error = ESPIPE;
goto out;
}
error = (*fp->f_ops->fo_seek)(fp, *offset, SEEK_SET, NULL,
0);
if (error != 0)
goto out;
}
iovlen = iovcnt * sizeof(struct iovec);
if (flags & FOF_IOV_SYSSPACE)
iov = __UNCONST(iovp);
else {
iov = aiov;
if ((u_int)iovcnt > UIO_SMALLIOV) {
if ((u_int)iovcnt > IOV_MAX) {
error = EINVAL;
goto out;
}
iov = kmem_alloc(iovlen, KM_SLEEP);
needfree = iov;
}
error = copyin(iovp, iov, iovlen);
if (error)
goto done;
}
auio.uio_iov = iov;
auio.uio_iovcnt = iovcnt;
auio.uio_rw = UIO_READ;
auio.uio_vmspace = curproc->p_vmspace;
auio.uio_resid = 0;
for (i = 0; i < iovcnt; i++, iov++) {
auio.uio_resid += iov->iov_len;
/*
* Reads return ssize_t because -1 is returned on error.
* Therefore we must restrict the length to SSIZE_MAX to
* avoid garbage return values.
*/
if (iov->iov_len > SSIZE_MAX ||
auio.uio_resid > SSIZE_MAX - iov->iov_len) {
error = EINVAL;
goto done;
}
}
/*
* if tracing, save a copy of iovec
*/
if (ktrpoint(KTR_GENIO)) {
ktriov = kmem_alloc(iovlen, KM_SLEEP);
memcpy(ktriov, auio.uio_iov, iovlen);
}
cnt = auio.uio_resid;
error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
if (error) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
}
cnt -= auio.uio_resid;
*retval = cnt;
if (ktriov != NULL) {
ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
kmem_free(ktriov, iovlen);
}
done:
if (needfree)
kmem_free(needfree, iovlen);
out:
fd_putfile(fd);
return (error);
}
/*
* Write system call
*/
int
sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const void *) buf;
syscallarg(size_t) nbyte;
} */
file_t *fp;
int fd;
fd = SCARG(uap, fd);
if ((fp = fd_getfile(fd)) == NULL)
return (EBADF);
if ((fp->f_flag & FWRITE) == 0) {
fd_putfile(fd);
return (EBADF);
}
/* dofilewrite() will unuse the descriptor for us */
return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
&fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
int
dofilewrite(int fd, struct file *fp, const void *buf,
size_t nbyte, off_t *offset, int flags, register_t *retval)
{
struct iovec aiov;
struct uio auio;
size_t cnt;
int error;
aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */
aiov.iov_len = nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = nbyte;
auio.uio_rw = UIO_WRITE;
auio.uio_vmspace = curproc->p_vmspace;
/*
* Writes return ssize_t because -1 is returned on error. Therefore
* we must restrict the length to SSIZE_MAX to avoid garbage return
* values.
*/
if (auio.uio_resid > SSIZE_MAX) {
error = EINVAL;
goto out;
}
cnt = auio.uio_resid;
error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
if (error) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
if (error == EPIPE && !(fp->f_flag & FNOSIGPIPE)) {
mutex_enter(&proc_lock);
psignal(curproc, SIGPIPE);
mutex_exit(&proc_lock);
}
}
cnt -= auio.uio_resid;
ktrgenio(fd, UIO_WRITE, buf, cnt, error);
*retval = cnt;
out:
fd_putfile(fd);
return (error);
}
/*
* Gather write system call
*/
int
sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const struct iovec *) iovp;
syscallarg(int) iovcnt;
} */
return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
}
int
do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
off_t *offset, int flags, register_t *retval)
{
struct uio auio;
struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
int i, error;
size_t cnt;
u_int iovlen;
struct file *fp;
struct iovec *ktriov = NULL;
if (iovcnt == 0)
return EINVAL;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if ((fp->f_flag & FWRITE) == 0) {
fd_putfile(fd);
return EBADF;
}
if (offset == NULL)
offset = &fp->f_offset;
else {
/*
* Caller must not specify &fp->f_offset -- we can't
* safely dereference it for the call to fo_seek
* without holding some underlying object lock.
*/
KASSERT(offset != &fp->f_offset);
if (fp->f_ops->fo_seek == NULL) {
error = ESPIPE;
goto out;
}
error = (*fp->f_ops->fo_seek)(fp, *offset, SEEK_SET, NULL,
0);
if (error != 0)
goto out;
}
iovlen = iovcnt * sizeof(struct iovec);
if (flags & FOF_IOV_SYSSPACE)
iov = __UNCONST(iovp);
else {
iov = aiov;
if ((u_int)iovcnt > UIO_SMALLIOV) {
if ((u_int)iovcnt > IOV_MAX) {
error = EINVAL;
goto out;
}
iov = kmem_alloc(iovlen, KM_SLEEP);
needfree = iov;
}
error = copyin(iovp, iov, iovlen);
if (error)
goto done;
}
auio.uio_iov = iov;
auio.uio_iovcnt = iovcnt;
auio.uio_rw = UIO_WRITE;
auio.uio_vmspace = curproc->p_vmspace;
auio.uio_resid = 0;
for (i = 0; i < iovcnt; i++, iov++) {
auio.uio_resid += iov->iov_len;
/*
* Writes return ssize_t because -1 is returned on error.
* Therefore we must restrict the length to SSIZE_MAX to
* avoid garbage return values.
*/
if (iov->iov_len > SSIZE_MAX ||
auio.uio_resid > SSIZE_MAX - iov->iov_len) {
error = EINVAL;
goto done;
}
}
/*
* if tracing, save a copy of iovec
*/
if (ktrpoint(KTR_GENIO)) {
ktriov = kmem_alloc(iovlen, KM_SLEEP);
memcpy(ktriov, auio.uio_iov, iovlen);
}
cnt = auio.uio_resid;
error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
if (error) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
if (error == EPIPE && !(fp->f_flag & FNOSIGPIPE)) {
mutex_enter(&proc_lock);
psignal(curproc, SIGPIPE);
mutex_exit(&proc_lock);
}
}
cnt -= auio.uio_resid;
*retval = cnt;
if (ktriov != NULL) {
ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
kmem_free(ktriov, iovlen);
}
done:
if (needfree)
kmem_free(needfree, iovlen);
out:
fd_putfile(fd);
return (error);
}
/*
* Ioctl system call
*/
/* ARGSUSED */
int
sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(u_long) com;
syscallarg(void *) data;
} */
struct file *fp;
proc_t *p;
u_long com;
int error;
size_t size, alloc_size;
void *data, *memp;
#define STK_PARAMS 128
u_long stkbuf[STK_PARAMS/sizeof(u_long)];
#if __TMPBIGMAXPARTITIONS > MAXPARTITIONS
size_t zero_last = 0;
#define zero_size(SZ) ((SZ)+zero_last)
#else
#define zero_size(SZ) (SZ)
#endif
memp = NULL;
alloc_size = 0;
error = 0;
p = l->l_proc;
if ((fp = fd_getfile(SCARG(uap, fd))) == NULL)
return (EBADF);
if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
error = EBADF;
com = 0;
goto out;
}
switch (com = SCARG(uap, com)) {
case FIONCLEX:
case FIOCLEX:
fd_set_exclose(l, SCARG(uap, fd), com == FIOCLEX);
goto out;
}
/*
* Interpret high order word to find amount of data to be
* copied to/from the user's address space.
*/
size = IOCPARM_LEN(com);
alloc_size = size;
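/*
 * Worked example (the command name is hypothetical): for an ioctl
 * defined as _IOWR('x', 1, struct foo), IOCGROUP(com) == 'x',
 * IOCPARM_LEN(com) == sizeof(struct foo), and both IOC_IN and IOC_OUT
 * are set, so sizeof(struct foo) bytes are copied in before and copied
 * back out after the fo_ioctl call below.
 */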
/*
* The disklabel is now padded to a multiple of 8 bytes, whereas the old
* disklabel on 32-bit platforms wasn't. This leaves a difference in
* size of 4 bytes between the two, which are otherwise identical.
* To deal with this, we allocate enough space for the new disklabel
* but only copyin/out the smaller amount.
*/
if (IOCGROUP(com) == 'd') {
#if __TMPBIGMAXPARTITIONS > MAXPARTITIONS
u_long ocom = com;
#endif
u_long ncom = com ^ (DIOCGDINFO ^ DIOCGDINFO32);
#if __TMPBIGMAXPARTITIONS > MAXPARTITIONS
/*
* Userland might use a struct disklabel that is bigger than the
* kernel version (historic accident) - allocate the userland
* size and zero the unused part on copyout.
*/
#define DISKLABELLENDIFF (sizeof(struct partition) \
*(__TMPBIGMAXPARTITIONS-MAXPARTITIONS))
#define IOCFIXUP(NIOC) ((NIOC&~(IOCPARM_MASK<<IOCPARM_SHIFT)) | \
(IOCPARM_LEN(NIOC)-DISKLABELLENDIFF)<<IOCPARM_SHIFT)
switch (IOCFIXUP(ocom)) {
case DIOCGDINFO:
case DIOCWDINFO:
case DIOCSDINFO:
case DIOCGDEFLABEL:
com = ncom = IOCFIXUP(ocom);
zero_last = DISKLABELLENDIFF;
size -= DISKLABELLENDIFF;
goto done;
}
#endif
switch (ncom) {
case DIOCGDINFO:
case DIOCWDINFO:
case DIOCSDINFO:
case DIOCGDEFLABEL:
com = ncom;
if (IOCPARM_LEN(DIOCGDINFO32) < IOCPARM_LEN(DIOCGDINFO))
alloc_size = IOCPARM_LEN(DIOCGDINFO);
break;
}
#if __TMPBIGMAXPARTITIONS > MAXPARTITIONS
done: ;
#endif
}
if (size > IOCPARM_MAX) {
error = ENOTTY;
goto out;
}
memp = NULL;
if ((com >> IOCPARM_SHIFT) == 0) {
/* UNIX-style ioctl. */
data = SCARG(uap, data);
} else {
if (alloc_size > sizeof(stkbuf)) {
memp = kmem_alloc(alloc_size, KM_SLEEP);
data = memp;
} else {
data = (void *)stkbuf;
}
if (com&IOC_IN) {
if (size) {
error = copyin(SCARG(uap, data), data, size);
if (error) {
goto out;
}
/*
* The data between size and alloc_size has
* not been overwritten. It shouldn't matter
* but let's clear that anyway.
*/
if (__predict_false(size < alloc_size)) {
memset((char *)data+size, 0,
alloc_size - size);
}
ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data), size, 0);
} else {
*(void **)data = SCARG(uap, data);
}
} else if ((com&IOC_OUT) && size) {
/*
* Zero the buffer so the user always
* gets back something deterministic.
*/
memset(data, 0, zero_size(size));
} else if (com&IOC_VOID) {
*(void **)data = SCARG(uap, data);
}
}
switch (com) {
case FIONBIO:
/* XXX Code block is not atomic */
if (*(int *)data != 0)
atomic_or_uint(&fp->f_flag, FNONBLOCK);
else
atomic_and_uint(&fp->f_flag, ~FNONBLOCK);
error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data);
break;
case FIOASYNC:
/* XXX Code block is not atomic */
if (*(int *)data != 0)
atomic_or_uint(&fp->f_flag, FASYNC);
else
atomic_and_uint(&fp->f_flag, ~FASYNC);
error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data);
break;
default:
error = (*fp->f_ops->fo_ioctl)(fp, com, data);
/*
* Copy any data to user, size was
* already set and checked above.
*/
if (error == 0 && (com&IOC_OUT) && size) {
error = copyout(data, SCARG(uap, data),
zero_size(size));
ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
size, error);
}
break;
}
out:
if (memp)
kmem_free(memp, alloc_size);
fd_putfile(SCARG(uap, fd));
switch (error) {
case -1:
printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
"pid=%d comm=%s\n",
(com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
(char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
p->p_pid, p->p_comm);
/* FALLTHROUGH */
case EPASSTHROUGH:
error = ENOTTY;
/* FALLTHROUGH */
default:
return (error);
}
}
/* $NetBSD: sys_ptrace.c,v 1.12 2022/07/10 14:07:55 riastradh Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_ptrace.c,v 1.12 2022/07/10 14:07:55 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ptrace.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/exec.h>
#include <sys/pax.h>
#include <sys/ptrace.h>
#include <sys/uio.h>
#include <sys/ras.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>
#include <sys/syscall.h>
#include <sys/module.h>
#include <uvm/uvm_extern.h>
#include <machine/reg.h>
/*
* PTRACE methods
*/
static int
ptrace_copyin_piod(struct ptrace_io_desc *piod, const void *addr, size_t len)
{
if (len != 0 && sizeof(*piod) != len)
return EINVAL;
return copyin(addr, piod, sizeof(*piod));
}
static int
ptrace_copyout_piod(const struct ptrace_io_desc *piod, void *addr, size_t len)
{
if (len != 0 && sizeof(*piod) != len)
return EINVAL;
return copyout(piod, addr, sizeof(*piod));
}
static int
ptrace_copyin_siginfo(struct ptrace_siginfo *psi, const void *addr, size_t len)
{
if (sizeof(*psi) != len)
return EINVAL;
return copyin(addr, psi, sizeof(*psi));
}
static int
ptrace_copyout_siginfo(const struct ptrace_siginfo *psi, void *addr, size_t len)
{
if (sizeof(*psi) != len)
return EINVAL;
return copyout(psi, addr, sizeof(*psi));
}
static int
ptrace_copyout_lwpstatus(const struct ptrace_lwpstatus *pls, void *addr,
size_t len)
{
return copyout(pls, addr, len);
}
static struct ptrace_methods native_ptm = {
.ptm_copyin_piod = ptrace_copyin_piod,
.ptm_copyout_piod = ptrace_copyout_piod,
.ptm_copyin_siginfo = ptrace_copyin_siginfo,
.ptm_copyout_siginfo = ptrace_copyout_siginfo,
.ptm_copyout_lwpstatus = ptrace_copyout_lwpstatus,
.ptm_doregs = process_doregs,
.ptm_dofpregs = process_dofpregs,
.ptm_dodbregs = process_dodbregs,
};
static const struct syscall_package ptrace_syscalls[] = {
{ SYS_ptrace, 0, (sy_call_t *)sys_ptrace },
{ 0, 0, NULL },
};
/*
* Process debugging system call.
*/
int
sys_ptrace(struct lwp *l, const struct sys_ptrace_args *uap, register_t *retval)
{
/* {
syscallarg(int) req;
syscallarg(pid_t) pid;
syscallarg(void *) addr;
syscallarg(int) data;
} */
return do_ptrace(&native_ptm, l, SCARG(uap, req), SCARG(uap, pid),
SCARG(uap, addr), SCARG(uap, data), retval);
}
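/*
 * Illustrative sketch (userland, hypothetical pid, error handling
 * omitted): a debugger attaches, waits for the resulting stop, inspects
 * the target, and resumes it.
 *
 *	#include <sys/ptrace.h>
 *	#include <sys/wait.h>
 *
 *	ptrace(PT_ATTACH, pid, NULL, 0);
 *	waitpid(pid, &status, 0);
 *	... PT_GETREGS / PT_IO requests as needed ...
 *	ptrace(PT_CONTINUE, pid, (void *)1, 0);
 *	ptrace(PT_DETACH, pid, (void *)1, 0);
 */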
#define DEPS "ptrace_common"
MODULE(MODULE_CLASS_EXEC, ptrace, DEPS);
static int
ptrace_init(void)
{
int error;
error = syscall_establish(&emul_netbsd, ptrace_syscalls);
return error;
}
static int
ptrace_fini(void)
{
int error;
error = syscall_disestablish(&emul_netbsd, ptrace_syscalls);
return error;
}
static int
ptrace_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = ptrace_init();
break;
case MODULE_CMD_FINI:
error = ptrace_fini();
break;
default:
error = ENOTTY;
break;
}
return error;
}
/* $NetBSD: statvfs.h,v 1.5 2024/01/19 18:39:15 christos Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _COMPAT_SYS_STATVFS_H_
#define _COMPAT_SYS_STATVFS_H_
#include <sys/statvfs.h>
struct statvfs90 {
unsigned long f_flag; /* copy of mount exported flags */
unsigned long f_bsize; /* file system block size */
unsigned long f_frsize; /* fundamental file system block size */
unsigned long f_iosize; /* optimal file system block size */
/* The following are in units of f_frsize */
fsblkcnt_t f_blocks; /* number of blocks in file system, */
fsblkcnt_t f_bfree; /* free blocks avail in file system */
fsblkcnt_t f_bavail; /* free blocks avail to non-root */
fsblkcnt_t f_bresvd; /* blocks reserved for root */
fsfilcnt_t f_files; /* total file nodes in file system */
fsfilcnt_t f_ffree; /* free file nodes in file system */
fsfilcnt_t f_favail; /* free file nodes avail to non-root */
fsfilcnt_t f_fresvd; /* file nodes reserved for root */
uint64_t f_syncreads; /* count of sync reads since mount */
uint64_t f_syncwrites; /* count of sync writes since mount */
uint64_t f_asyncreads; /* count of async reads since mount */
uint64_t f_asyncwrites; /* count of async writes since mount */
fsid_t f_fsidx; /* NetBSD compatible fsid */
unsigned long f_fsid; /* Posix compatible fsid */
unsigned long f_namemax; /* maximum filename length */
uid_t f_owner; /* user that mounted the file system */
uint32_t f_spare[4]; /* spare space */
char f_fstypename[_VFS_NAMELEN]; /* fs type name */
char f_mntonname[_VFS_MNAMELEN]; /* directory on which mounted */
char f_mntfromname[_VFS_MNAMELEN]; /* mounted file system */
};
__BEGIN_DECLS
#ifndef _KERNEL
#include <string.h>
#endif
static __inline void
statvfs_to_statvfs90(const struct statvfs *s, struct statvfs90 *s90)
{
memset(s90, 0, sizeof(*s90));
s90->f_flag = s->f_flag;
s90->f_bsize = s->f_bsize;
s90->f_frsize = s->f_frsize;
s90->f_iosize = s->f_iosize;
s90->f_blocks = s->f_blocks;
s90->f_bfree = s->f_bfree;
s90->f_bavail = s->f_bavail;
s90->f_bresvd = s->f_bresvd;
s90->f_files = s->f_files;
s90->f_ffree = s->f_ffree;
s90->f_favail = s->f_favail;
s90->f_fresvd = s->f_fresvd;
s90->f_syncreads = s->f_syncreads;
s90->f_syncwrites = s->f_syncwrites;
s90->f_asyncreads = s->f_asyncreads;
s90->f_asyncwrites = s->f_asyncwrites;
s90->f_fsidx = s->f_fsidx;
s90->f_fsid = s->f_fsid;
s90->f_namemax = s->f_namemax;
s90->f_owner = s->f_owner;
memcpy(s90->f_fstypename, s->f_fstypename, sizeof(s90->f_fstypename));
memcpy(s90->f_mntonname, s->f_mntonname, sizeof(s90->f_mntonname));
memcpy(s90->f_mntfromname, s->f_mntfromname, sizeof(s90->f_mntfromname));
}
#ifdef _KERNEL
static __inline int
statvfs_to_statvfs90_copy(const void *vs, void *vs90, size_t l)
{
struct statvfs90 *s90 = kmem_zalloc(sizeof(*s90), KM_SLEEP);
int error;
statvfs_to_statvfs90(vs, s90);
error = copyout(s90, vs90, sizeof(*s90));
kmem_free(s90, sizeof(*s90));
return error;
}
#else
#ifdef __LIBC12_SOURCE__
int __compat_statvfs(const char *__restrict, struct statvfs90 *__restrict);
int __compat_statvfs1(const char *__restrict, struct statvfs90 *__restrict,
int);
int __compat_fstatvfs(int, struct statvfs90 *);
int __compat_fstatvfs1(int, struct statvfs90 *, int);
int __compat___getmntinfo13(struct statvfs90 **, int);
int __compat___fhstatvfs40(const void *, size_t, struct statvfs90 *);
int __compat___fhstatvfs140(const void *, size_t, struct statvfs90 *, int);
int __compat_getvfsstat(struct statvfs90 *, size_t, int);
int __statvfs90(const char *__restrict, struct statvfs *__restrict);
int __statvfs190(const char *__restrict, struct statvfs *__restrict, int);
int __fstatvfs90(int, struct statvfs *);
int __fstatvfs190(int, struct statvfs *, int);
int __fhstatvfs90(const void *, size_t, struct statvfs *);
int __fhstatvfs190(const void *, size_t, struct statvfs *, int);
int __getvfsstat90(struct statvfs *, size_t, int);
int __getmntinfo90(struct statvfs **, int);
struct compat_30_fhandle;
int fhstatvfs(const struct compat_30_fhandle *, struct statvfs90 *);
int fhstatvfs1(const struct compat_30_fhandle *, struct statvfs90 *, int);
#endif /* __LIBC12_SOURCE__ */
#endif /* _KERNEL */
__END_DECLS
#endif /* !_COMPAT_SYS_STATVFS_H_ */
/* $NetBSD: tmpfs.h,v 1.56 2020/05/17 19:39:15 ad Exp $ */
/*
* Copyright (c) 2005, 2006, 2007, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal, developed as part of Google's Summer of Code
* 2005 program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _FS_TMPFS_TMPFS_H_
#define _FS_TMPFS_TMPFS_H_
#if !defined(_KERNEL) && !defined(_KMEMUSER)
#error "not supposed to be exposed to userland"
#endif
#include <sys/dirent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/vnode.h>
/*
* Internal representation of a tmpfs directory entry.
*
* All fields are protected by vnode lock.
*/
typedef struct tmpfs_dirent {
TAILQ_ENTRY(tmpfs_dirent) td_entries;
/* Pointer to the inode this entry refers to. */
struct tmpfs_node * td_node;
/* Sequence number, see tmpfs_dir_getseq(). */
uint32_t td_seq;
/* Name and its length. */
char * td_name;
uint16_t td_namelen;
} tmpfs_dirent_t;
TAILQ_HEAD(tmpfs_dir, tmpfs_dirent);
/*
* Internal representation of a tmpfs file system node -- inode.
*
* This structure is split in two parts: one holds attributes common
* to all file types and the other holds data that is only applicable to
* a particular type.
*
* All fields are protected by vnode lock. The vnode association itself
* is protected by vcache.
*/
typedef struct tmpfs_node {
LIST_ENTRY(tmpfs_node) tn_entries;
/*
* Each inode has a corresponding vnode. It is a bi-directional
* association. Whenever vnode is allocated, its v_data field is
* set to the inode it references, and tmpfs_node_t::tn_vnode is
* set to point to the said vnode.
*
* Further attempts to allocate a vnode for this same node will
* result in returning a new reference to the value stored in
* tn_vnode. It may be NULL when the node is unused (that is,
* no vnode has been allocated or it has been reclaimed).
*/
vnode_t * tn_vnode;
/* Prevent node from being reclaimed. */
uint32_t tn_holdcount;
/* Directory entry. Only a hint, since a hard link can have multiple. */
tmpfs_dirent_t * tn_dirent_hint;
/* The inode type: VBLK, VCHR, VDIR, VFIFO, VLNK, VREG or VSOCK. */
enum vtype tn_type;
/* Inode identifier and generation number. */
ino_t tn_id;
uint32_t tn_gen;
/* The inode size. */
off_t tn_size;
/* Generic node attributes. */
uid_t tn_uid;
gid_t tn_gid;
mode_t tn_mode;
int tn_flags;
nlink_t tn_links;
unsigned tn_tflags;
struct timespec tn_atime;
struct timespec tn_mtime;
struct timespec tn_ctime;
struct timespec tn_birthtime;
kmutex_t tn_timelock;
/* Head of byte-level lock list (used by tmpfs_advlock). */
struct lockf * tn_lockf;
union {
/* Type case: VBLK or VCHR. */
struct {
dev_t tn_rdev;
} tn_dev;
/* Type case: VDIR. */
struct {
/* Parent directory (root inode points to itself). */
struct tmpfs_node * tn_parent;
/* List of directory entries. */
struct tmpfs_dir tn_dir;
/* Last given sequence number and their arena. */
uint32_t tn_next_seq;
void * tn_seq_arena;
/*
* Pointer of the last directory entry returned
* by the readdir(3) operation.
*/
struct tmpfs_dirent * tn_readdir_lastp;
} tn_dir;
/* Type case: VLNK. */
struct tn_lnk {
/* The link's target. */
char * tn_link;
} tn_lnk;
/* Type case: VREG. */
struct tn_reg {
/* Underlying UVM object to store contents. */
struct uvm_object * tn_aobj;
size_t tn_aobj_pages;
} tn_reg;
} tn_spec;
} tmpfs_node_t;
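/*
 * Illustrative sketch (hypothetical variables, field paths as assumed
 * from the structures above): for a VDIR node the directory entries
 * hang off tn_spec.tn_dir.tn_dir, so a simple name lookup walks the
 * list like this:
 *
 *	tmpfs_dirent_t *de;
 *
 *	KASSERT(dnode->tn_type == VDIR);
 *	TAILQ_FOREACH(de, &dnode->tn_spec.tn_dir.tn_dir, td_entries) {
 *		if (de->td_namelen == namelen &&
 *		    memcmp(de->td_name, name, namelen) == 0)
 *			break;
 *	}
 */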
#if defined(_KERNEL)
VFS_PROTOS(tmpfs);
LIST_HEAD(tmpfs_node_list, tmpfs_node);
#define TMPFS_MAXNAMLEN 255
/* Validate maximum td_namelen length. */
CTASSERT(TMPFS_MAXNAMLEN < UINT16_MAX);
/*
* Reserved values for the virtual entries (the first must be 0) and EOF.
* The start/end of the incremental range, see tmpfs_dir_getseq().
*/
#define TMPFS_DIRSEQ_DOT 0
#define TMPFS_DIRSEQ_DOTDOT 1
#define TMPFS_DIRSEQ_EOF 2
#define TMPFS_DIRSEQ_START 3 /* inclusive */
#define TMPFS_DIRSEQ_END (1U << 30) /* exclusive */
/* Mark to indicate that the number is not set. */
#define TMPFS_DIRSEQ_NONE (1U << 31)
/* Flags: time update requests. */
#define TMPFS_UPDATE_ATIME 0x01
#define TMPFS_UPDATE_MTIME 0x02
#define TMPFS_UPDATE_CTIME 0x04
/*
* Bits indicating whiteout use for the directory.
* We abuse tmpfs_node_t::tn_gen for that.
*/
#define TMPFS_WHITEOUT_BIT (1U << 31)
#define TMPFS_NODE_GEN_MASK (TMPFS_WHITEOUT_BIT - 1)
#define TMPFS_NODE_GEN(node) \
((node)->tn_gen & TMPFS_NODE_GEN_MASK)
/* White-out inode indicator. */
#define TMPFS_NODE_WHITEOUT ((tmpfs_node_t *)-1)
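/*
 * Illustrative sketch (hypothetical variables): tn_gen carries both the
 * generation number and the whiteout flag, so the two are separated with
 * the macros above, e.g.:
 *
 *	dnode->tn_gen |= TMPFS_WHITEOUT_BIT;	(directory now uses whiteouts)
 *
 *	if (fid_gen != TMPFS_NODE_GEN(node))	(stale file handle check)
 *		return ESTALE;
 */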
/*
* Bit indicating this node must be reclaimed when holdcount reaches zero.
* Ored into tmpfs_node_t::tn_holdcount.
*/
#define TMPFS_NODE_RECLAIMED (1U << 30)
/*
* Internal representation of a tmpfs mount point.
*/
typedef struct tmpfs_mount {
/* Limit and number of bytes in use by the file system. */
uint64_t tm_mem_limit;
uint64_t tm_bytes_used;
kmutex_t tm_acc_lock;
/* Pointer to the root inode. */
tmpfs_node_t * tm_root;
/* Maximum number of possible nodes for this file system. */
unsigned int tm_nodes_max;
/* Number of nodes currently allocated. */
unsigned int tm_nodes_cnt;
/* List of inodes and the lock protecting it. */
kmutex_t tm_lock;
struct tmpfs_node_list tm_nodes;
} tmpfs_mount_t;
/*
* This structure maps a file identifier to a tmpfs node. Used by the
* NFS code.
*/
typedef struct tmpfs_fid {
uint16_t tf_len;
uint16_t tf_pad;
uint32_t tf_gen;
ino_t tf_id;
} tmpfs_fid_t;
/*
* Prototypes for tmpfs_subr.c.
*/
void tmpfs_free_node(tmpfs_mount_t *, tmpfs_node_t *);
int tmpfs_construct_node(vnode_t *, vnode_t **, struct vattr *,
struct componentname *, char *);
int tmpfs_alloc_dirent(tmpfs_mount_t *, const char *, uint16_t,
tmpfs_dirent_t **);
void tmpfs_free_dirent(tmpfs_mount_t *, tmpfs_dirent_t *);
void tmpfs_dir_attach(tmpfs_node_t *, tmpfs_dirent_t *, tmpfs_node_t *);
void tmpfs_dir_detach(tmpfs_node_t *, tmpfs_dirent_t *);
tmpfs_dirent_t *tmpfs_dir_lookup(tmpfs_node_t *, struct componentname *);
tmpfs_dirent_t *tmpfs_dir_cached(tmpfs_node_t *);
uint32_t tmpfs_dir_getseq(tmpfs_node_t *, tmpfs_dirent_t *);
tmpfs_dirent_t *tmpfs_dir_lookupbyseq(tmpfs_node_t *, off_t);
int tmpfs_dir_getdents(tmpfs_node_t *, struct uio *, off_t *);
int tmpfs_reg_resize(vnode_t *, off_t);
int tmpfs_chflags(vnode_t *, int, kauth_cred_t, lwp_t *);
int tmpfs_chmod(vnode_t *, mode_t, kauth_cred_t, lwp_t *);
int tmpfs_chown(vnode_t *, uid_t, gid_t, kauth_cred_t, lwp_t *);
int tmpfs_chsize(vnode_t *, u_quad_t, kauth_cred_t, lwp_t *);
int tmpfs_chtimes(vnode_t *, const struct timespec *,
const struct timespec *, const struct timespec *, int,
kauth_cred_t, lwp_t *);
void tmpfs_update(vnode_t *, unsigned);
void tmpfs_update_locked(vnode_t *, unsigned);
void tmpfs_update_lazily(vnode_t *, unsigned);
/*
* Prototypes for tmpfs_mem.c.
*/
void tmpfs_mntmem_init(tmpfs_mount_t *, uint64_t);
void tmpfs_mntmem_destroy(tmpfs_mount_t *);
int tmpfs_mntmem_set(tmpfs_mount_t *, uint64_t);
size_t tmpfs_mem_info(bool);
uint64_t tmpfs_bytes_max(tmpfs_mount_t *);
size_t tmpfs_pages_avail(tmpfs_mount_t *);
bool tmpfs_mem_incr(tmpfs_mount_t *, size_t);
void tmpfs_mem_decr(tmpfs_mount_t *, size_t);
tmpfs_dirent_t *tmpfs_dirent_get(tmpfs_mount_t *);
void tmpfs_dirent_put(tmpfs_mount_t *, tmpfs_dirent_t *);
tmpfs_node_t * tmpfs_node_get(tmpfs_mount_t *);
void tmpfs_node_put(tmpfs_mount_t *, tmpfs_node_t *);
char * tmpfs_strname_alloc(tmpfs_mount_t *, size_t);
void tmpfs_strname_free(tmpfs_mount_t *, char *, size_t);
bool tmpfs_strname_neqlen(struct componentname *, struct componentname *);
/*
* Ensures that the node pointed to by 'node' is a directory and that its
* contents are consistent with respect to directories.
*/
#define TMPFS_VALIDATE_DIR(node) \
KASSERT((node)->tn_vnode == NULL || VOP_ISLOCKED((node)->tn_vnode)); \
KASSERT((node)->tn_type == VDIR); \
KASSERT((node)->tn_size % sizeof(tmpfs_dirent_t) == 0);
/*
* Routines to convert VFS structures to tmpfs internal ones.
*/
static __inline tmpfs_mount_t *
VFS_TO_TMPFS(struct mount *mp)
{
tmpfs_mount_t *tmp = mp->mnt_data;
KASSERT(tmp != NULL);
return tmp;
}
static __inline tmpfs_node_t *
VP_TO_TMPFS_DIR(vnode_t *vp)
{
tmpfs_node_t *node = vp->v_data;
KASSERT(node != NULL);
TMPFS_VALIDATE_DIR(node);
return node;
}
#endif /* defined(_KERNEL) */
static __inline tmpfs_node_t *
VP_TO_TMPFS_NODE(vnode_t *vp)
{
tmpfs_node_t *node = vp->v_data;
#ifdef KASSERT
KASSERT(node != NULL);
#endif
return node;
}
#endif /* _FS_TMPFS_TMPFS_H_ */
/* $NetBSD: subr_cpu.c,v 1.22 2024/03/05 20:59:41 thorpej Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2010, 2012, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c)2007 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* CPU related routines shared with rump.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_cpu.c,v 1.22 2024/03/05 20:59:41 thorpej Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
static void cpu_topology_fake1(struct cpu_info *);
kmutex_t cpu_lock __cacheline_aligned;
int ncpu __read_mostly;
int ncpuonline __read_mostly;
bool mp_online __read_mostly;
static bool cpu_topology_present __read_mostly;
static bool cpu_topology_haveslow __read_mostly;
int64_t cpu_counts[CPU_COUNT_MAX];
/* An array of CPUs. There are ncpu entries. */
struct cpu_info **cpu_infos __read_mostly;
/* Note: set on mi_cpu_attach() and idle_loop(). */
kcpuset_t * kcpuset_attached __read_mostly = NULL;
kcpuset_t * kcpuset_running __read_mostly = NULL;
static char cpu_model[128];
/*
* mi_cpu_init: early initialisation of MI CPU related structures.
*
* Note: may not block and memory allocator is not yet available.
*/
void
mi_cpu_init(void)
{
struct cpu_info *ci;
mutex_init(&cpu_lock, MUTEX_DEFAULT, IPL_NONE);
kcpuset_create(&kcpuset_attached, true);
kcpuset_create(&kcpuset_running, true);
kcpuset_set(kcpuset_running, 0);
ci = curcpu();
cpu_topology_fake1(ci);
}
int
cpu_setmodel(const char *fmt, ...)
{
int len;
va_list ap;
va_start(ap, fmt);
len = vsnprintf(cpu_model, sizeof(cpu_model), fmt, ap);
va_end(ap);
return len;
}
const char *
cpu_getmodel(void)
{
return cpu_model;
}
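/*
 * Illustrative sketch (hypothetical strings): MD attach code records the
 * model string once and consumers read it back later, e.g. for the
 * hw.model sysctl or autoconf messages:
 *
 *	cpu_setmodel("%s %s", vendor, product);
 *	...
 *	aprint_normal(": %s\n", cpu_getmodel());
 */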
bool
cpu_softintr_p(void)
{
return (curlwp->l_pflag & LP_INTR) != 0;
}
bool
curcpu_stable(void)
{
struct lwp *const l = curlwp;
const int pflag = l->l_pflag;
const int nopreempt = l->l_nopreempt;
/*
* - Softints (LP_INTR) never migrate between CPUs.
* - Bound lwps (LP_BOUND), either kthreads created bound to
* a CPU or any lwps bound with curlwp_bind, never migrate.
* - If kpreemption is disabled, the lwp can't migrate.
* - If we're in interrupt context, preemption is blocked.
*
* We combine the LP_INTR, LP_BOUND, and l_nopreempt test into
* a single predicted-true branch so this is cheap to assert in
* most contexts where it will be used, then fall back to
* calling the full kpreempt_disabled() and cpu_intr_p() as
* subroutines.
*
* XXX Is cpu_intr_p redundant with kpreempt_disabled?
*/
return __predict_true(((pflag & (LP_INTR|LP_BOUND)) | nopreempt)
!= 0) ||
kpreempt_disabled() ||
cpu_intr_p();
}
/*
* Collect CPU topology information as each CPU is attached. This can be
* called early during boot, so we need to be careful what we do.
*/
void
cpu_topology_set(struct cpu_info *ci, u_int package_id, u_int core_id,
u_int smt_id, u_int numa_id)
{
enum cpu_rel rel;
cpu_topology_present = true;
ci->ci_package_id = package_id;
ci->ci_core_id = core_id;
ci->ci_smt_id = smt_id;
ci->ci_numa_id = numa_id;
for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) {
ci->ci_sibling[rel] = ci;
ci->ci_nsibling[rel] = 1;
}
}
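/*
 * Illustrative sketch (not from the original sources): MD CPU attach code
 * would typically decode its identifiers from firmware or CPUID-style data
 * and then call, for example:
 *
 *	cpu_topology_set(ci, package_id, core_id, smt_id, numa_id);
 *	cpu_topology_setspeed(ci, is_slow_core);
 *
 * where the id variables are whatever the platform discovered; the names
 * here are placeholders.
 */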
/*
* Collect CPU relative speed
*/
void
cpu_topology_setspeed(struct cpu_info *ci, bool slow)
{
cpu_topology_haveslow |= slow;
ci->ci_is_slow = slow;
}
/*
* Link a CPU into the given circular list.
*/
static void
cpu_topology_link(struct cpu_info *ci, struct cpu_info *ci2, enum cpu_rel rel)
{
struct cpu_info *ci3;
/* Walk to the end of the existing circular list and append. */
for (ci3 = ci2;; ci3 = ci3->ci_sibling[rel]) {
ci3->ci_nsibling[rel]++;
if (ci3->ci_sibling[rel] == ci2) {
break;
}
}
ci->ci_sibling[rel] = ci2;
ci3->ci_sibling[rel] = ci;
ci->ci_nsibling[rel] = ci3->ci_nsibling[rel];
}
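/*
 * Worked example (illustrative, not from the original sources): if the
 * CPUREL_CORE list headed at ci2 currently contains ci2 -> ci4 -> ci2
 * (two members, count 2 each), linking ci6 walks from ci2 to ci4, bumps
 * both counts to 3, and splices ci6 in after ci4, so the list becomes
 * ci2 -> ci4 -> ci6 -> ci2 with ci6 inheriting the sibling count of 3.
 */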
/*
* Print out the topology lists.
*/
static void
cpu_topology_dump(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci, *ci2;
const char *names[] = { "core", "pkg", "1st" };
enum cpu_rel rel;
int i;
CTASSERT(__arraycount(names) >= __arraycount(ci->ci_sibling));
if (ncpu == 1) {
return;
}
for (CPU_INFO_FOREACH(cii, ci)) {
if (cpu_topology_haveslow)
aprint_debug("%s ", ci->ci_is_slow ? "slow" : "fast");
for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) {
aprint_debug("%s has %d %s siblings:", cpu_name(ci),
ci->ci_nsibling[rel], names[rel]);
ci2 = ci->ci_sibling[rel];
i = 0;
do {
aprint_debug(" %s", cpu_name(ci2));
ci2 = ci2->ci_sibling[rel];
} while (++i < 64 && ci2 != ci->ci_sibling[rel]);
if (i == 64) {
aprint_debug(" GAVE UP");
}
aprint_debug("\n");
}
aprint_debug("%s first in package: %s\n", cpu_name(ci),
cpu_name(ci->ci_package1st));
}
}
/*
* Fake up topology info if we have none, or if what we got was bogus.
* Used early in boot, and by cpu_topology_fake().
*/
static void
cpu_topology_fake1(struct cpu_info *ci)
{
enum cpu_rel rel;
for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) {
ci->ci_sibling[rel] = ci;
ci->ci_nsibling[rel] = 1;
}
if (!cpu_topology_present) {
ci->ci_package_id = cpu_index(ci);
}
ci->ci_schedstate.spc_flags |=
(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS);
ci->ci_package1st = ci;
if (!cpu_topology_haveslow) {
ci->ci_is_slow = false;
}
}
/*
* Fake up topology info if we have none, or if what we got was bogus.
* Don't override ci_package_id, etc, if cpu_topology_present is set.
* MD code also uses these.
*/
static void
cpu_topology_fake(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
cpu_topology_fake1(ci);
/* Undo (early boot) flag set so everything links OK. */
ci->ci_schedstate.spc_flags &=
~(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS);
}
}
/*
* Fix up basic CPU topology info. Right now that means attach each CPU to
* circular lists of its siblings in the same core, and in the same package.
*/
void
cpu_topology_init(void)
{
CPU_INFO_ITERATOR cii, cii2;
struct cpu_info *ci, *ci2, *ci3;
u_int minsmt, mincore;
if (!cpu_topology_present) {
cpu_topology_fake();
goto linkit;
}
/* Find siblings in same core and package. */
for (CPU_INFO_FOREACH(cii, ci)) {
ci->ci_schedstate.spc_flags &=
~(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS);
for (CPU_INFO_FOREACH(cii2, ci2)) {
/* Avoid bad things happening. */
if (ci2->ci_package_id == ci->ci_package_id &&
ci2->ci_core_id == ci->ci_core_id &&
ci2->ci_smt_id == ci->ci_smt_id &&
ci2 != ci) {
#ifdef DEBUG
printf("cpu%u %p pkg %u core %u smt %u same as "
"cpu%u %p pkg %u core %u smt %u\n",
cpu_index(ci), ci, ci->ci_package_id,
ci->ci_core_id, ci->ci_smt_id,
cpu_index(ci2), ci2, ci2->ci_package_id,
ci2->ci_core_id, ci2->ci_smt_id);
#endif
printf("cpu_topology_init: info bogus, "
"faking it\n");
cpu_topology_fake();
goto linkit;
}
if (ci2 == ci ||
ci2->ci_package_id != ci->ci_package_id) {
continue;
}
/* Find CPUs in the same core. */
if (ci->ci_nsibling[CPUREL_CORE] == 1 &&
ci->ci_core_id == ci2->ci_core_id) {
cpu_topology_link(ci, ci2, CPUREL_CORE);
}
/* Find CPUs in the same package. */
if (ci->ci_nsibling[CPUREL_PACKAGE] == 1) {
cpu_topology_link(ci, ci2, CPUREL_PACKAGE);
}
if (ci->ci_nsibling[CPUREL_CORE] > 1 &&
ci->ci_nsibling[CPUREL_PACKAGE] > 1) {
break;
}
}
}
linkit:
/* Identify lowest numbered SMT in each core. */
for (CPU_INFO_FOREACH(cii, ci)) {
ci2 = ci3 = ci;
minsmt = ci->ci_smt_id;
do {
if (ci2->ci_smt_id < minsmt) {
ci3 = ci2;
minsmt = ci2->ci_smt_id;
}
ci2 = ci2->ci_sibling[CPUREL_CORE];
} while (ci2 != ci);
ci3->ci_schedstate.spc_flags |= SPCF_CORE1ST;
}
/* Identify lowest numbered SMT in each package. */
ci3 = NULL;
for (CPU_INFO_FOREACH(cii, ci)) {
if ((ci->ci_schedstate.spc_flags & SPCF_CORE1ST) == 0) {
continue;
}
ci2 = ci3 = ci;
mincore = ci->ci_core_id;
do {
if ((ci2->ci_schedstate.spc_flags &
SPCF_CORE1ST) != 0 &&
ci2->ci_core_id < mincore) {
ci3 = ci2;
mincore = ci2->ci_core_id;
}
ci2 = ci2->ci_sibling[CPUREL_PACKAGE];
} while (ci2 != ci);
if ((ci3->ci_schedstate.spc_flags & SPCF_PACKAGE1ST) != 0) {
/* Already identified - nothing more to do. */
continue;
}
ci3->ci_schedstate.spc_flags |= SPCF_PACKAGE1ST;
/* Walk through all CPUs in package and point to first. */
ci2 = ci3;
do {
ci2->ci_package1st = ci3;
ci2->ci_sibling[CPUREL_PACKAGE1ST] = ci3;
ci2 = ci2->ci_sibling[CPUREL_PACKAGE];
} while (ci2 != ci3);
/* Now look for somebody else to link to. */
for (CPU_INFO_FOREACH(cii2, ci2)) {
if ((ci2->ci_schedstate.spc_flags & SPCF_PACKAGE1ST)
!= 0 && ci2 != ci3) {
cpu_topology_link(ci3, ci2, CPUREL_PACKAGE1ST);
break;
}
}
}
/* Walk through all packages, starting with value of ci3 from above. */
KASSERT(ci3 != NULL);
ci = ci3;
do {
/* Walk through CPUs in the package and copy in PACKAGE1ST. */
ci2 = ci;
do {
ci2->ci_sibling[CPUREL_PACKAGE1ST] =
ci->ci_sibling[CPUREL_PACKAGE1ST];
ci2->ci_nsibling[CPUREL_PACKAGE1ST] =
ci->ci_nsibling[CPUREL_PACKAGE1ST];
ci2 = ci2->ci_sibling[CPUREL_PACKAGE];
} while (ci2 != ci);
ci = ci->ci_sibling[CPUREL_PACKAGE1ST];
} while (ci != ci3);
if (cpu_topology_haveslow) {
/*
* For asymmetric systems where some CPUs are slower than
* others, mark first class CPUs for the scheduler. This
* conflicts with SMT right now so whinge if observed.
*/
if (curcpu()->ci_nsibling[CPUREL_CORE] > 1) {
printf("cpu_topology_init: asymmetric & SMT??\n");
}
for (CPU_INFO_FOREACH(cii, ci)) {
if (!ci->ci_is_slow) {
ci->ci_schedstate.spc_flags |= SPCF_1STCLASS;
}
}
} else {
/*
* For any other configuration mark the 1st CPU in each
* core as a first class CPU.
*/
for (CPU_INFO_FOREACH(cii, ci)) {
if ((ci->ci_schedstate.spc_flags & SPCF_CORE1ST) != 0) {
ci->ci_schedstate.spc_flags |= SPCF_1STCLASS;
}
}
}
cpu_topology_dump();
}
/*
* Adjust one count, for a counter that's NOT updated from interrupt
* context. Hardly worth making an inline due to preemption stuff.
*/
void
cpu_count(enum cpu_count idx, int64_t delta)
{
lwp_t *l = curlwp;
KPREEMPT_DISABLE(l);
l->l_cpu->ci_counts[idx] += delta;
KPREEMPT_ENABLE(l);
}
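/*
 * Illustrative usage sketch (not from the original sources): code that is
 * not running in interrupt context can bump a per-CPU counter like so; the
 * counter index is only an example.
 *
 *	cpu_count(CPU_COUNT_FORKS, 1);
 */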
/*
* Fetch fresh sum total for all counts. Expensive - don't call often.
*
* If poll is true, the caller is okay with less recent values (but
* no more than 1/hz seconds old). Where this is called very often, that
* should be the case.
*
* This should be reasonably quick so that any value collected isn't
* totally out of whack, and it can also be called from interrupt context,
* so go to splvm() while summing the counters. It's tempting to use a spin
* mutex here but this routine is called from DDB.
*/
void
cpu_count_sync(bool poll)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
int64_t sum[CPU_COUNT_MAX], *ptr;
static int lasttick;
int curtick, s;
enum cpu_count i;
KASSERT(sizeof(ci->ci_counts) == sizeof(cpu_counts));
if (__predict_false(!mp_online)) {
memcpy(cpu_counts, curcpu()->ci_counts, sizeof(cpu_counts));
return;
}
s = splvm();
curtick = getticks();
if (poll && atomic_load_acquire(&lasttick) == curtick) {
splx(s);
return;
}
memset(sum, 0, sizeof(sum));
curcpu()->ci_counts[CPU_COUNT_SYNC]++;
for (CPU_INFO_FOREACH(cii, ci)) {
ptr = ci->ci_counts;
for (i = 0; i < CPU_COUNT_MAX; i += 8) {
sum[i+0] += ptr[i+0];
sum[i+1] += ptr[i+1];
sum[i+2] += ptr[i+2];
sum[i+3] += ptr[i+3];
sum[i+4] += ptr[i+4];
sum[i+5] += ptr[i+5];
sum[i+6] += ptr[i+6];
sum[i+7] += ptr[i+7];
}
KASSERT(i == CPU_COUNT_MAX);
}
memcpy(cpu_counts, sum, sizeof(cpu_counts));
atomic_store_release(&lasttick, curtick);
splx(s);
}
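/*
 * Illustrative usage sketch (not from the original sources): a reader that
 * wants reasonably fresh totals refreshes the global array and then reads
 * the slot it cares about (the index is only an example).
 *
 *	cpu_count_sync(true);
 *	nforks = cpu_counts[CPU_COUNT_FORKS];
 */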
/* $NetBSD: vfs_syscalls_20.c,v 1.46 2020/06/28 14:37:53 christos Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_syscalls.c 8.42 (Berkeley) 7/31/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_20.c,v 1.46 2020/06/28 14:37:53 christos Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/dirent.h>
#include <sys/sysctl.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/vfs_syscalls.h>
#include <compat/common/compat_mod.h>
#include <compat/sys/mount.h>
#include <compat/sys/statvfs.h>
static const struct syscall_package vfs_syscalls_20_syscalls[] = {
{ SYS_compat_20_fhstatfs, 0, (sy_call_t *)compat_20_sys_fhstatfs },
{ SYS_compat_20_fstatfs, 0, (sy_call_t *)compat_20_sys_fstatfs },
{ SYS_compat_20_getfsstat, 0, (sy_call_t *)compat_20_sys_getfsstat },
{ SYS_compat_20_statfs, 0, (sy_call_t *)compat_20_sys_statfs },
{ 0, 0, NULL }
};
/*
* Get filesystem statistics.
*/
/* ARGSUSED */
int
compat_20_sys_statfs(struct lwp *l, const struct compat_20_sys_statfs_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
syscallarg(struct statfs12 *) buf;
} */
struct mount *mp;
struct statvfs *sbuf;
int error;
struct vnode *vp;
error = namei_simple_user(SCARG(uap, path),
NSM_FOLLOW_TRYEMULROOT, &vp);
if (error != 0)
return error;
mp = vp->v_mount;
sbuf = STATVFSBUF_GET();
if ((error = dostatvfs(mp, sbuf, l, 0, 1)) != 0)
goto done;
error = statvfs_to_statfs12_copy(sbuf, SCARG(uap, buf), 0);
done:
vrele(vp);
STATVFSBUF_PUT(sbuf);
return error;
}
/*
* Get filesystem statistics.
*/
/* ARGSUSED */
int
compat_20_sys_fstatfs(struct lwp *l, const struct compat_20_sys_fstatfs_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(struct statfs12 *) buf;
} */
struct file *fp;
struct mount *mp;
struct statvfs *sbuf;
int error;
/* fd_getvnode() will use the descriptor for us */
if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
return (error);
mp = fp->f_vnode->v_mount;
sbuf = STATVFSBUF_GET();
if ((error = dostatvfs(mp, sbuf, l, 0, 1)) != 0)
goto out;
error = statvfs_to_statfs12_copy(sbuf, SCARG(uap, buf), 0);
out:
fd_putfile(SCARG(uap, fd));
STATVFSBUF_PUT(sbuf);
return error;
}
/*
* Get statistics on all filesystems.
*/
int
compat_20_sys_getfsstat(struct lwp *l, const struct compat_20_sys_getfsstat_args *uap, register_t *retval)
{
/* {
syscallarg(struct statfs12 *) buf;
syscallarg(long) bufsize;
syscallarg(int) flags;
} */
return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
SCARG(uap, flags), statvfs_to_statfs12_copy,
sizeof(struct statfs12), retval);
}
int
compat_20_sys_fhstatfs(struct lwp *l, const struct compat_20_sys_fhstatfs_args *uap, register_t *retval)
{
/* {
syscallarg(const struct compat_30_fhandle *) fhp;
syscallarg(struct statfs12 *) buf;
} */
struct statvfs *sbuf;
struct compat_30_fhandle fh;
struct mount *mp;
struct vnode *vp;
int error;
/*
* Must be super user
*/
if ((error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL)))
return (error);
if ((error = copyin(SCARG(uap, fhp), &fh, sizeof(fh))) != 0)
return (error);
if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
return (ESTALE);
error = VFS_FHTOVP(mp, (struct fid*)&fh.fh_fid, LK_EXCLUSIVE, &vp);
if (error != 0)
return (error);
mp = vp->v_mount;
VOP_UNLOCK(vp);
sbuf = STATVFSBUF_GET();
if ((error = VFS_STATVFS(mp, sbuf)) != 0)
goto out;
error = statvfs_to_statfs12_copy(sbuf, SCARG(uap, buf), 0);
out:
vrele(vp);
STATVFSBUF_PUT(sbuf);
return error;
}
int
vfs_syscalls_20_init(void)
{
return syscall_establish(NULL, vfs_syscalls_20_syscalls);
}
int
vfs_syscalls_20_fini(void)
{
return syscall_disestablish(NULL, vfs_syscalls_20_syscalls);
}
/* $NetBSD: kern_rate.c,v 1.2 2012/12/12 11:10:56 pooka Exp $ */
/*-
* Copyright (c) 2000, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christopher G. Demetriou.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rate.c,v 1.2 2012/12/12 11:10:56 pooka Exp $");
#include <sys/param.h>
#include <sys/time.h>
/*
* ratecheck(): simple time-based rate-limit checking. see ratecheck(9)
* for usage and rationale.
*/
int
ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
{
struct timeval tv, delta;
int rv = 0;
getmicrouptime(&tv);
timersub(&tv, lasttime, &delta);
/*
* The check for 0,0 is so that the message will be seen at least once,
* even if the interval is huge.
*/
if (timercmp(&delta, mininterval, >=) ||
(lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
*lasttime = tv;
rv = 1;
}
return (rv);
}
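/*
 * Illustrative usage sketch (not from the original sources): limit an
 * error message to at most once every 10 seconds.
 *
 *	static struct timeval lasterr;
 *	static const struct timeval errinterval = { 10, 0 };
 *
 *	if (ratecheck(&lasterr, &errinterval))
 *		printf("example: transient device error\n");
 */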
/*
* ppsratecheck(): packets (or events) per second limitation.
*/
int
ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
{
struct timeval tv, delta;
int rv;
getmicrouptime(&tv);
timersub(&tv, lasttime, &delta);
/*
* The check for 0,0 is so that the message will be seen at least once.
* If more than one second has passed since the last update of
* lasttime, reset the counter.
*
* We increment *curpps even in the *curpps < maxpps case, as some
* callers may use *curpps for statistics as well.
*/
if ((lasttime->tv_sec == 0 && lasttime->tv_usec == 0) ||
delta.tv_sec >= 1) {
*lasttime = tv;
*curpps = 0;
}
if (maxpps < 0)
rv = 1;
else if (*curpps < maxpps)
rv = 1;
else
rv = 0;
#if 1 /*DIAGNOSTIC?*/
/* be careful about wrap-around */
if (__predict_true(*curpps != INT_MAX))
*curpps = *curpps + 1;
#else
/*
* Assume that there are not too many calls to this function.
* Not sure if the assumption holds, as it depends on the *caller's*
* behavior, not the behavior of this function.
* IMHO it is wrong to make assumptions about the caller's behavior,
* so the above #if is #if 1, not #ifdef DIAGNOSTIC.
*/
*curpps = *curpps + 1;
#endif
return (rv);
}
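/*
 * Illustrative usage sketch (not from the original sources): log at most
 * 5 events per second, silently counting the rest.
 *
 *	static struct timeval lastlog;
 *	static int curpps;
 *
 *	if (ppsratecheck(&lastlog, &curpps, 5))
 *		printf("example: dropped packet\n");
 */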
/* $NetBSD: sleepq.h,v 1.42 2023/10/15 10:30:00 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_SLEEPQ_H_
#define _SYS_SLEEPQ_H_
#include <sys/param.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/wchan.h>
struct syncobj;
/*
* Generic sleep queues.
*/
typedef struct sleepq sleepq_t;
void sleepq_init(sleepq_t *);
void sleepq_remove(sleepq_t *, lwp_t *, bool);
int sleepq_enter(sleepq_t *, lwp_t *, kmutex_t *);
void sleepq_enqueue(sleepq_t *, wchan_t, const char *,
const struct syncobj *, bool);
void sleepq_transfer(lwp_t *, sleepq_t *, sleepq_t *, wchan_t, const char *,
const struct syncobj *, kmutex_t *, bool);
void sleepq_uncatch(lwp_t *);
void sleepq_unsleep(lwp_t *, bool);
void sleepq_timeout(void *);
void sleepq_wake(sleepq_t *, wchan_t, u_int, kmutex_t *);
int sleepq_abort(kmutex_t *, int);
void sleepq_changepri(lwp_t *, pri_t);
void sleepq_lendpri(lwp_t *, pri_t);
int sleepq_block(int, bool, const struct syncobj *, int);
#ifdef _KERNEL
#include <sys/kernel.h>
typedef union {
kmutex_t lock;
uint8_t pad[COHERENCY_UNIT];
} sleepqlock_t;
/*
* Return non-zero if it is unsafe to sleep.
*
* XXX This only exists because panic() is broken.
*/
static __inline bool
sleepq_dontsleep(lwp_t *l)
{
return cold || (doing_shutdown && (panicstr || CURCPU_IDLE_P()));
}
#endif /* _KERNEL */
#include <sys/sleeptab.h>
#endif /* _SYS_SLEEPQ_H_ */
/* $NetBSD: uvm_pdpolicy.h,v 1.9 2022/08/20 23:26:02 riastradh Exp $ */
/*-
* Copyright (c)2005, 2006 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _UVM_PDPOLICY_H_
#define _UVM_PDPOLICY_H_
#include <sys/mutex.h>
#include <sys/stdint.h>
#include <uvm/uvm_page.h>
struct krwlock;
struct uvm_cpu;
struct vm_anon;
struct vm_page;
/*
* These APIs are for uvm internal use only.
* Don't use them directly from outside of /sys/uvm.
*/
void uvmpdpol_idle(struct uvm_cpu *);
void uvmpdpol_init(void);
void uvmpdpol_init_cpu(struct uvm_cpu *);
void uvmpdpol_reinit(void);
void uvmpdpol_estimatepageable(int *, int *);
bool uvmpdpol_needsscan_p(void);
void uvmpdpol_pageactivate(struct vm_page *);
void uvmpdpol_pagedeactivate(struct vm_page *);
void uvmpdpol_pagedequeue(struct vm_page *);
void uvmpdpol_pageenqueue(struct vm_page *);
bool uvmpdpol_pageactivate_p(struct vm_page *);
bool uvmpdpol_pageisqueued_p(struct vm_page *);
void uvmpdpol_pagerealize(struct vm_page *);
void uvmpdpol_anfree(struct vm_anon *);
void uvmpdpol_tune(void);
void uvmpdpol_scaninit(void);
void uvmpdpol_scanfini(void);
struct vm_page *uvmpdpol_selectvictim(struct krwlock **);
void uvmpdpol_balancequeue(int);
void uvmpdpol_sysctlsetup(void);
/*
* uvmpdpol_set_intent: set an intended state for the page, taking care not
* to overwrite any of the other flags.
*/
static inline void
uvmpdpol_set_intent(struct vm_page *pg, uint32_t i)
{
KASSERT(mutex_owned(&pg->interlock));
pg->pqflags = PQ_INTENT_SET | (pg->pqflags & ~PQ_INTENT_MASK) | i;
}
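/*
 * Illustrative usage sketch (not from the original sources): a caller
 * holding the page interlock records an intent; PQ_INTENT_A is assumed
 * here as an example intent value defined elsewhere in uvm.
 *
 *	mutex_enter(&pg->interlock);
 *	uvmpdpol_set_intent(pg, PQ_INTENT_A);
 *	mutex_exit(&pg->interlock);
 */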
#endif /* !_UVM_PDPOLICY_H_ */
/* $NetBSD: copystr.c,v 1.1 2020/06/30 16:20:02 maxv Exp $ */
/*
* Copyright (c) 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/systm.h>
#include <sys/errno.h>
int
copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done)
{
const char *src = kfaddr;
char *dst = kdaddr;
size_t i;
for (i = 0; i < len; i++) {
if ((*dst++ = *src++) == '\0') {
if (done) *done = i + 1;
return 0;
}
}
if (done)
*done = i;
return ENAMETOOLONG;
}
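/*
 * Illustrative usage sketch (not from the original sources): copy a
 * kernel string into a fixed buffer, detecting truncation. Here
 * some_kernel_string is a placeholder for any NUL-terminated
 * kernel-space string.
 *
 *	char buf[32];
 *	size_t done;
 *
 *	if (copystr(some_kernel_string, buf, sizeof(buf), &done) ==
 *	    ENAMETOOLONG)
 *		printf("name too long (%zu bytes copied)\n", done);
 */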
/* $NetBSD: cons.c,v 1.95 2023/09/02 17:44:59 riastradh Exp $ */
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: cons.c 1.7 92/01/21$
*
* @(#)cons.c 8.2 (Berkeley) 1/12/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cons.c,v 1.95 2023/09/02 17:44:59 riastradh Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/heartbeat.h>
#include <sys/ioctl.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/vnode.h>
#include <dev/cons.h>
#include "nullcons.h"
dev_type_open(cnopen);
dev_type_close(cnclose);
dev_type_read(cnread);
dev_type_write(cnwrite);
dev_type_ioctl(cnioctl);
dev_type_poll(cnpoll);
dev_type_kqfilter(cnkqfilter);
static bool cn_redirect(dev_t *, int, int *, struct tty **);
static void cn_release(struct tty *);
const struct cdevsw cons_cdevsw = {
.d_open = cnopen,
.d_close = cnclose,
.d_read = cnread,
.d_write = cnwrite,
.d_ioctl = cnioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = cnpoll,
.d_mmap = nommap,
.d_kqfilter = cnkqfilter,
.d_discard = nodiscard,
.d_flag = D_TTY|D_MPSAFE,
};
static struct kmutex cn_lock;
struct tty *volatile constty; /* virtual console output device */
struct consdev *cn_tab; /* physical console device info */
struct vnode *cn_devvp[2]; /* vnode for underlying device. */
void
cn_set_tab(struct consdev *tab)
{
/*
* This is a point where we should have KASSERT(cold) or add
* synchronization in case this can happen after cold boot.
* However, cn_tab initialization is so critical to any
* diagnostics or debugging that we need to tread carefully
* about introducing new ways to crash. So let's put the
* assertion in only after we've audited most or all of the
* cn_tab updates.
*/
cn_tab = tab;
}
int
cnopen(dev_t dev, int flag, int mode, struct lwp *l)
{
dev_t cndev;
int unit, error;
unit = minor(dev);
if (unit > 1)
return ENODEV;
mutex_enter(&cn_lock);
if (cn_tab == NULL) {
error = 0;
goto out;
}
/*
* always open the 'real' console device, so we don't get nailed
* later. This follows normal device semantics; they always get
* open() calls.
*/
cndev = cn_tab->cn_dev;
#if NNULLCONS > 0
if (cndev == NODEV) {
nullconsattach(0);
}
#else /* NNULLCONS > 0 */
if (cndev == NODEV) {
/*
* This is most likely an error in the console attach
* code. Panicking looks better than jumping into nowhere
* through cdevsw below....
*/
panic("cnopen: no console device");
}
#endif /* NNULLCONS > 0 */
if (dev == cndev) {
/*
* This causes cnopen() to be called recursively, which
* is generally a bad thing. It is often caused when
* dev == 0 and cn_dev has not been set, but was probably
* initialised to 0.
*/
panic("cnopen: cn_tab->cn_dev == dev");
}
if (cn_devvp[unit] != NULLVP) {
error = 0;
goto out;
}
if ((error = cdevvp(cndev, &cn_devvp[unit])) != 0) {
printf("cnopen: unable to get vnode reference\n");
goto out;
}
vn_lock(cn_devvp[unit], LK_EXCLUSIVE | LK_RETRY);
error = VOP_OPEN(cn_devvp[unit], flag, kauth_cred_get());
VOP_UNLOCK(cn_devvp[unit]);
out: mutex_exit(&cn_lock);
return error;
}
int
cnclose(dev_t dev, int flag, int mode, struct lwp *l)
{
struct vnode *vp;
int unit, error;
unit = minor(dev);
if (unit > 1)
return ENODEV;
mutex_enter(&cn_lock);
if (cn_tab == NULL) {
error = 0;
goto out;
}
vp = cn_devvp[unit];
cn_devvp[unit] = NULL;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_CLOSE(vp, flag, kauth_cred_get());
VOP_UNLOCK(vp);
vrele(vp);
out: mutex_exit(&cn_lock);
return error;
}
int
cnread(dev_t dev, struct uio *uio, int flag)
{
struct tty *ctp = NULL;
int error;
/*
* If we would redirect input, punt. This will keep strange
* things from happening to people who are using the real
* console. Nothing should be using /dev/console for
* input (except a shell in single-user mode, but then,
* one wouldn't TIOCCONS then).
*/
if (!cn_redirect(&dev, 1, &error, &ctp))
return error;
error = cdev_read(dev, uio, flag);
cn_release(ctp);
return error;
}
int
cnwrite(dev_t dev, struct uio *uio, int flag)
{
struct tty *ctp = NULL;
int error;
/* Redirect output, if that's appropriate. */
if (!cn_redirect(&dev, 0, &error, &ctp))
return error;
error = cdev_write(dev, uio, flag);
cn_release(ctp);
return error;
}
int
cnioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
struct tty *ctp = NULL;
int error;
error = 0;
/*
* Superuser can always use this to wrest control of console
* output from the "virtual" console.
*/
if (cmd == TIOCCONS) {
struct tty *tp;
mutex_enter(&constty_lock);
tp = atomic_load_relaxed(&constty);
if (tp == NULL) {
mutex_exit(&constty_lock);
goto passthrough; /* XXX ??? */
}
error = kauth_authorize_device_tty(l->l_cred,
KAUTH_DEVICE_TTY_VIRTUAL, tp);
if (!error)
atomic_store_relaxed(&constty, NULL);
mutex_exit(&constty_lock);
return error;
}
passthrough:
/*
* Redirect the ioctl, if that's appropriate.
* Note that strange things can happen, if a program does
* ioctls on /dev/console, then the console is redirected
* out from under it.
*/
if (!cn_redirect(&dev, 0, &error, &ctp))
return error;
error = cdev_ioctl(dev, cmd, data, flag, l);
cn_release(ctp);
return error;
}
/*ARGSUSED*/
int
cnpoll(dev_t dev, int events, struct lwp *l)
{
struct tty *ctp = NULL;
int error;
/*
* Redirect the poll, if that's appropriate.
* I don't want to think of the possible side effects
* of console redirection here.
*/
if (!cn_redirect(&dev, 0, &error, &ctp))
return POLLHUP;
error = cdev_poll(dev, events, l);
cn_release(ctp);
return error;
}
/*ARGSUSED*/
int
cnkqfilter(dev_t dev, struct knote *kn)
{
struct tty *ctp = NULL;
int error;
/*
* Redirect the kqfilter, if that's appropriate.
* I don't want to think of the possible side effects
* of console redirection here.
*/
if (!cn_redirect(&dev, 0, &error, &ctp))
return error;
error = cdev_kqfilter(dev, kn);
cn_release(ctp);
return error;
}
int
cngetc(void)
{
if (cn_tab == NULL)
return (0);
int s = splhigh();
for (;;) {
const int rv = (*cn_tab->cn_getc)(cn_tab->cn_dev);
if (rv >= 0) {
splx(s);
return rv;
}
docritpollhooks();
}
}
int
cngetsn(char *cp, int size)
{
char *lp;
int c, len;
cnpollc(1);
lp = cp;
len = 0;
for (;;) {
c = cngetc();
switch (c) {
case '\n':
case '\r':
printf("\n");
*lp++ = '\0';
cnpollc(0);
return (len);
case '\b':
case '\177':
case '#':
if (len) {
--len;
--lp;
printf("\b \b");
}
continue;
case '@':
case 'u'&037: /* CTRL-u */
len = 0;
lp = cp;
printf("\n");
continue;
default:
if (len + 1 >= size || c < ' ') {
printf("\007");
continue;
}
printf("%c", c);
++len;
*lp++ = c;
}
}
}
void
cnputc(int c)
{
if (cn_tab == NULL)
return;
/*
* XXX
* for some reason this causes ARCS firmware to output an endless stream of
* whitespaces with n32 kernels, so use the pre-1.74 code for now until I can
* figure out why this happens
*/
#ifndef sgimips
if (c) {
if (c == '\n') {
(*cn_tab->cn_putc)(cn_tab->cn_dev, '\r');
docritpollhooks();
}
(*cn_tab->cn_putc)(cn_tab->cn_dev, c);
}
#else
if (c) {
(*cn_tab->cn_putc)(cn_tab->cn_dev, c);
if (c == '\n') {
docritpollhooks();
(*cn_tab->cn_putc)(cn_tab->cn_dev, '\r');
}
}
#endif
}
void
cnpollc(int on)
{
static int refcount = 0;
if (cn_tab == NULL)
return;
if (!on)
--refcount;
if (refcount == 0) {
if (on) {
/*
* Bind to the current CPU by disabling
* preemption (more convenient than finding a
* place to store a stack to unwind for
* curlwp_bind/bindx, and preemption wouldn't
* happen anyway while spinning at high IPL in
* cngetc) so that curcpu() is stable so that
* we can suspend heartbeat checks for it.
*/
kpreempt_disable();
heartbeat_suspend();
}
(*cn_tab->cn_pollc)(cn_tab->cn_dev, on);
if (!on) {
heartbeat_resume();
kpreempt_enable();
}
}
if (on)
++refcount;
}
void
nullcnpollc(dev_t dev, int on)
{
}
void
cnbell(u_int pitch, u_int period, u_int volume)
{
if (cn_tab == NULL || cn_tab->cn_bell == NULL)
return;
(*cn_tab->cn_bell)(cn_tab->cn_dev, pitch, period, volume);
}
void
cnflush(void)
{
if (cn_tab == NULL || cn_tab->cn_flush == NULL)
return;
(*cn_tab->cn_flush)(cn_tab->cn_dev);
}
void
cnhalt(void)
{
if (cn_tab == NULL || cn_tab->cn_halt == NULL)
return;
(*cn_tab->cn_halt)(cn_tab->cn_dev);
}
/*
* Redirect output, if that's appropriate. If there's no real console,
* return ENXIO.
*/
static bool
cn_redirect(dev_t *devp, int is_read, int *error, struct tty **ctpp)
{
dev_t dev = *devp;
struct tty *ctp;
int s;
bool ok = false;
*error = ENXIO;
*ctpp = NULL;
s = pserialize_read_enter();
if ((ctp = atomic_load_consume(&constty)) != NULL &&
minor(dev) == 0 &&
(cn_tab == NULL || (cn_tab->cn_pri != CN_REMOTE))) {
if (is_read) {
*error = 0;
goto out;
}
tty_acquire(ctp);
*ctpp = ctp;
dev = ctp->t_dev;
} else if (cn_tab == NULL)
goto out;
else
dev = cn_tab->cn_dev;
ok = true;
*devp = dev;
out: pserialize_read_exit(s);
return ok;
}
static void
cn_release(struct tty *ctp)
{
if (ctp == NULL)
return;
tty_release(ctp);
}
MODULE(MODULE_CLASS_DRIVER, cons, NULL);
static int
cons_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
mutex_init(&cn_lock, MUTEX_DEFAULT, IPL_NONE);
return 0;
case MODULE_CMD_FINI:
mutex_destroy(&cn_lock);
return 0;
default:
return ENOTTY;
}
}
/* $NetBSD: kern_mod_80.c,v 1.6 2019/12/12 02:15:42 pgoyette Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* System calls relating to loadable modules.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_mod_80.c,v 1.6 2019/12/12 02:15:42 pgoyette Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_modular.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kobj.h>
#include <sys/module.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/compat_stub.h>
#include <compat/sys/module.h>
#include <compat/common/compat_mod.h>
static int
compat_80_modstat(int cmd, struct iovec *iov, void *arg)
{
omodstat_t *oms, *omso;
modinfo_t *mi;
module_t *mod;
vaddr_t addr;
size_t size;
size_t omslen;
size_t used;
int error;
int omscnt;
bool stataddr;
const char *suffix = "...";
if (cmd != MODCTL_OSTAT)
return EINVAL;
error = copyin(arg, iov, sizeof(*iov));
if (error != 0) {
return error;
}
/* If not privileged, don't expose kernel addresses. */
error = kauth_authorize_system(kauth_cred_get(), KAUTH_SYSTEM_MODULE,
0, (void *)(uintptr_t)MODCTL_STAT, NULL, NULL);
stataddr = (error == 0);
kernconfig_lock();
omscnt = 0;
TAILQ_FOREACH(mod, &module_list, mod_chain) {
omscnt++;
mi = mod->mod_info;
}
TAILQ_FOREACH(mod, &module_builtins, mod_chain) {
omscnt++;
mi = mod->mod_info;
}
omslen = omscnt * sizeof(omodstat_t);
omso = kmem_zalloc(omslen, KM_SLEEP);
oms = omso;
TAILQ_FOREACH(mod, &module_list, mod_chain) {
mi = mod->mod_info;
strlcpy(oms->oms_name, mi->mi_name, sizeof(oms->oms_name));
if (mi->mi_required != NULL) {
used = strlcpy(oms->oms_required, mi->mi_required,
sizeof(oms->oms_required));
if (used >= sizeof(oms->oms_required)) {
oms->oms_required[sizeof(oms->oms_required) -
strlen(suffix) - 1] = '\0';
strlcat(oms->oms_required, suffix,
sizeof(oms->oms_required));
}
}
if (mod->mod_kobj != NULL && stataddr) {
kobj_stat(mod->mod_kobj, &addr, &size);
oms->oms_addr = addr;
oms->oms_size = size;
}
oms->oms_class = mi->mi_class;
oms->oms_refcnt = mod->mod_refcnt;
oms->oms_source = mod->mod_source;
oms->oms_flags = mod->mod_flags;
oms++;
}
TAILQ_FOREACH(mod, &module_builtins, mod_chain) {
mi = mod->mod_info;
strlcpy(oms->oms_name, mi->mi_name, sizeof(oms->oms_name));
if (mi->mi_required != NULL) {
used = strlcpy(oms->oms_required, mi->mi_required,
sizeof(oms->oms_required));
if (used >= sizeof(oms->oms_required)) {
oms->oms_required[sizeof(oms->oms_required) -
strlen(suffix) - 1] = '\0';
strlcat(oms->oms_required, suffix,
sizeof(oms->oms_required));
}
}
if (mod->mod_kobj != NULL && stataddr) {
kobj_stat(mod->mod_kobj, &addr, &size);
oms->oms_addr = addr;
oms->oms_size = size;
}
oms->oms_class = mi->mi_class;
oms->oms_refcnt = -1;
KASSERT(mod->mod_source == MODULE_SOURCE_KERNEL);
oms->oms_source = mod->mod_source;
oms++;
}
kernconfig_unlock();
error = copyout(omso, iov->iov_base, uimin(omslen, iov->iov_len));
kmem_free(omso, omslen);
if (error == 0) {
iov->iov_len = omslen;
error = copyout(iov, arg, sizeof(*iov));
}
return error;
}
void
kern_mod_80_init(void)
{
MODULE_HOOK_SET(compat_modstat_80_hook, compat_80_modstat);
}
void
kern_mod_80_fini(void)
{
MODULE_HOOK_UNSET(compat_modstat_80_hook);
}
/* $NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $ */
/*-
* Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_dtrace.h"
#include "opt_gprof.h"
#include "opt_multiprocessor.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/timex.h>
#include <sys/sched.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/rndsource.h>
#include <sys/heartbeat.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
#include <sys/cpu.h>
cyclic_clock_func_t cyclic_clock_func[MAXCPUS];
#endif
static int sysctl_kern_clockrate(SYSCTLFN_PROTO);
/*
* Clock handling routines.
*
* This code is written to operate with two timers that run independently of
* each other. The main clock, running hz times per second, is used to keep
* track of real time. The second timer handles kernel and user profiling,
* and does resource use estimation. If the second timer is programmable,
* it is randomized to avoid aliasing between the two clocks. For example,
* the randomization prevents an adversary from always giving up the CPU
* just before its quantum expires. Otherwise, it would never accumulate
* CPU ticks. The mean frequency of the second timer is stathz.
*
* If no second timer exists, stathz will be zero; in this case we drive
* profiling and statistics off the main clock. This WILL NOT be accurate;
* do not do it unless absolutely necessary.
*
* The statistics clock may (or may not) be run at a higher rate while
* profiling. This profile clock runs at profhz. We require that profhz
* be an integral multiple of stathz.
*
* If the statistics clock is running fast, it must be divided by the ratio
* profhz/stathz for statistics. (For profiling, every tick counts.)
*/
int stathz;
int profhz;
int profsrc;
int schedhz;
int profprocs;
static int hardclock_ticks;
static int hardscheddiv; /* hard => sched divider (used if schedhz == 0) */
static int psdiv; /* prof => stat divider */
int psratio; /* ratio: prof / stat */
struct clockrnd {
struct krndsource source;
unsigned needed;
};
static struct clockrnd hardclockrnd __aligned(COHERENCY_UNIT);
static struct clockrnd statclockrnd __aligned(COHERENCY_UNIT);
static void
clockrnd_get(size_t needed, void *cookie)
{
struct clockrnd *C = cookie;
/* Start sampling. */
atomic_store_relaxed(&C->needed, 2*NBBY*needed);
}
static void
clockrnd_sample(struct clockrnd *C)
{
struct cpu_info *ci = curcpu();
/* If there's nothing needed right now, stop here. */
if (__predict_true(atomic_load_relaxed(&C->needed) == 0))
return;
/*
* If we're not the primary core of a package, we're probably
* driven by the same clock as the primary core, so don't
* bother.
*/
if (ci != ci->ci_package1st)
return;
/* Take a sample and enter it into the pool. */
rnd_add_uint32(&C->source, 0);
/*
* On the primary CPU, count down. Using an atomic decrement
* here isn't really necessary -- on every platform we care
* about, stores to unsigned int are atomic, and the only other
* memory operation that could happen here is for another CPU
* to store a higher value for needed. But using an atomic
* decrement avoids giving the impression of data races, and is
* unlikely to hurt because only one CPU will ever be writing
* to the location.
*/
if (CPU_IS_PRIMARY(curcpu())) {
unsigned needed __diagused;
needed = atomic_dec_uint_nv(&C->needed);
KASSERT(needed != UINT_MAX);
}
}
static u_int get_intr_timecount(struct timecounter *);
static struct timecounter intr_timecounter = {
.tc_get_timecount = get_intr_timecount,
.tc_poll_pps = NULL,
.tc_counter_mask = ~0u,
.tc_frequency = 0,
.tc_name = "clockinterrupt",
/* quality - minimum implementation level for a clock */
.tc_quality = 0,
.tc_priv = NULL,
};
static u_int
get_intr_timecount(struct timecounter *tc)
{
return (u_int)getticks();
}
int
getticks(void)
{
return atomic_load_relaxed(&hardclock_ticks);
}
/*
* Initialize clock frequencies and start both clocks running.
*/
void
initclocks(void)
{
static struct sysctllog *clog;
int i;
/*
* Set divisors to 1 (normal case) and let the machine-specific
* code do its bit.
*/
psdiv = 1;
/*
* Call cpu_initclocks() before registering the default
* timecounter, in case it needs to adjust hz.
*/
const int old_hz = hz;
cpu_initclocks();
if (old_hz != hz) {
tick = 1000000 / hz;
tickadj = (240000 / (60 * hz)) ? (240000 / (60 * hz)) : 1;
}
/*
* Provide a minimum default time counter; it will only run at
* interrupt resolution.
*/
intr_timecounter.tc_frequency = hz;
tc_init(&intr_timecounter);
/*
* Compute profhz and stathz, fix profhz if needed.
*/
i = stathz ? stathz : hz;
if (profhz == 0)
profhz = i;
psratio = profhz / i;
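/*
 * Worked example (illustrative): with stathz == 0 and hz == 100,
 * i == 100 and profhz defaults to 100, so psratio == 1. With
 * stathz == 128 and profhz == 1024, psratio == 8.
 */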
if (schedhz == 0) {
/* 16Hz is best */
hardscheddiv = hz / 16;
if (hardscheddiv <= 0)
panic("hardscheddiv");
}
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "clockrate",
SYSCTL_DESCR("Kernel clock rates"),
sysctl_kern_clockrate, 0, NULL,
sizeof(struct clockinfo),
CTL_KERN, KERN_CLOCKRATE, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "hardclock_ticks",
SYSCTL_DESCR("Number of hardclock ticks"),
NULL, 0, &hardclock_ticks, sizeof(hardclock_ticks),
CTL_KERN, KERN_HARDCLOCK_TICKS, CTL_EOL);
rndsource_setcb(&hardclockrnd.source, clockrnd_get, &hardclockrnd);
rnd_attach_source(&hardclockrnd.source, "hardclock", RND_TYPE_SKEW,
RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME|RND_FLAG_HASCB);
if (stathz) {
rndsource_setcb(&statclockrnd.source, clockrnd_get,
&statclockrnd);
rnd_attach_source(&statclockrnd.source, "statclock",
RND_TYPE_SKEW,
(RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME|
RND_FLAG_HASCB));
}
}
/*
* The real-time timer, interrupting hz times per second.
*/
void
hardclock(struct clockframe *frame)
{
struct lwp *l;
struct cpu_info *ci;
clockrnd_sample(&hardclockrnd);
ci = curcpu();
l = ci->ci_onproc;
ptimer_tick(l, CLKF_USERMODE(frame));
/*
* If no separate statistics clock is available, run it from here.
*/
if (stathz == 0)
statclock(frame);
/*
* If no separate schedclock is provided, call it here
* at about 16 Hz.
*/
if (schedhz == 0) {
if ((int)(--ci->ci_schedstate.spc_schedticks) <= 0) {
schedclock(l);
ci->ci_schedstate.spc_schedticks = hardscheddiv;
}
}
if ((--ci->ci_schedstate.spc_ticks) <= 0)
sched_tick(ci);
if (CPU_IS_PRIMARY(ci)) {
atomic_store_relaxed(&hardclock_ticks,
atomic_load_relaxed(&hardclock_ticks) + 1);
tc_ticktock();
}
/*
* Make sure the CPUs and timecounter are making progress.
*/
heartbeat();
/*
* Update real-time timeout queue.
*/
callout_hardclock();
}
/*
* Start profiling on a process.
*
* Kernel profiling passes proc0 which never exits and hence
* keeps the profile clock running constantly.
*/
void
startprofclock(struct proc *p)
{
KASSERT(mutex_owned(&p->p_stmutex));
if ((p->p_stflag & PST_PROFIL) == 0) {
p->p_stflag |= PST_PROFIL;
/*
* This is only necessary if using the clock as the
* profiling source.
*/
if (++profprocs == 1 && stathz != 0)
psdiv = psratio;
}
}
/*
* Stop profiling on a process.
*/
void
stopprofclock(struct proc *p)
{
KASSERT(mutex_owned(&p->p_stmutex));
if (p->p_stflag & PST_PROFIL) {
p->p_stflag &= ~PST_PROFIL;
/*
* This is only necessary if using the clock as the
* profiling source.
*/
if (--profprocs == 0 && stathz != 0)
psdiv = 1;
}
}
void
schedclock(struct lwp *l)
{
if ((l->l_flag & LW_IDLE) != 0)
return;
sched_schedclock(l);
}
/*
* Statistics clock. Grab profile sample, and if divider reaches 0,
* do process and kernel statistics.
*/
void
statclock(struct clockframe *frame)
{
#ifdef GPROF
struct gmonparam *g;
intptr_t i;
#endif
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = &ci->ci_schedstate;
struct proc *p;
struct lwp *l;
if (stathz)
clockrnd_sample(&statclockrnd);
/*
* Notice changes in divisor frequency, and adjust clock
* frequency accordingly.
*/
if (spc->spc_psdiv != psdiv) {
spc->spc_psdiv = psdiv;
spc->spc_pscnt = psdiv;
if (psdiv == 1) {
setstatclockrate(stathz);
} else {
setstatclockrate(profhz);
}
}
l = ci->ci_onproc;
if ((l->l_flag & LW_IDLE) != 0) {
/*
* don't account idle lwps as swapper.
*/
p = NULL;
} else {
p = l->l_proc;
mutex_spin_enter(&p->p_stmutex);
}
if (CLKF_USERMODE(frame)) {
KASSERT(p != NULL);
if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
addupc_intr(l, CLKF_PC(frame));
if (--spc->spc_pscnt > 0) {
mutex_spin_exit(&p->p_stmutex);
return;
}
/*
* Came from user mode; CPU was in user state.
* If this process is being profiled record the tick.
*/
p->p_uticks++;
if (p->p_nice > NZERO)
spc->spc_cp_time[CP_NICE]++;
else
spc->spc_cp_time[CP_USER]++;
} else {
#ifdef GPROF
/*
* Kernel statistics are just like addupc_intr, only easier.
*/
#if defined(MULTIPROCESSOR) && !defined(_RUMPKERNEL)
g = curcpu()->ci_gmon;
if (g != NULL &&
profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
#else
g = &_gmonparam;
if (profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
#endif
i = CLKF_PC(frame) - g->lowpc;
if (i < g->textsize) {
i /= HISTFRACTION * sizeof(*g->kcount);
g->kcount[i]++;
}
}
#endif
#ifdef LWP_PC
if (p != NULL && profsrc == PROFSRC_CLOCK &&
(p->p_stflag & PST_PROFIL)) {
addupc_intr(l, LWP_PC(l));
}
#endif
if (--spc->spc_pscnt > 0) {
if (p != NULL)
mutex_spin_exit(&p->p_stmutex);
return;
}
/*
* Came from kernel mode, so we were:
* - handling an interrupt,
* - doing syscall or trap work on behalf of the current
* user process, or
* - spinning in the idle loop.
* Whichever it is, charge the time as appropriate.
* Note that we charge interrupts to the current process,
* regardless of whether they are ``for'' that process,
* so that we know how much of its real time was spent
* in ``non-process'' (i.e., interrupt) work.
*/
if (CLKF_INTR(frame) || (curlwp->l_pflag & LP_INTR) != 0) {
if (p != NULL) {
p->p_iticks++;
}
spc->spc_cp_time[CP_INTR]++;
} else if (p != NULL) {
p->p_sticks++;
spc->spc_cp_time[CP_SYS]++;
} else {
spc->spc_cp_time[CP_IDLE]++;
}
}
spc->spc_pscnt = psdiv;
if (p != NULL) {
atomic_inc_uint(&l->l_cpticks);
mutex_spin_exit(&p->p_stmutex);
}
#ifdef KDTRACE_HOOKS
cyclic_clock_func_t func = cyclic_clock_func[cpu_index(ci)];
if (func) {
(*func)((struct clockframe *)frame);
}
#endif
}
/*
* sysctl helper routine for kern.clockrate. Assembles a struct on
* the fly to be returned to the caller.
*/
static int
sysctl_kern_clockrate(SYSCTLFN_ARGS)
{
struct clockinfo clkinfo;
struct sysctlnode node;
clkinfo.tick = tick;
clkinfo.tickadj = tickadj;
clkinfo.hz = hz;
clkinfo.profhz = profhz;
clkinfo.stathz = stathz ? stathz : hz;
node = *rnode;
node.sysctl_data = &clkinfo;
return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
/* $NetBSD: tmpfs_subr.c,v 1.117 2023/04/29 08:15:13 riastradh Exp $ */
/*
* Copyright (c) 2005-2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal, developed as part of Google's Summer of Code
* 2005 program, and by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Efficient memory file system: interfaces for inode and directory entry
* construction, destruction and manipulation.
*
* Reference counting
*
* The link count of an inode (tmpfs_node_t::tn_links) is used as a
* reference counter. However, it has slightly different semantics.
*
* For directories, the link count represents the directory entries
* which refer to the directory, i.e. the count of sub-directories.
* It also takes into account the virtual '.' entry (which has no
* real entry in the list). For files, the link count represents the
* number of hard links. Since only empty directories can be removed,
* the link count is sufficient for the reference counting needs.
* Note: to check whether a directory is empty, the inode size
* (tmpfs_node_t::tn_size) can be used.
*
* The inode itself, as an object, gathers its first reference when a
* directory entry is attached via tmpfs_dir_attach(9). For instance,
* after a regular tmpfs_create(), a file would have a link count of 1,
* while a directory after tmpfs_mkdir() would have 2 (due to '.').
*
* Reclamation
*
* tmpfs inodes rely on a combination of vnode reference counting and
* link counting. That is, an inode can only be destroyed if its
* associated vnode is inactive. The destruction is done on vnode
* reclamation, i.e. tmpfs_reclaim(). Note that tmpfs_node_t::tn_links
* being 0 is a destruction criterion.
*
* If an inode has references within the file system (tn_links > 0) and
* its inactive vnode gets reclaimed/recycled, then the association is
* broken in tmpfs_reclaim(). In such a case, the inode will always pass
* tmpfs_lookup() and thus vcache_get() to associate a new vnode.
*
* Lock order
*
* vnode_t::v_vlock ->
* vnode_t::v_interlock
*/
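/*
 * Illustrative worked example of the rules above (not part of the original
 * source).  Take an existing directory "D" whose link count is N:
 *
 *	mkdir D/a	->	a->tn_links == 2	('.' plus the entry in D)
 *				D->tn_links == N + 1	(a's '..' reference)
 *	create D/f	->	f->tn_links == 1	(a single hard link)
 *				D->tn_links unchanged
 *	link D/f D/g	->	f->tn_links == 2	(second hard link)
 */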
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.117 2023/04/29 08:15:13 riastradh Exp $");
#include <sys/param.h>
#include <sys/cprng.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/kmem.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <uvm/uvm_aobj.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
#include <fs/tmpfs/tmpfs.h>
#include <fs/tmpfs/tmpfs_fifoops.h>
#include <fs/tmpfs/tmpfs_specops.h>
#include <fs/tmpfs/tmpfs_vnops.h>
static void tmpfs_dir_putseq(tmpfs_node_t *, tmpfs_dirent_t *);
/*
* Initialize vnode with tmpfs node.
*/
static void
tmpfs_init_vnode(struct vnode *vp, tmpfs_node_t *node)
{
krwlock_t *slock;
KASSERT(node->tn_vnode == NULL);
/* Share the interlock with the node. */
	if (node->tn_type == VREG) {
		slock = node->tn_spec.tn_reg.tn_aobj->vmobjlock;
rw_obj_hold(slock);
uvm_obj_setlock(&vp->v_uobj, slock);
}
vp->v_tag = VT_TMPFS;
vp->v_type = node->tn_type;
/* Type-specific initialization. */
switch (vp->v_type) {
case VBLK:
case VCHR:
vp->v_op = tmpfs_specop_p;
spec_node_init(vp, node->tn_spec.tn_dev.tn_rdev);
break;
case VFIFO:
vp->v_op = tmpfs_fifoop_p;
break;
case VDIR:
		if (node->tn_spec.tn_dir.tn_parent == node)
			vp->v_vflag |= VV_ROOT;
/* FALLTHROUGH */
case VLNK:
case VREG:
case VSOCK:
vp->v_op = tmpfs_vnodeop_p;
break;
default:
panic("bad node type %d", vp->v_type);
break;
}
vp->v_data = node;
node->tn_vnode = vp;
uvm_vnp_setsize(vp, node->tn_size);
KASSERT(node->tn_mode != VNOVAL);
cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid, true);
}
/*
* tmpfs_loadvnode: initialise a vnode for a specified inode.
*/
int
tmpfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
tmpfs_node_t *node;
KASSERT(key_len == sizeof(node));
memcpy(&node, key, key_len);
if (node->tn_links == 0)
return ENOENT;
tmpfs_init_vnode(vp, node);
*new_key = &vp->v_data;
return 0;
}
/*
* tmpfs_newvnode: allocate a new inode of a specified type and
* attach the vnode.
*/
int
tmpfs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp,
struct vattr *vap, kauth_cred_t cred, void *extra,
size_t *key_len, const void **new_key)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(mp);
tmpfs_node_t *node, *dnode;
	if (dvp != NULL) {
		KASSERT(VOP_ISLOCKED(dvp));
		dnode = VP_TO_TMPFS_DIR(dvp);
		if (dnode->tn_links == 0)
return ENOENT;
if (vap->va_type == VDIR) {
/* Check for maximum links limit. */
if (dnode->tn_links == LINK_MAX)
return EMLINK;
KASSERT(dnode->tn_links < LINK_MAX);
}
} else
dnode = NULL;
node = tmpfs_node_get(tmp);
if (node == NULL)
return ENOSPC;
/* Initially, no references and no associations. */
node->tn_links = 0;
node->tn_vnode = NULL;
node->tn_holdcount = 0;
node->tn_dirent_hint = NULL;
/*
* XXX Where the pool is backed by a map larger than (4GB *
* sizeof(*node)), this may produce duplicate inode numbers
* for applications that do not understand 64-bit ino_t.
*/
node->tn_id = (ino_t)((uintptr_t)node / sizeof(*node));
/*
* Make sure the generation number is not zero.
* tmpfs_inactive() uses generation zero to mark dead nodes.
*/
do {
node->tn_gen = TMPFS_NODE_GEN_MASK & cprng_fast32();
} while (node->tn_gen == 0);
/* Generic initialization. */
KASSERT((int)vap->va_type != VNOVAL);
node->tn_type = vap->va_type;
node->tn_size = 0;
node->tn_flags = 0;
node->tn_lockf = NULL;
node->tn_tflags = 0;
vfs_timestamp(&node->tn_atime);
node->tn_birthtime = node->tn_atime;
node->tn_ctime = node->tn_atime;
node->tn_mtime = node->tn_atime;
mutex_init(&node->tn_timelock, MUTEX_DEFAULT, IPL_NONE);
if (dvp == NULL) {
KASSERT(vap->va_uid != VNOVAL && vap->va_gid != VNOVAL);
node->tn_uid = vap->va_uid;
node->tn_gid = vap->va_gid;
vp->v_vflag |= VV_ROOT;
} else {
KASSERT(dnode != NULL);
node->tn_uid = kauth_cred_geteuid(cred);
node->tn_gid = dnode->tn_gid;
}
KASSERT(vap->va_mode != VNOVAL);
node->tn_mode = vap->va_mode;
/* Type-specific initialization. */
switch (node->tn_type) {
case VBLK:
case VCHR:
/* Character/block special device. */
KASSERT(vap->va_rdev != VNOVAL);
node->tn_spec.tn_dev.tn_rdev = vap->va_rdev;
break;
case VDIR:
/* Directory. */
TAILQ_INIT(&node->tn_spec.tn_dir.tn_dir);
node->tn_spec.tn_dir.tn_parent = NULL;
node->tn_spec.tn_dir.tn_seq_arena = NULL;
node->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
node->tn_spec.tn_dir.tn_readdir_lastp = NULL;
/* Extra link count for the virtual '.' entry. */
node->tn_links++;
break;
case VFIFO:
case VSOCK:
break;
case VLNK:
node->tn_size = 0;
node->tn_spec.tn_lnk.tn_link = NULL;
break;
case VREG:
/* Regular file. Create an underlying UVM object. */
node->tn_spec.tn_reg.tn_aobj =
uao_create(INT64_MAX - PAGE_SIZE, 0);
node->tn_spec.tn_reg.tn_aobj_pages = 0;
break;
default:
panic("bad node type %d", vp->v_type);
break;
}
tmpfs_init_vnode(vp, node);
mutex_enter(&tmp->tm_lock);
LIST_INSERT_HEAD(&tmp->tm_nodes, node, tn_entries);
mutex_exit(&tmp->tm_lock);
*key_len = sizeof(vp->v_data);
*new_key = &vp->v_data;
return 0;
}
/*
* tmpfs_free_node: remove the inode from a list in the mount point and
* destroy the inode structures.
*/
void
tmpfs_free_node(tmpfs_mount_t *tmp, tmpfs_node_t *node)
{
size_t objsz;
uint32_t hold;
mutex_enter(&tmp->tm_lock);
hold = atomic_or_32_nv(&node->tn_holdcount, TMPFS_NODE_RECLAIMED);
/* Defer destruction to last thread holding this node. */
if (hold != TMPFS_NODE_RECLAIMED) {
mutex_exit(&tmp->tm_lock);
return;
}
LIST_REMOVE(node, tn_entries);
mutex_exit(&tmp->tm_lock);
switch (node->tn_type) {
case VLNK:
if (node->tn_size > 0) {
tmpfs_strname_free(tmp, node->tn_spec.tn_lnk.tn_link,
node->tn_size);
}
break;
case VREG:
/*
* Calculate the size of inode data, decrease the used-memory
* counter, and destroy the underlying UVM object (if any).
*/
objsz = PAGE_SIZE * node->tn_spec.tn_reg.tn_aobj_pages;
if (objsz != 0) {
tmpfs_mem_decr(tmp, objsz);
}
if (node->tn_spec.tn_reg.tn_aobj != NULL) {
uao_detach(node->tn_spec.tn_reg.tn_aobj);
}
break;
case VDIR:
KASSERT(node->tn_size == 0);
KASSERT(node->tn_spec.tn_dir.tn_seq_arena == NULL);
KASSERT(TAILQ_EMPTY(&node->tn_spec.tn_dir.tn_dir));
KASSERT(node->tn_spec.tn_dir.tn_parent == NULL ||
node == tmp->tm_root);
break;
default:
break;
}
KASSERT(node->tn_vnode == NULL);
KASSERT(node->tn_links == 0);
mutex_destroy(&node->tn_timelock);
tmpfs_node_put(tmp, node);
}
/*
* tmpfs_construct_node: allocate a new file of the specified type and
* add it to the parent directory.
*
* => Credentials of the caller are used.
*/
int
tmpfs_construct_node(vnode_t *dvp, vnode_t **vpp, struct vattr *vap,
struct componentname *cnp, char *target)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(dvp->v_mount);
	tmpfs_node_t *dnode = VP_TO_TMPFS_DIR(dvp), *node;
tmpfs_dirent_t *de, *wde;
char *slink = NULL;
int ssize = 0;
int error;
/* Allocate symlink target. */
	if (target != NULL) {
		KASSERT(vap->va_type == VLNK);
ssize = strlen(target);
		KASSERT(ssize < MAXPATHLEN);
		if (ssize > 0) {
slink = tmpfs_strname_alloc(tmp, ssize);
if (slink == NULL)
return ENOSPC;
memcpy(slink, target, ssize);
}
}
/* Allocate a directory entry that points to the new file. */
error = tmpfs_alloc_dirent(tmp, cnp->cn_nameptr, cnp->cn_namelen, &de);
if (error) {
		if (slink != NULL)
			tmpfs_strname_free(tmp, slink, ssize);
return error;
}
/* Allocate a vnode that represents the new file. */
error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, vpp);
if (error) {
		if (slink != NULL)
			tmpfs_strname_free(tmp, slink, ssize);
		tmpfs_free_dirent(tmp, de);
return error;
}
error = vn_lock(*vpp, LK_EXCLUSIVE);
if (error) {
vrele(*vpp);
*vpp = NULL;
		if (slink != NULL)
			tmpfs_strname_free(tmp, slink, ssize);
		tmpfs_free_dirent(tmp, de);
return error;
}
	node = VP_TO_TMPFS_NODE(*vpp);
	if (slink != NULL) {
		node->tn_spec.tn_lnk.tn_link = slink;
node->tn_size = ssize;
}
/* Remove whiteout before adding the new entry. */
if (cnp->cn_flags & ISWHITEOUT) {
wde = tmpfs_dir_lookup(dnode, cnp);
KASSERT(wde != NULL && wde->td_node == TMPFS_NODE_WHITEOUT);
tmpfs_dir_detach(dnode, wde);
tmpfs_free_dirent(tmp, wde);
}
/* Associate inode and attach the entry into the directory. */
tmpfs_dir_attach(dnode, de, node);
/* Make node opaque if requested. */
	if (cnp->cn_flags & ISWHITEOUT)
		node->tn_flags |= UF_OPAQUE;
/* Update the parent's timestamps. */
tmpfs_update(dvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME);
VOP_UNLOCK(*vpp);
cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags);
return 0;
}
/*
* tmpfs_alloc_dirent: allocates a new directory entry for the inode.
* The directory entry contains a path name component.
*/
int
tmpfs_alloc_dirent(tmpfs_mount_t *tmp, const char *name, uint16_t len,
tmpfs_dirent_t **de)
{
tmpfs_dirent_t *nde;
nde = tmpfs_dirent_get(tmp);
if (nde == NULL)
return ENOSPC;
nde->td_name = tmpfs_strname_alloc(tmp, len);
if (nde->td_name == NULL) {
tmpfs_dirent_put(tmp, nde);
return ENOSPC;
}
nde->td_namelen = len;
memcpy(nde->td_name, name, len);
nde->td_seq = TMPFS_DIRSEQ_NONE;
nde->td_node = NULL; /* for asserts */
*de = nde;
return 0;
}
/*
* tmpfs_free_dirent: free a directory entry.
*/
void
tmpfs_free_dirent(tmpfs_mount_t *tmp, tmpfs_dirent_t *de)
{
	KASSERT(de->td_node == NULL);
	KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
tmpfs_strname_free(tmp, de->td_name, de->td_namelen);
tmpfs_dirent_put(tmp, de);
}
/*
* tmpfs_dir_attach: associate directory entry with a specified inode,
* and attach the entry into the directory, specified by vnode.
*
* => Increases link count on the associated node.
* => Increases link count on directory node if our node is VDIR.
* => It is caller's responsibility to check for the LINK_MAX limit.
* => Triggers kqueue events here.
*/
void
tmpfs_dir_attach(tmpfs_node_t *dnode, tmpfs_dirent_t *de, tmpfs_node_t *node)
{
vnode_t *dvp = dnode->tn_vnode;
int events = NOTE_WRITE;
	KASSERT(dvp != NULL);
	KASSERT(VOP_ISLOCKED(dvp));
/* Get a new sequence number. */
KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
de->td_seq = tmpfs_dir_getseq(dnode, de);
/* Associate directory entry and the inode. */
de->td_node = node;
if (node != TMPFS_NODE_WHITEOUT) {
KASSERT(node->tn_links < LINK_MAX);
node->tn_links++;
/* Save the hint (might overwrite). */
node->tn_dirent_hint = de;
} else if ((dnode->tn_gen & TMPFS_WHITEOUT_BIT) == 0) {
/* Flag that there are whiteout entries. */
atomic_or_32(&dnode->tn_gen, TMPFS_WHITEOUT_BIT);
}
/* Insert the entry to the directory (parent of inode). */
	TAILQ_INSERT_TAIL(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
	KASSERT(dnode->tn_size <= __type_max(off_t) - sizeof(tmpfs_dirent_t));
dnode->tn_size += sizeof(tmpfs_dirent_t);
uvm_vnp_setsize(dvp, dnode->tn_size);
if (node != TMPFS_NODE_WHITEOUT && node->tn_type == VDIR) {
/* Set parent. */
KASSERT(node->tn_spec.tn_dir.tn_parent == NULL);
node->tn_spec.tn_dir.tn_parent = dnode;
/* Increase the link count of parent. */
KASSERT(dnode->tn_links < LINK_MAX);
dnode->tn_links++;
events |= NOTE_LINK;
TMPFS_VALIDATE_DIR(node);
}
}
/*
* tmpfs_dir_detach: disassociate directory entry and its inode,
* and detach the entry from the directory, specified by vnode.
*
* => Decreases link count on the associated node.
* => Decreases the link count on directory node, if our node is VDIR.
* => Triggers kqueue events here.
*
* => Note: dvp and vp may be NULL only if called by tmpfs_unmount().
*/
void
tmpfs_dir_detach(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
tmpfs_node_t *node = de->td_node;
vnode_t *dvp = dnode->tn_vnode;
KASSERT(dvp == NULL || VOP_ISLOCKED(dvp));
if (__predict_true(node != TMPFS_NODE_WHITEOUT)) {
/* Deassociate the inode and entry. */
node->tn_dirent_hint = NULL;
KASSERT(node->tn_links > 0);
node->tn_links--;
/* If directory - decrease the link count of parent. */
if (node->tn_type == VDIR) {
KASSERT(node->tn_spec.tn_dir.tn_parent == dnode);
node->tn_spec.tn_dir.tn_parent = NULL;
KASSERT(dnode->tn_links > 0);
dnode->tn_links--;
}
}
de->td_node = NULL;
/* Remove the entry from the directory. */
if (dnode->tn_spec.tn_dir.tn_readdir_lastp == de) {
dnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;
}
TAILQ_REMOVE(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
KASSERT(dnode->tn_size >= sizeof(tmpfs_dirent_t));
dnode->tn_size -= sizeof(tmpfs_dirent_t);
tmpfs_dir_putseq(dnode, de);
if (dvp) {
uvm_vnp_setsize(dvp, dnode->tn_size);
}
}
/*
* tmpfs_dir_lookup: find a directory entry in the specified inode.
*
* Note that the . and .. components are not allowed as they do not
* physically exist within directories.
*/
tmpfs_dirent_t *
tmpfs_dir_lookup(tmpfs_node_t *node, struct componentname *cnp)
{
const char *name = cnp->cn_nameptr;
const uint16_t nlen = cnp->cn_namelen;
tmpfs_dirent_t *de;
	KASSERT(VOP_ISLOCKED(node->tn_vnode));
	KASSERT(nlen != 1 || !(name[0] == '.'));
	KASSERT(nlen != 2 || !(name[0] == '.' && name[1] == '.'));
	TMPFS_VALIDATE_DIR(node);
	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
		if (de->td_namelen != nlen)
continue;
if (memcmp(de->td_name, name, nlen) != 0)
continue;
break;
}
return de;
}
/*
* tmpfs_dir_cached: get a cached directory entry if it is valid. Used to
* avoid unnecessary tmpfs_dir_lookup().
*
* => The vnode must be locked.
*/
tmpfs_dirent_t *
tmpfs_dir_cached(tmpfs_node_t *node)
{
tmpfs_dirent_t *de = node->tn_dirent_hint;
KASSERT(VOP_ISLOCKED(node->tn_vnode));
if (de == NULL) {
return NULL;
}
KASSERT(de->td_node == node);
/*
* Directories always have a valid hint. For files, check if there
* are any hard links. If there are, the hint might be invalid.
*/
return (node->tn_type != VDIR && node->tn_links > 1) ? NULL : de;
}
/*
* tmpfs_dir_getseq: get a per-directory sequence number for the entry.
*
* => Shall not be larger than 2^31 for linux32 compatibility.
*/
uint32_t
tmpfs_dir_getseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
uint32_t seq = de->td_seq;
vmem_t *seq_arena;
vmem_addr_t off;
int error __diagused;
TMPFS_VALIDATE_DIR(dnode);
if (__predict_true(seq != TMPFS_DIRSEQ_NONE)) {
/* Already set. */
KASSERT(seq >= TMPFS_DIRSEQ_START);
return seq;
}
/*
* The "." and ".." and the end-of-directory have reserved numbers.
* The other sequence numbers are allocated as follows:
*
* - The first half of the 2^31 is assigned incrementally.
*
* - If that range is exceeded, then the second half of 2^31
* is used, but managed by vmem(9).
*/
seq = dnode->tn_spec.tn_dir.tn_next_seq;
	KASSERT(seq >= TMPFS_DIRSEQ_START);
	if (__predict_true(seq < TMPFS_DIRSEQ_END)) {
/* First half: just increment and return. */
dnode->tn_spec.tn_dir.tn_next_seq++;
return seq;
}
/*
* First half exceeded, use the second half. May need to create
* vmem(9) arena for the directory first.
*/
if ((seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena) == NULL) {
seq_arena = vmem_create("tmpfscoo", 0,
TMPFS_DIRSEQ_END - 1, 1, NULL, NULL, NULL, 0,
VM_SLEEP, IPL_NONE);
dnode->tn_spec.tn_dir.tn_seq_arena = seq_arena;
KASSERT(seq_arena != NULL);
}
error = vmem_alloc(seq_arena, 1, VM_SLEEP | VM_BESTFIT, &off);
	KASSERT(error == 0);
	KASSERT(off < TMPFS_DIRSEQ_END);
seq = off | TMPFS_DIRSEQ_END;
return seq;
}
static void
tmpfs_dir_putseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
vmem_t *seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena;
uint32_t seq = de->td_seq;
TMPFS_VALIDATE_DIR(dnode);
if (seq == TMPFS_DIRSEQ_NONE || seq < TMPFS_DIRSEQ_END) {
/* First half (or no sequence number set yet). */
KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
} else {
/* Second half. */
KASSERT(seq_arena != NULL);
KASSERT(seq >= TMPFS_DIRSEQ_END);
seq &= ~TMPFS_DIRSEQ_END;
vmem_free(seq_arena, seq, 1);
}
de->td_seq = TMPFS_DIRSEQ_NONE;
/* Empty? We can reset. */
if (seq_arena && dnode->tn_size == 0) {
dnode->tn_spec.tn_dir.tn_seq_arena = NULL;
dnode->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
vmem_destroy(seq_arena);
}
}
/*
* tmpfs_dir_lookupbyseq: lookup a directory entry by the sequence number.
*/
tmpfs_dirent_t *
tmpfs_dir_lookupbyseq(tmpfs_node_t *node, off_t seq)
{
tmpfs_dirent_t *de = node->tn_spec.tn_dir.tn_readdir_lastp;
TMPFS_VALIDATE_DIR(node);
/*
* First, check the cache. If it does not match, perform a lookup.
*/
	if (de && de->td_seq == seq) {
		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
		KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
return de;
}
	TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
		KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
		KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
		if (de->td_seq == seq)
return de;
}
return NULL;
}
/*
* tmpfs_dir_getdotents: helper function for tmpfs_readdir() to get the
* dot meta entries, that is, "." or "..", and copy the entry to the UIO space.
*/
static int
tmpfs_dir_getdotents(tmpfs_node_t *node, struct dirent *dp, struct uio *uio)
{
tmpfs_dirent_t *de;
off_t next = 0;
int error;
switch (uio->uio_offset) {
case TMPFS_DIRSEQ_DOT:
dp->d_fileno = node->tn_id;
strlcpy(dp->d_name, ".", sizeof(dp->d_name));
next = TMPFS_DIRSEQ_DOTDOT;
break;
case TMPFS_DIRSEQ_DOTDOT:
dp->d_fileno = node->tn_spec.tn_dir.tn_parent->tn_id;
strlcpy(dp->d_name, "..", sizeof(dp->d_name));
de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir);
next = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
break;
default:
KASSERT(false);
}
dp->d_type = DT_DIR;
dp->d_namlen = strlen(dp->d_name);
dp->d_reclen = _DIRENT_SIZE(dp);
if (dp->d_reclen > uio->uio_resid) {
return EJUSTRETURN;
}
if ((error = uiomove(dp, dp->d_reclen, uio)) != 0) {
return error;
}
uio->uio_offset = next;
return error;
}
/*
* tmpfs_dir_getdents: helper function for tmpfs_readdir.
*
* => Returns as many directory entries as can fit in the uio space.
* => The read starts at uio->uio_offset.
*/
int
tmpfs_dir_getdents(tmpfs_node_t *node, struct uio *uio, off_t *cntp)
{
tmpfs_dirent_t *de;
struct dirent dent;
int error = 0;
	KASSERT(VOP_ISLOCKED(node->tn_vnode));
	TMPFS_VALIDATE_DIR(node);
/*
* First check for the "." and ".." cases.
* Note: tmpfs_dir_getdotents() will "seek" for us.
*/
memset(&dent, 0, sizeof(dent));
	if (uio->uio_offset == TMPFS_DIRSEQ_DOT) {
		if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
goto done;
}
(*cntp)++;
}
	if (uio->uio_offset == TMPFS_DIRSEQ_DOTDOT) {
		if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
goto done;
}
(*cntp)++;
}
/* Done if we reached the end. */
if (uio->uio_offset == TMPFS_DIRSEQ_EOF) {
goto done;
}
/* Locate the directory entry given by the given sequence number. */
de = tmpfs_dir_lookupbyseq(node, uio->uio_offset);
if (de == NULL) {
error = EINVAL;
goto done;
}
/*
* Read as many entries as possible; i.e., until we reach the end
* of the directory or we exhaust UIO space.
*/
do {
if (de->td_node == TMPFS_NODE_WHITEOUT) {
dent.d_fileno = 1;
dent.d_type = DT_WHT;
} else {
dent.d_fileno = de->td_node->tn_id;
dent.d_type = vtype2dt(de->td_node->tn_type);
}
dent.d_namlen = de->td_namelen;
KASSERT(de->td_namelen < sizeof(dent.d_name));
memcpy(dent.d_name, de->td_name, de->td_namelen);
dent.d_name[de->td_namelen] = '\0';
dent.d_reclen = _DIRENT_SIZE(&dent);
if (dent.d_reclen > uio->uio_resid) {
/* Exhausted UIO space. */
error = EJUSTRETURN;
break;
}
/* Copy out the directory entry and continue. */
error = uiomove(&dent, dent.d_reclen, uio);
if (error) {
break;
}
(*cntp)++;
de = TAILQ_NEXT(de, td_entries);
} while (uio->uio_resid > 0 && de);
/* Cache the last entry or clear and mark EOF. */
uio->uio_offset = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
node->tn_spec.tn_dir.tn_readdir_lastp = de;
done:
tmpfs_update(node->tn_vnode, TMPFS_UPDATE_ATIME);
if (error == EJUSTRETURN) {
/* Exhausted UIO space - just return. */
error = 0;
}
KASSERT(error >= 0);
return error;
}
/*
* tmpfs_reg_resize: resize the underlying UVM object associated with the
* specified regular file.
*/
int
tmpfs_reg_resize(struct vnode *vp, off_t newsize)
{
tmpfs_mount_t *tmp = VFS_TO_TMPFS(vp->v_mount);
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
struct uvm_object *uobj = node->tn_spec.tn_reg.tn_aobj;
size_t newpages, oldpages;
off_t oldsize;
KASSERT(vp->v_type == VREG);
KASSERT(newsize >= 0);
if (newsize > __type_max(off_t) - PAGE_SIZE + 1)
return EFBIG;
oldsize = node->tn_size;
oldpages = round_page(oldsize) >> PAGE_SHIFT;
newpages = round_page(newsize) >> PAGE_SHIFT;
KASSERT(oldpages == node->tn_spec.tn_reg.tn_aobj_pages);
if (newsize == oldsize) {
return 0;
}
if (newpages > oldpages) {
/* Increase the used-memory counter if getting extra pages. */
if (!tmpfs_mem_incr(tmp, (newpages - oldpages) << PAGE_SHIFT)) {
return ENOSPC;
}
} else if (newsize < oldsize) {
size_t zerolen;
zerolen = MIN(round_page(newsize), node->tn_size) - newsize;
ubc_zerorange(uobj, newsize, zerolen, UBC_VNODE_FLAGS(vp));
}
node->tn_spec.tn_reg.tn_aobj_pages = newpages;
node->tn_size = newsize;
uvm_vnp_setsize(vp, newsize);
/*
* Free "backing store".
*/
if (newpages < oldpages) {
rw_enter(uobj->vmobjlock, RW_WRITER);
uao_dropswap_range(uobj, newpages, oldpages);
rw_exit(uobj->vmobjlock);
/* Decrease the used-memory counter. */
tmpfs_mem_decr(tmp, (oldpages - newpages) << PAGE_SHIFT);
}
return 0;
}
/*
* tmpfs_chflags: change flags of the given vnode.
*/
int
tmpfs_chflags(vnode_t *vp, int flags, kauth_cred_t cred, lwp_t *l)
{
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
kauth_action_t action = KAUTH_VNODE_WRITE_FLAGS;
int error;
bool changing_sysflags = false;
KASSERT(VOP_ISLOCKED(vp));
/* Disallow this operation if the file system is mounted read-only. */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
/*
* If the new flags have non-user flags that are different from
* those on the node, we need special permission to change them.
*/
if ((flags & SF_SETTABLE) != (node->tn_flags & SF_SETTABLE)) {
action |= KAUTH_VNODE_WRITE_SYSFLAGS;
changing_sysflags = true;
}
/*
* Indicate that this node's flags have system attributes in them if
* that's the case.
*/
if (node->tn_flags & (SF_IMMUTABLE | SF_APPEND)) {
action |= KAUTH_VNODE_HAS_SYSFLAGS;
}
error = kauth_authorize_vnode(cred, action, vp, NULL,
genfs_can_chflags(vp, cred, node->tn_uid, changing_sysflags));
if (error)
return error;
/*
* Set the flags. If we're not setting non-user flags, be careful not
* to overwrite them.
*
* XXX: Can't we always assign here? If the system flags are different,
* the code above should catch attempts to change them without
* proper permissions, and if we're here it means it's okay to
* change them...
*/
if (!changing_sysflags) {
/* Clear all user-settable flags and re-set them. */
node->tn_flags &= SF_SETTABLE;
node->tn_flags |= (flags & UF_SETTABLE);
} else {
node->tn_flags = flags;
}
tmpfs_update(vp, TMPFS_UPDATE_CTIME);
return 0;
}
/*
* tmpfs_chmod: change access mode on the given vnode.
*/
int
tmpfs_chmod(vnode_t *vp, mode_t mode, kauth_cred_t cred, lwp_t *l)
{
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
int error;
KASSERT(VOP_ISLOCKED(vp));
/* Disallow this operation if the file system is mounted read-only. */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
/* Immutable or append-only files cannot be modified, either. */
if (node->tn_flags & (IMMUTABLE | APPEND))
return EPERM;
error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY, vp,
NULL, genfs_can_chmod(vp, cred, node->tn_uid, node->tn_gid, mode));
if (error) {
return error;
}
node->tn_mode = (mode & ALLPERMS);
tmpfs_update(vp, TMPFS_UPDATE_CTIME);
cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid, true);
return 0;
}
/*
* tmpfs_chown: change ownership of the given vnode.
*
* => At least one of uid or gid must be different from VNOVAL.
* => The attribute is left unchanged in the VNOVAL case.
*/
int
tmpfs_chown(vnode_t *vp, uid_t uid, gid_t gid, kauth_cred_t cred, lwp_t *l)
{
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
int error;
KASSERT(VOP_ISLOCKED(vp));
/* Assign default values if they are unknown. */
KASSERT(uid != VNOVAL || gid != VNOVAL);
if (uid == VNOVAL) {
uid = node->tn_uid;
}
if (gid == VNOVAL) {
gid = node->tn_gid;
}
/* Disallow this operation if the file system is mounted read-only. */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
/* Immutable or append-only files cannot be modified, either. */
if (node->tn_flags & (IMMUTABLE | APPEND))
return EPERM;
error = kauth_authorize_vnode(cred, KAUTH_VNODE_CHANGE_OWNERSHIP, vp,
NULL, genfs_can_chown(vp, cred, node->tn_uid, node->tn_gid, uid,
gid));
if (error) {
return error;
}
node->tn_uid = uid;
node->tn_gid = gid;
tmpfs_update(vp, TMPFS_UPDATE_CTIME);
cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid, true);
return 0;
}
/*
* tmpfs_chsize: change size of the given vnode.
*/
int
tmpfs_chsize(vnode_t *vp, u_quad_t size, kauth_cred_t cred, lwp_t *l)
{
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
const off_t length = size;
int error;
KASSERT(VOP_ISLOCKED(vp));
/* Decide whether this is a valid operation based on the file type. */
switch (vp->v_type) {
case VDIR:
return EISDIR;
case VREG:
if (vp->v_mount->mnt_flag & MNT_RDONLY) {
return EROFS;
}
break;
case VBLK:
case VCHR:
case VFIFO:
/*
* Allow modifications of special files even if the file
* system is mounted read-only (we are not modifying the
* files themselves, but the objects they represent).
*/
return 0;
default:
return EOPNOTSUPP;
}
/* Immutable or append-only files cannot be modified, either. */
if (node->tn_flags & (IMMUTABLE | APPEND)) {
return EPERM;
}
if (length < 0) {
return EINVAL;
}
/* Note: tmpfs_reg_resize() will raise NOTE_EXTEND and NOTE_ATTRIB. */
if (node->tn_size != length &&
(error = tmpfs_reg_resize(vp, length)) != 0) {
return error;
}
tmpfs_update(vp, TMPFS_UPDATE_CTIME | TMPFS_UPDATE_MTIME);
return 0;
}
/*
* tmpfs_chtimes: change access and modification times for vnode.
*/
int
tmpfs_chtimes(vnode_t *vp, const struct timespec *atime,
const struct timespec *mtime, const struct timespec *btime,
int vaflags, kauth_cred_t cred, lwp_t *l)
{
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
int error;
KASSERT(VOP_ISLOCKED(vp));
/* Disallow this operation if the file system is mounted read-only. */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
/* Immutable or append-only files cannot be modified, either. */
if (node->tn_flags & (IMMUTABLE | APPEND))
return EPERM;
error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp, NULL,
genfs_can_chtimes(vp, cred, node->tn_uid, vaflags));
if (error)
return error;
mutex_enter(&node->tn_timelock);
if (atime->tv_sec != VNOVAL) {
atomic_and_uint(&node->tn_tflags, ~TMPFS_UPDATE_ATIME);
node->tn_atime = *atime;
}
if (mtime->tv_sec != VNOVAL) {
atomic_and_uint(&node->tn_tflags, ~TMPFS_UPDATE_MTIME);
node->tn_mtime = *mtime;
}
if (btime->tv_sec != VNOVAL) {
node->tn_birthtime = *btime;
}
mutex_exit(&node->tn_timelock);
return 0;
}
/*
* tmpfs_update_locked: update the timestamps as indicated by the flags.
*/
void
tmpfs_update_locked(vnode_t *vp, unsigned tflags)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
struct timespec nowtm;
	KASSERT(mutex_owned(&node->tn_timelock));
	if ((tflags |= atomic_swap_uint(&node->tn_tflags, 0)) == 0) {
return;
}
vfs_timestamp(&nowtm);
	if (tflags & TMPFS_UPDATE_ATIME) {
		node->tn_atime = nowtm;
	}
	if (tflags & TMPFS_UPDATE_MTIME) {
		node->tn_mtime = nowtm;
	}
	if (tflags & TMPFS_UPDATE_CTIME) {
		node->tn_ctime = nowtm;
	}
}
/*
* tmpfs_update: update the timestamps as indicated by the flags.
*/
void
tmpfs_update(vnode_t *vp, unsigned tflags)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	if ((tflags | atomic_load_relaxed(&node->tn_tflags)) == 0) {
return;
}
mutex_enter(&node->tn_timelock);
tmpfs_update_locked(vp, tflags);
mutex_exit(&node->tn_timelock);
}
/*
* tmpfs_update_lazily: schedule a deferred timestamp update.
*/
void
tmpfs_update_lazily(vnode_t *vp, unsigned tflags)
{
tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
unsigned cur;
cur = atomic_load_relaxed(&node->tn_tflags);
if ((cur & tflags) != tflags) {
atomic_or_uint(&node->tn_tflags, tflags);
return;
}
}
/* $NetBSD: kern_ras.c,v 1.42 2022/08/08 22:31:45 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Gregory McGarry, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_ras.c,v 1.42 2022/08/08 22:31:45 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/proc.h>
#include <sys/ras.h>
#include <sys/xcall.h>
#include <sys/syscallargs.h>
#include <uvm/uvm_extern.h>
#define MAX_RAS_PER_PROC 16
u_int ras_per_proc = MAX_RAS_PER_PROC;
#ifdef DEBUG
int ras_debug = 0;
#define DPRINTF(x) if (ras_debug) printf x
#else
#define DPRINTF(x) /* nothing */
#endif
/*
* Force all CPUs through cpu_switchto(), waiting until complete.
* Context switching will drain the write buffer on the calling
* CPU.
*/
static void
ras_sync(void)
{
/* No need to sync if exiting or single threaded. */
	if (curproc->p_nlwps > 1 && ncpu > 1) {
		xc_barrier(0);
}
}
/*
* Check the specified address to see if it is within the
* sequence. If it is found, we return the restart address,
* otherwise we return -1. If we do perform a restart, we
* mark the sequence as hit.
*
* No locking required: we disable preemption and ras_sync()
* guarantees that individual entries are valid while we still
* have visibility of them.
*/
void *
ras_lookup(struct proc *p, void *addr)
{
struct ras *rp;
void *startaddr;
lwp_t *l;
startaddr = (void *)-1;
l = curlwp;
KPREEMPT_DISABLE(l);
	for (rp = p->p_raslist; rp != NULL; rp = rp->ras_next) {
		if (addr > rp->ras_startaddr && addr < rp->ras_endaddr) {
startaddr = rp->ras_startaddr;
DPRINTF(("RAS hit: p=%p %p\n", p, addr));
break;
}
}
KPREEMPT_ENABLE(l);
return startaddr;
}
/*
* During a fork, we copy all of the sequences from parent p1 to
* the child p2.
*
* No locking required as the parent must be paused.
*/
int
ras_fork(struct proc *p1, struct proc *p2)
{
struct ras *rp, *nrp;
for (rp = p1->p_raslist; rp != NULL; rp = rp->ras_next) {
nrp = kmem_alloc(sizeof(*nrp), KM_SLEEP);
nrp->ras_startaddr = rp->ras_startaddr;
nrp->ras_endaddr = rp->ras_endaddr;
nrp->ras_next = p2->p_raslist;
p2->p_raslist = nrp;
}
DPRINTF(("ras_fork: p1=%p, p2=%p\n", p1, p2));
return 0;
}
/*
* Nuke all sequences for this process.
*/
int
ras_purgeall(void)
{
struct ras *rp, *nrp;
proc_t *p;
p = curproc;
if (p->p_raslist == NULL)
return 0;
mutex_enter(&p->p_auxlock);
if ((rp = p->p_raslist) != NULL) {
p->p_raslist = NULL;
ras_sync();
for(; rp != NULL; rp = nrp) {
nrp = rp->ras_next;
kmem_free(rp, sizeof(*rp));
}
}
mutex_exit(&p->p_auxlock);
return 0;
}
#if defined(__HAVE_RAS)
/*
* Install the new sequence. If it already exists, return
* an error.
*/
static int
ras_install(void *addr, size_t len)
{
struct ras *rp;
struct ras *newrp;
void *endaddr;
int nras, error;
proc_t *p;
if (len == 0)
return EINVAL;
if ((uintptr_t)addr < VM_MIN_ADDRESS ||
(uintptr_t)addr > VM_MAXUSER_ADDRESS)
return EINVAL;
if (len > VM_MAXUSER_ADDRESS - (uintptr_t)addr)
return EINVAL;
endaddr = (char *)addr + len;
newrp = kmem_alloc(sizeof(*newrp), KM_SLEEP);
newrp->ras_startaddr = addr;
newrp->ras_endaddr = endaddr;
error = 0;
nras = 0;
p = curproc;
mutex_enter(&p->p_auxlock);
	for (rp = p->p_raslist; rp != NULL; rp = rp->ras_next) {
		if (++nras >= ras_per_proc) {
error = EINVAL;
break;
}
if (addr < rp->ras_endaddr && endaddr > rp->ras_startaddr) {
error = EEXIST;
break;
}
}
if (rp == NULL) {
newrp->ras_next = p->p_raslist;
p->p_raslist = newrp;
ras_sync();
mutex_exit(&p->p_auxlock);
} else {
mutex_exit(&p->p_auxlock);
kmem_free(newrp, sizeof(*newrp));
}
return error;
}
/*
* Nuke the specified sequence. Both address and len must
* match, otherwise we return an error.
*/
static int
ras_purge(void *addr, size_t len)
{
struct ras *rp, **link;
proc_t *p;
p = curproc;
mutex_enter(&p->p_auxlock);
link = &p->p_raslist;
	for (rp = *link; rp != NULL; link = &rp->ras_next, rp = *link) {
		if (addr == rp->ras_startaddr &&
(char *)rp->ras_endaddr - (char *)rp->ras_startaddr == len)
break;
}
if (rp != NULL) {
*link = rp->ras_next;
ras_sync();
mutex_exit(&p->p_auxlock);
kmem_free(rp, sizeof(*rp));
return 0;
} else {
mutex_exit(&p->p_auxlock);
return ESRCH;
}
}
#endif /* defined(__HAVE_RAS) */
/*ARGSUSED*/
int
sys_rasctl(struct lwp *l, const struct sys_rasctl_args *uap, register_t *retval)
{
#if defined(__HAVE_RAS)
/* {
syscallarg(void *) addr;
syscallarg(size_t) len;
syscallarg(int) op;
} */
void *addr;
size_t len;
int op;
int error;
/*
* first, extract syscall args from the uap.
*/
addr = (void *)SCARG(uap, addr);
len = (size_t)SCARG(uap, len);
op = SCARG(uap, op);
DPRINTF(("sys_rasctl: p=%p addr=%p, len=%ld, op=0x%x\n",
curproc, addr, (long)len, op));
switch (op) {
case RAS_INSTALL:
error = ras_install(addr, len);
break;
case RAS_PURGE:
error = ras_purge(addr, len);
break;
case RAS_PURGE_ALL:
error = ras_purgeall();
break;
default:
error = EINVAL;
break;
}
return (error);
#else
return (EOPNOTSUPP);
#endif
}
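/*
 * Illustrative userland sketch (not part of the original source): how the
 * rasctl(2) operations dispatched above might be driven.  "ras_start" and
 * "ras_len" are hypothetical placeholders for a machine-dependent code
 * range (typically obtained with the RAS_* macros described in rasctl(2)).
 */
#if 0
#include <sys/types.h>
#include <sys/ras.h>
#include <err.h>
static void
example_ras(void *ras_start, size_t ras_len)
{
	/* Register the restartable atomic sequence with the kernel. */
	if (rasctl(ras_start, ras_len, RAS_INSTALL) == -1)
		err(1, "rasctl(RAS_INSTALL)");
	/* ... run code that enters the registered sequence ... */
	/* Remove this sequence; RAS_PURGE_ALL would remove every one. */
	if (rasctl(ras_start, ras_len, RAS_PURGE) == -1)
		err(1, "rasctl(RAS_PURGE)");
}
#endif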
/* $NetBSD: sysv_sem_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $ */
/*-
* Copyright (c) 1999 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sysv_sem_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/proc.h>
#include <sys/sem.h>
#ifndef SYSVSEM
#define SYSVSEM
#endif
#include <sys/syscallargs.h>
#include <compat/sys/sem.h>
int
compat_50_sys_____semctl13(struct lwp *l, const struct compat_50_sys_____semctl13_args *uap, register_t *retval)
{
/* {
syscallarg(int) semid;
syscallarg(int) semnum;
syscallarg(int) cmd;
syscallarg(union __semun *) arg;
} */
union __semun arg;
struct semid_ds sembuf;
struct semid_ds13 osembuf;
int cmd, error;
void *pass_arg;
cmd = SCARG(uap, cmd);
pass_arg = get_semctl_arg(cmd, &sembuf, &arg);
if (pass_arg != NULL) {
error = copyin(SCARG(uap, arg), &arg, sizeof(arg));
if (error)
return (error);
if (cmd == IPC_SET) {
error = copyin(arg.buf, &osembuf, sizeof(osembuf));
if (error)
return (error);
__semid_ds13_to_native(&osembuf, &sembuf);
}
}
error = semctl1(l, SCARG(uap, semid), SCARG(uap, semnum), cmd,
pass_arg, retval);
	if (error == 0 && cmd == IPC_STAT) {
		__native_to_semid_ds13(&sembuf, &osembuf);
error = copyout(&osembuf, arg.buf, sizeof(osembuf));
}
return (error);
}
/* $NetBSD: overlay_vfsops.c,v 1.73 2022/11/04 11:20:39 hannken Exp $ */
/*
* Copyright (c) 1999, 2000 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp
* from: @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92
* @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95
*/
/*
* Overlay Layer
* (See overlay_vnops.c for a description of what this does.)
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: overlay_vfsops.c,v 1.73 2022/11/04 11:20:39 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/module.h>
#include <miscfs/overlay/overlay.h>
#include <miscfs/genfs/layer_extern.h>
MODULE(MODULE_CLASS_VFS, overlay, "layerfs");
VFS_PROTOS(ov);
#define NOVERLAYNODECACHE 16
/*
* Mount overlay layer
*/
int
ov_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
int error = 0;
struct overlay_args *args = data;
struct vnode *lowerrootvp, *vp;
struct overlay_mount *nmp;
struct layer_mount *lmp;
#ifdef OVERLAYFS_DIAGNOSTIC
printf("ov_mount(mp = %p)\n", mp);
#endif
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args)
return EINVAL;
if (mp->mnt_flag & MNT_GETARGS) {
lmp = MOUNTTOLAYERMOUNT(mp);
if (lmp == NULL)
return EIO;
args->la.target = NULL;
*data_len = sizeof *args;
return 0;
}
/*
* Update is not supported
*/
if (mp->mnt_flag & MNT_UPDATE)
return EOPNOTSUPP;
/*
* Find lower node
*/
lowerrootvp = mp->mnt_vnodecovered;
vref(lowerrootvp);
if ((error = vn_lock(lowerrootvp, LK_EXCLUSIVE))) {
vrele(lowerrootvp);
return (error);
}
/*
* First cut at fixing up upper mount point
*/
nmp = kmem_zalloc(sizeof(struct overlay_mount), KM_SLEEP);
mp->mnt_data = nmp;
/*
* Make sure that the mount point is sufficiently initialized
* that the node create call will work.
*/
vfs_getnewfsid(mp);
error = vfs_set_lowermount(mp, lowerrootvp->v_mount);
if (error) {
vput(lowerrootvp);
kmem_free(nmp, sizeof(struct overlay_mount));
return error;
}
nmp->ovm_size = sizeof (struct overlay_node);
nmp->ovm_tag = VT_OVERLAY;
nmp->ovm_bypass = layer_bypass;
nmp->ovm_vnodeop_p = overlay_vnodeop_p;
/*
* Fix up overlay node for root vnode
*/
VOP_UNLOCK(lowerrootvp);
error = layer_node_create(mp, lowerrootvp, &vp);
/*
* Make sure the fixup worked
*/
if (error) {
vrele(lowerrootvp);
kmem_free(nmp, sizeof(struct overlay_mount));
return error;
}
/*
* Keep a held reference to the root vnode.
* It is vrele'd in ov_unmount.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_ROOT;
nmp->ovm_rootvp = vp;
VOP_UNLOCK(vp);
error = set_statvfs_info(path, UIO_USERSPACE, args->la.target,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
if (error)
return error;
	if (mp->mnt_lower->mnt_flag & MNT_LOCAL)
		mp->mnt_flag |= MNT_LOCAL;
#ifdef OVERLAYFS_DIAGNOSTIC
printf("ov_mount: lower %s, alias at %s\n",
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
#endif
return 0;
}
/*
* Free reference to overlay layer
*/
int
ov_unmount(struct mount *mp, int mntflags)
{
struct vnode *overlay_rootvp = MOUNTTOOVERLAYMOUNT(mp)->ovm_rootvp;
struct overlay_mount *omp;
int error;
int flags = 0;
#ifdef OVERLAYFS_DIAGNOSTIC
printf("ov_unmount(mp = %p)\n", mp);
#endif
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if (vrefcnt(overlay_rootvp) > 1 && (mntflags & MNT_FORCE) == 0)
return (EBUSY);
if ((error = vflush(mp, overlay_rootvp, flags)) != 0)
return (error);
#ifdef OVERLAYFS_DIAGNOSTIC
vprint("alias root of lower", overlay_rootvp);
#endif
/*
* Blow it away for future re-use
*/
vgone(overlay_rootvp);
/*
* Finally, throw away the overlay_mount structure
*/
omp = mp->mnt_data;
kmem_free(omp, sizeof(struct overlay_mount));
mp->mnt_data = NULL;
return 0;
}
extern const struct vnodeopv_desc overlay_vnodeop_opv_desc;
const struct vnodeopv_desc * const ov_vnodeopv_descs[] = {
&overlay_vnodeop_opv_desc,
NULL,
};
struct vfsops overlay_vfsops = {
.vfs_name = MOUNT_OVERLAY,
.vfs_min_mount_data = sizeof (struct overlay_args),
.vfs_mount = ov_mount,
.vfs_start = layerfs_start,
.vfs_unmount = ov_unmount,
.vfs_root = layerfs_root,
.vfs_quotactl = layerfs_quotactl,
.vfs_statvfs = layerfs_statvfs,
.vfs_sync = layerfs_sync,
.vfs_loadvnode = layerfs_loadvnode,
.vfs_vget = layerfs_vget,
.vfs_fhtovp = layerfs_fhtovp,
.vfs_vptofh = layerfs_vptofh,
.vfs_init = layerfs_init,
.vfs_done = layerfs_done,
.vfs_snapshot = layerfs_snapshot,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = layerfs_suspendctl,
.vfs_renamelock_enter = layerfs_renamelock_enter,
.vfs_renamelock_exit = layerfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = ov_vnodeopv_descs
};
SYSCTL_SETUP(overlay_sysctl_setup, "overlay fs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT, CTLTYPE_NODE, "overlay",
SYSCTL_DESCR("Overlay file system"),
NULL, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
}
static int
overlay_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&overlay_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&overlay_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/* $NetBSD: vfs_getcwd.c,v 1.61 2021/06/29 22:39:21 dholland Exp $ */
/*-
* Copyright (c) 1999, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Bill Sommerfeld.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_getcwd.c,v 1.61 2021/06/29 22:39:21 dholland Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/dirent.h>
#include <sys/kauth.h>
#include <ufs/ufs/dir.h> /* XXX only for DIRBLKSIZ */
#include <sys/syscallargs.h>
/*
* Vnode variable naming conventions in this file:
*
* rvp: the current root we're aiming towards.
* lvp, *lvpp: the "lower" vnode
* uvp, *uvpp: the "upper" vnode.
*
* Since all the vnodes we're dealing with are directories, and the
* lookups are going *up* in the filesystem rather than *down*, the
* usual "pvp" (parent) or "dvp" (directory) naming conventions are
* too confusing.
*/
/*
* XXX Will infinite loop in certain cases if a directory read reliably
* returns EINVAL on last block.
* XXX is EINVAL the right thing to return if a directory is malformed?
*/
/*
* XXX Untested vs. mount -o union; probably does the wrong thing.
*/
/*
* Find the parent vnode of lvp and return it in *uvpp.
*
* If we care about the name, scan it looking for name of directory
* entry pointing at lvp.
*
* Place the name in the buffer which starts at bufp, immediately
* before *bpp, and move bpp backwards to point at the start of it.
*
* On entry, lvp is a locked, referenced vnode; on exit it has been
* unlocked (the caller's reference remains).
* On exit, *uvpp is either NULL or an unlocked, referenced vnode.
*/
static int
getcwd_scandir(struct vnode *lvp, struct vnode **uvpp, char **bpp,
char *bufp, struct lwp *l)
{
int error = 0;
int eofflag;
off_t off;
int tries;
struct uio uio;
struct iovec iov;
char *dirbuf = NULL;
int dirbuflen;
ino_t fileno;
struct vattr va;
struct vnode *uvp = NULL;
kauth_cred_t cred = l->l_cred;
struct componentname cn;
int len, reclen;
tries = 0;
/* Need exclusive for UFS VOP_GETATTR (itimes) & VOP_LOOKUP. */
KASSERT(VOP_ISLOCKED(lvp) == LK_EXCLUSIVE);
/*
* If we want the filename, get some info we need while the
* current directory is still locked.
*/
if (bufp != NULL) {
error = VOP_GETATTR(lvp, &va, cred);
		if (error) {
			VOP_UNLOCK(lvp);
*uvpp = NULL;
return error;
}
}
/*
* Ok, we have to do it the hard way..
* Next, get parent vnode using lookup of ..
*/
cn.cn_nameiop = LOOKUP;
cn.cn_flags = ISLASTCN | ISDOTDOT | RDONLY;
cn.cn_cred = cred;
cn.cn_nameptr = "..";
cn.cn_namelen = 2;
/* At this point, lvp is locked */
error = VOP_LOOKUP(lvp, uvpp, &cn);
VOP_UNLOCK(lvp);
if (error) {
*uvpp = NULL;
return error;
}
uvp = *uvpp;
/* If we don't care about the pathname, we're done */
if (bufp == NULL) {
return 0;
}
fileno = va.va_fileid;
/* I guess UFS_DIRBLKSIZ is a good guess at a good size to use? */
dirbuflen = UFS_DIRBLKSIZ;
if (dirbuflen < va.va_blocksize)
dirbuflen = va.va_blocksize;
dirbuf = kmem_alloc(dirbuflen, KM_SLEEP);
/* Now lvp is unlocked, try to lock uvp */
error = vn_lock(uvp, LK_SHARED);
if (error) {
vrele(uvp);
*uvpp = NULL;
return error;
}
#if 0
unionread:
#endif
off = 0;
do {
/* call VOP_READDIR of parent */
iov.iov_base = dirbuf;
iov.iov_len = dirbuflen;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = off;
uio.uio_resid = dirbuflen;
uio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&uio);
eofflag = 0;
error = VOP_READDIR(uvp, &uio, cred, &eofflag, 0, 0);
off = uio.uio_offset;
/*
* Try again if NFS tosses its cookies.
* XXX this can still loop forever if the directory is busted
* such that the second or subsequent page of it always
* returns EINVAL
*/
if ((error == EINVAL) && (tries < 3)) {
off = 0;
tries++;
continue; /* once more, with feeling */
}
if (!error) {
char *cpos;
struct dirent *dp;
cpos = dirbuf;
tries = 0;
/* scan directory page looking for matching vnode */
for (len = (dirbuflen - uio.uio_resid); len > 0;
len -= reclen) {
dp = (struct dirent *) cpos;
reclen = dp->d_reclen;
/* check for malformed directory.. */
if (reclen < _DIRENT_MINSIZE(dp) ||
reclen > len) {
error = EINVAL;
goto out;
}
/*
* XXX should perhaps do VOP_LOOKUP to
* check that we got back to the right place,
* but getting the locking games for that
* right would be heinous.
*/
if ((dp->d_type != DT_WHT) &&
(dp->d_fileno == fileno)) {
char *bp = *bpp;
bp -= dp->d_namlen;
if (bp <= bufp) {
error = ERANGE;
goto out;
}
memcpy(bp, dp->d_name, dp->d_namlen);
error = 0;
*bpp = bp;
goto out;
}
cpos += reclen;
}
} else
goto out;
} while (!eofflag);
#if 0
/*
* Deal with mount -o union, which unions only the
* root directory of the mount.
*/
if ((uvp->v_vflag & VV_ROOT) &&
(uvp->v_mount->mnt_flag & MNT_UNION)) {
struct vnode *tvp = uvp;
uvp = uvp->v_mount->mnt_vnodecovered;
vput(tvp);
vref(uvp);
*uvpp = uvp;
vn_lock(uvp, LK_SHARED | LK_RETRY);
goto unionread;
}
#endif
error = ENOENT;
out:
VOP_UNLOCK(uvp);
kmem_free(dirbuf, dirbuflen);
return error;
}
/*
* common routine shared by sys___getcwd() and vn_isunder()
*/
int
getcwd_common(struct vnode *lvp, struct vnode *rvp, char **bpp, char *bufp,
int limit, int flags, struct lwp *l)
{
struct cwdinfo *cwdi = l->l_proc->p_cwdi;
kauth_cred_t cred = l->l_cred;
struct vnode *uvp = NULL;
char *bp = NULL;
int error;
accmode_t accmode = VEXEC;
error = 0;
	if (rvp == NULL) {
		rvp = cwdi->cwdi_rdir;
if (rvp == NULL)
rvp = rootvnode;
}
vref(rvp);
vref(lvp);
/*
* Error handling invariant:
* Before a `goto out':
* lvp is either NULL, or held.
* uvp is either NULL, or held.
*/
if (bufp)
bp = *bpp;
/*
* this loop will terminate when one of the following happens:
* - we hit the root
* - getdirentries or lookup fails
* - we run out of space in the buffer.
*/
	if (lvp == rvp) {
		if (bp)
			*(--bp) = '/';
goto out;
}
do {
/*
* access check here is optional, depending on
* whether or not caller cares.
*/
int chkaccess = (flags & GETCWD_CHECK_ACCESS);
bool locked = false;
/*
* step up if we're a covered vnode..
* check access on the first vnode only.
*/
if (lvp->v_vflag & VV_ROOT) {
vn_lock(lvp, LK_SHARED | LK_RETRY);
if (chkaccess) {
error = VOP_ACCESS(lvp, accmode, cred);
if (error) {
VOP_UNLOCK(lvp);
goto out;
}
chkaccess = 0;
}
while (lvp->v_vflag & VV_ROOT) {
struct vnode *tvp;
if (lvp == rvp) {
VOP_UNLOCK(lvp);
goto out;
}
tvp = lvp->v_mount->mnt_vnodecovered;
/*
* hodie natus est radici frater ("today a brother is born to the root")
*/
if (tvp == NULL) {
VOP_UNLOCK(lvp);
error = ENOENT;
goto out;
}
vref(tvp);
vput(lvp);
lvp = tvp;
			if (lvp->v_vflag & VV_ROOT)
				vn_lock(lvp, LK_SHARED | LK_RETRY);
}
}
/* Do we need to check access to the directory? */
if (chkaccess && !cache_have_id(lvp)) {
/* Need exclusive for UFS VOP_GETATTR (itimes) & VOP_LOOKUP. */
vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_ACCESS(lvp, accmode, cred);
if (error) {
VOP_UNLOCK(lvp);
goto out;
}
chkaccess = 0;
locked = true;
}
/*
* Look in the name cache; if that fails, look in the
* directory..
*/
error = cache_revlookup(lvp, &uvp, &bp, bufp, chkaccess,
accmode);
if (error == -1) {
if (!locked) {
locked = true;
vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
}
if (lvp->v_type != VDIR) {
VOP_UNLOCK(lvp);
error = ENOTDIR;
goto out;
}
error = getcwd_scandir(lvp, &uvp, &bp, bufp, l);
/* lvp now unlocked */
} else if (locked) {
VOP_UNLOCK(lvp);
}
if (error)
goto out;
#if DIAGNOSTIC
if (bufp && (bp <= bufp)) {
panic("getcwd: oops, went back too far");
}
#endif
accmode = VEXEC | VREAD;
		if (bp)
			*(--bp) = '/';
vrele(lvp);
lvp = uvp;
uvp = NULL;
limit--;
} while ((lvp != rvp) && (limit > 0));
out:
if (bpp)
*bpp = bp;
	if (uvp)
		vrele(uvp);
	if (lvp)
		vrele(lvp);
vrele(rvp);
return error;
}
/*
* Check if one directory can be found inside another in the directory
* hierarchy.
*
* Intended to be used in chroot, chdir, fchdir, etc., to ensure that
* chroot() actually means something.
*/
int
vn_isunder(struct vnode *lvp, struct vnode *rvp, struct lwp *l)
{
int error;
error = getcwd_common(lvp, rvp, NULL, NULL, MAXPATHLEN / 2, 0, l);
if (!error)
return 1;
else
return 0;
}
/*
* Returns true if proc p1's root directory is equal to or under p2's
* root directory.
*
* Intended to be used from ptrace/procfs sorts of things.
*/
int
proc_isunder(struct proc *p1, struct lwp *l2)
{
struct vnode *r1 = p1->p_cwdi->cwdi_rdir;
struct vnode *r2 = l2->l_proc->p_cwdi->cwdi_rdir;
if (r1 == NULL)
return (r2 == NULL);
else if (r2 == NULL)
return 1;
else
return vn_isunder(r1, r2, l2);
}
/*
* Find pathname of process's current directory.
*
* Use vfs vnode-to-name reverse cache; if that fails, fall back
* to reading directory contents.
*/
int
sys___getcwd(struct lwp *l, const struct sys___getcwd_args *uap, register_t *retval)
{
/* {
syscallarg(char *) bufp;
syscallarg(size_t) length;
} */
int error;
char *path;
char *bp, *bend;
int len = SCARG(uap, length);
int lenused;
struct cwdinfo *cwdi;
if (len > MAXPATHLEN * 4)
len = MAXPATHLEN * 4;
else if (len < 2)
return ERANGE;
path = kmem_alloc(len, KM_SLEEP);
bp = &path[len];
bend = bp;
*(--bp) = '\0';
/*
* 5th argument here is "max number of vnodes to traverse".
* Since each entry takes up at least 2 bytes in the output buffer,
* limit it to N/2 vnodes for an N byte buffer.
*/
cwdi = l->l_proc->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
error = getcwd_common(cwdi->cwdi_cdir, NULL, &bp, path,
len/2, GETCWD_CHECK_ACCESS, l);
rw_exit(&cwdi->cwdi_lock);
if (error)
goto out;
lenused = bend - bp;
*retval = lenused;
/* put the result into user buffer */
error = copyout(bp, SCARG(uap, bufp), lenused);
out:
kmem_free(path, len);
return error;
}
/*
* Try to find a pathname for a vnode. Since there is no mapping vnode ->
* parent directory, this needs the namecache to succeed. Caller holds a
* reference to the vnode.
*/
int
vnode_to_path(char *path, size_t len, struct vnode *vp, struct lwp *curl,
struct proc *p)
{
struct proc *curp = curl->l_proc;
int error, lenused, elen;
char *bp, *bend;
struct vnode *dvp;
KASSERT(vrefcnt(vp) > 0);
bp = bend = &path[len];
*(--bp) = '\0';
error = cache_revlookup(vp, &dvp, &bp, path, false, 0);
if (error != 0)
return (error == -1 ? ENOENT : error);
*(--bp) = '/';
error = getcwd_common(dvp, NULL, &bp, path, len / 2,
GETCWD_CHECK_ACCESS, curl);
vrele(dvp);
if (error != 0)
return error;
/*
* Strip off emulation path for emulated processes looking at
* the maps file of a process of the same emulation. (Won't
* work if /emul/xxx is a symlink..)
*/
if (curp->p_emul == p->p_emul && curp->p_emul->e_path != NULL) {
elen = strlen(curp->p_emul->e_path);
if (!strncmp(bp, curp->p_emul->e_path, elen))
bp = &bp[elen];
}
lenused = bend - bp;
memcpy(path, bp, lenused);
path[lenused] = '\0';
return 0;
}
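/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * caller resolving a vnode back to a pathname for diagnostic output.
 * The stack buffer and printf() here are assumptions made only for the
 * example; a real caller would likely allocate the buffer.
 */
#if 0
static void
report_vnode_path(struct vnode *vp, struct lwp *l)
{
	char buf[MAXPATHLEN];
	int error;

	/* vp must already be referenced, as vnode_to_path() requires. */
	error = vnode_to_path(buf, sizeof(buf), vp, l, l->l_proc);
	if (error == 0)
		printf("vnode %p resolves to %s\n", vp, buf);
	else
		printf("vnode %p: name not recoverable (error %d)\n",
		    vp, error);
}
#endif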
/* $NetBSD: kern_sleepq.c,v 1.87 2023/11/02 10:31:55 martin Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Sleep queue implementation, used by turnstiles and general sleep/wakeup
* interfaces.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_sleepq.c,v 1.87 2023/11/02 10:31:55 martin Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/ktrace.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
/*
* for sleepq_abort:
* During autoconfiguration or after a panic, a sleep will simply lower the
* priority briefly to allow interrupts, then return. The priority to be
* used (IPL_SAFEPRI) is machine-dependent, thus this value is initialized and
* maintained in the machine-dependent layers. This priority will typically
* be 0, or the lowest priority that is safe for use on the interrupt stack;
* it can be made higher to block network software interrupts after panics.
*/
#ifndef IPL_SAFEPRI
#define IPL_SAFEPRI 0
#endif
static int sleepq_sigtoerror(lwp_t *, int);
/* General purpose sleep table, used by mtsleep() and condition variables. */
sleeptab_t sleeptab __cacheline_aligned;
sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE] __cacheline_aligned;
/*
* sleeptab_init:
*
* Initialize a sleep table.
*/
void
sleeptab_init(sleeptab_t *st)
{
static bool again;
int i;
for (i = 0; i < SLEEPTAB_HASH_SIZE; i++) {
if (!again) {
mutex_init(&sleepq_locks[i].lock, MUTEX_DEFAULT,
IPL_SCHED);
}
sleepq_init(&st->st_queue[i]);
}
again = true;
}
/*
* sleepq_init:
*
* Prepare a sleep queue for use.
*/
void
sleepq_init(sleepq_t *sq)
{
LIST_INIT(sq);
}
/*
* sleepq_remove:
*
* Remove an LWP from a sleep queue and wake it up. Distinguish
* between deliberate wakeups (which convey valuable information) and
* "unsleep" (where an out-of-band action must be taken).
*
* For a wakeup, convert any interruptible wait into a non-interruptible
* one before waking the LWP. Otherwise, if only one LWP is awoken it
* could fail to do something useful with the wakeup due to an error
* return and the caller of e.g. cv_signal() may not expect this.
*/
void
sleepq_remove(sleepq_t *sq, lwp_t *l, bool wakeup)
{
struct schedstate_percpu *spc;
struct cpu_info *ci;
KASSERT(lwp_locked(l, NULL));
if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_NULL) == 0) {
KASSERT(sq != NULL);
LIST_REMOVE(l, l_sleepchain);
} else {
KASSERT(sq == NULL);
}
l->l_syncobj = &sched_syncobj;
l->l_wchan = NULL;
l->l_sleepq = NULL;
l->l_flag &= wakeup ? ~(LW_SINTR|LW_CATCHINTR|LW_STIMO) : ~LW_SINTR;
ci = l->l_cpu;
spc = &ci->ci_schedstate;
/*
* If not sleeping, the LWP must have been suspended. Let whoever
* holds it stopped set it running again.
*/
if (l->l_stat != LSSLEEP) {
KASSERT(l->l_stat == LSSTOP || l->l_stat == LSSUSPENDED);
lwp_setlock(l, spc->spc_lwplock);
return;
}
/*
* If the LWP is still on the CPU, mark it as LSONPROC. It may be
* about to call mi_switch(), in which case it will yield.
*/
if ((l->l_pflag & LP_RUNNING) != 0) {
l->l_stat = LSONPROC;
l->l_slptime = 0;
lwp_setlock(l, spc->spc_lwplock);
return;
}
/* Update sleep time delta, call the wake-up handler of scheduler */
l->l_slpticksum += (getticks() - l->l_slpticks);
sched_wakeup(l);
/* Look for a CPU to wake up */
l->l_cpu = sched_takecpu(l);
ci = l->l_cpu;
spc = &ci->ci_schedstate;
/*
* Set it running.
*/
spc_lock(ci);
lwp_setlock(l, spc->spc_mutex);
sched_setrunnable(l);
l->l_stat = LSRUN;
l->l_slptime = 0;
sched_enqueue(l);
sched_resched_lwp(l, true);
/* LWP & SPC now unlocked, but we still hold sleep queue lock. */
}
/*
* sleepq_insert:
*
* Insert an LWP into the sleep queue, optionally sorting by priority.
*/
static void
sleepq_insert(sleepq_t *sq, lwp_t *l, syncobj_t *sobj)
{
if ((sobj->sobj_flag & SOBJ_SLEEPQ_NULL) != 0) {
KASSERT(sq == NULL);
return;
}
KASSERT(sq != NULL);
if ((sobj->sobj_flag & SOBJ_SLEEPQ_SORTED) != 0) {
lwp_t *l2, *l_last = NULL;
const pri_t pri = lwp_eprio(l);
LIST_FOREACH(l2, sq, l_sleepchain) {
l_last = l2;
if (lwp_eprio(l2) < pri) {
LIST_INSERT_BEFORE(l2, l, l_sleepchain);
return;
}
}
/*
* Ensure FIFO ordering if no waiters are of lower priority.
*/
if (l_last != NULL) {
LIST_INSERT_AFTER(l_last, l, l_sleepchain);
return;
}
}
LIST_INSERT_HEAD(sq, l, l_sleepchain);
}
/*
* sleepq_enter:
*
* Prepare to block on a sleep queue, after which any interlock can be
* safely released.
*/
int
sleepq_enter(sleepq_t *sq, lwp_t *l, kmutex_t *mp)
{
int nlocks;
KASSERT((sq != NULL) == (mp != NULL));
/*
* Acquire the per-LWP mutex and lend it our sleep queue lock.
* Once interlocked, we can release the kernel lock.
*/
lwp_lock(l);
if (mp != NULL) {
lwp_unlock_to(l, mp);
}
if (__predict_false((nlocks = l->l_blcnt) != 0)) {
KERNEL_UNLOCK_ALL(NULL, NULL);
}
return nlocks;
}
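/*
 * Illustrative sketch (not part of the original source): the canonical
 * enter/enqueue/block sequence described in the comments above, roughly
 * as a higher-level primitive would use it.  The parameters (sleep queue,
 * its spin lock, wait channel, sync object, interlock) are placeholders;
 * the sleep queue lock is assumed to be held already on entry.
 */
#if 0
static int
example_wait(kmutex_t *interlock, sleepq_t *sq, kmutex_t *sq_lock,
    wchan_t wchan, syncobj_t *sobj, int timo)
{
	lwp_t *l = curlwp;
	int nlocks, error;

	/* Lend the LWP our sleep queue lock; may drop the kernel lock. */
	nlocks = sleepq_enter(sq, l, sq_lock);

	/* Put the LWP on the queue and mark it LSSLEEP. */
	sleepq_enqueue(sq, wchan, "example", sobj, true);

	/* With the LWP interlocked, the caller's lock can be released. */
	mutex_exit(interlock);

	/* Switch away; returns once woken, interrupted or timed out. */
	error = sleepq_block(timo, true, sobj, nlocks);

	mutex_enter(interlock);
	return error;
}
#endif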
/*
* sleepq_enqueue:
*
* Enter an LWP into the sleep queue and prepare for sleep. The sleep
* queue must already be locked, and any interlock (such as the kernel
* lock) must have been released (see sleeptab_lookup(), sleepq_enter()).
*/
void
sleepq_enqueue(sleepq_t *sq, wchan_t wchan, const char *wmesg, syncobj_t *sobj,
bool catch_p)
{
lwp_t *l = curlwp;
KASSERT(lwp_locked(l, NULL));
KASSERT(l->l_stat == LSONPROC);
KASSERT(l->l_wchan == NULL);
KASSERT(l->l_sleepq == NULL);
KASSERT((l->l_flag & LW_SINTR) == 0);
l->l_syncobj = sobj;
l->l_wchan = wchan;
l->l_sleepq = sq;
l->l_wmesg = wmesg;
l->l_slptime = 0;
l->l_stat = LSSLEEP;
if (catch_p)
l->l_flag |= LW_SINTR;
sleepq_insert(sq, l, sobj);
/* Record the time at which the thread went to sleep */
l->l_slpticks = getticks();
sched_slept(l);
}
/*
* sleepq_transfer:
*
* Move an LWP from one sleep queue to another. Both sleep queues
* must already be locked.
*
* The LWP will be updated with the new sleepq, wchan, wmesg,
* sobj, and mutex. The interruptible flag will also be updated.
*/
void
sleepq_transfer(lwp_t *l, sleepq_t *from_sq, sleepq_t *sq, wchan_t wchan,
const char *wmesg, syncobj_t *sobj, kmutex_t *mp, bool catch_p)
{
KASSERT(l->l_sleepq == from_sq);
LIST_REMOVE(l, l_sleepchain);
l->l_syncobj = sobj;
l->l_wchan = wchan;
l->l_sleepq = sq;
l->l_wmesg = wmesg;
if (catch_p)
l->l_flag = LW_SINTR | LW_CATCHINTR;
else
l->l_flag = ~(LW_SINTR | LW_CATCHINTR);
/*
* This allows the transfer from one sleepq to another where
* it is known that they're both protected by the same lock.
*/
if (mp != NULL)
lwp_setlock(l, mp);
sleepq_insert(sq, l, sobj);
}
/*
* sleepq_uncatch:
*
* Mark the LWP as no longer sleeping interruptibly.
*/
void
sleepq_uncatch(lwp_t *l)
{
l->l_flag &= ~(LW_SINTR | LW_CATCHINTR | LW_STIMO);
}
/*
* sleepq_block:
*
* After any intermediate step such as releasing an interlock, switch.
* sleepq_block() may return early under exceptional conditions, for
* example if the LWP's containing process is exiting.
*
* timo is a timeout in ticks. timo = 0 specifies an infinite timeout.
*/
int
sleepq_block(int timo, bool catch_p, syncobj_t *syncobj, int nlocks)
{
const int mask = LW_CANCELLED|LW_WEXIT|LW_WCORE|LW_PENDSIG;
int error = 0, sig, flag;
struct proc *p;
lwp_t *l = curlwp;
bool early = false;
ktrcsw(1, 0, syncobj);
/*
* If sleeping interruptibly, check for pending signals, exits or
* core dump events.
*
* Note the usage of LW_CATCHINTR. This expresses our intent
* to catch or not catch sleep interruptions, which might change
* while we are sleeping. It is independent from LW_SINTR because
* we don't want to leave LW_SINTR set when the LWP is not asleep.
*/
if (catch_p) {
if ((l->l_flag & (LW_CANCELLED|LW_WEXIT|LW_WCORE)) != 0) {
l->l_flag &= ~LW_CANCELLED;
error = EINTR;
early = true;
} else if ((l->l_flag & LW_PENDSIG) != 0 && sigispending(l, 0))
early = true;
l->l_flag |= LW_CATCHINTR;
} else
l->l_flag &= ~LW_CATCHINTR;
if (early) {
/* lwp_unsleep() will release the lock */
lwp_unsleep(l, true);
} else {
/*
* The LWP may have already been awoken if the caller
* dropped the sleep queue lock between sleepq_enqueue() and
* sleepq_block(). If that happens l_stat will be LSONPROC
* and mi_switch() will treat this as a preemption. No need
* to do anything special here.
*/
if (timo) {
l->l_flag &= ~LW_STIMO;
callout_schedule(&l->l_timeout_ch, timo);
}
l->l_boostpri = l->l_syncobj->sobj_boostpri;
spc_lock(l->l_cpu);
mi_switch(l);
/* The LWP and sleep queue are now unlocked. */
if (timo) {
/*
* Even if the callout appears to have fired, we
* need to stop it in order to synchronise with
* other CPUs. It's important that we do this in
* this LWP's context, and not during wakeup, in
* order to keep the callout & its cache lines
* co-located on the CPU with the LWP.
*/
(void)callout_halt(&l->l_timeout_ch, NULL);
error = (l->l_flag & LW_STIMO) ? EWOULDBLOCK : 0;
}
}
/*
* LW_CATCHINTR is only modified in this function OR when we
* are asleep (with the sleepq locked). We can therefore safely
* test it unlocked here as it is guaranteed to be stable by
* virtue of us running.
*
* We do not bother clearing it if set; that would require us
* to take the LWP lock, and it doesn't seem worth the hassle
* considering it is only meaningful here inside this function,
* and is set to reflect intent upon entry.
*/
flag = atomic_load_relaxed(&l->l_flag);
if (__predict_false((flag & mask) != 0)) {
if ((flag & LW_CATCHINTR) == 0 || error != 0)
/* nothing */;
else if ((flag & (LW_CANCELLED | LW_WEXIT | LW_WCORE)) != 0)
error = EINTR;
else if ((flag & LW_PENDSIG) != 0) {
/*
* Acquiring p_lock may cause us to recurse
* through the sleep path and back into this
* routine, but is safe because LWPs sleeping
* on locks are non-interruptible and we will
* not recurse again.
*/
p = l->l_proc;
mutex_enter(p->p_lock);
if (((sig = sigispending(l, 0)) != 0 && (sigprop[sig] & SA_STOP) == 0) ||
(sig = issignal(l)) != 0)
error = sleepq_sigtoerror(l, sig);
mutex_exit(p->p_lock);
}
}
ktrcsw(0, 0, syncobj);
if (__predict_false(nlocks != 0)) {
KERNEL_LOCK(nlocks, NULL);
}
return error;
}
/*
* sleepq_wake:
*
* Wake zero or more LWPs blocked on a single wait channel.
*/
void
sleepq_wake(sleepq_t *sq, wchan_t wchan, u_int expected, kmutex_t *mp)
{
lwp_t *l, *next;
KASSERT(mutex_owned(mp));
for (l = LIST_FIRST(sq); l != NULL; l = next) {
KASSERT(l->l_sleepq == sq);
KASSERT(l->l_mutex == mp);
next = LIST_NEXT(l, l_sleepchain);
if (l->l_wchan != wchan)
continue;
sleepq_remove(sq, l, true);
if (--expected == 0)
break;
}
mutex_spin_exit(mp);
}
/*
* sleepq_unsleep:
*
* Remove an LWP from its sleep queue and set it runnable again.
* sleepq_unsleep() is called with the LWP's mutex held, and will
* release it if "unlock" is true.
*/
void
sleepq_unsleep(lwp_t *l, bool unlock)
{
sleepq_t *sq = l->l_sleepq;
kmutex_t *mp = l->l_mutex;
KASSERT(lwp_locked(l, mp));
KASSERT(l->l_wchan != NULL);
sleepq_remove(sq, l, false);
if (unlock) {
mutex_spin_exit(mp);
}
}
/*
* sleepq_timeout:
*
* Entered via the callout(9) subsystem to time out an LWP that is on a
* sleep queue.
*/
void
sleepq_timeout(void *arg)
{
lwp_t *l = arg;
/*
* Lock the LWP. Assuming it's still on the sleep queue, its
* current mutex will also be the sleep queue mutex.
*/
lwp_lock(l);
if (l->l_wchan == NULL || l->l_syncobj == &callout_syncobj) {
/*
* Somebody beat us to it, or the LWP is blocked in
* callout_halt() waiting for us to finish here. In
* neither case should the LWP produce EWOULDBLOCK.
*/
lwp_unlock(l);
return;
}
l->l_flag |= LW_STIMO;
lwp_unsleep(l, true);
}
/*
* sleepq_sigtoerror:
*
* Given a signal number, interpret and return an error code.
*/
static int
sleepq_sigtoerror(lwp_t *l, int sig)
{
struct proc *p = l->l_proc;
int error;
KASSERT(mutex_owned(p->p_lock));
/*
* If this sleep was canceled, don't let the syscall restart.
*/
if ((SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
error = EINTR;
else
error = ERESTART;
return error;
}
/*
* sleepq_abort:
*
* After a panic or during autoconfiguration, lower the interrupt
* priority level to give pending interrupts a chance to run, and
* then return. Called if sleepq_dontsleep() returns non-zero, and
* always returns zero.
*/
int
sleepq_abort(kmutex_t *mtx, int unlock)
{
int s;
s = splhigh();
splx(IPL_SAFEPRI);
splx(s);
if (mtx != NULL && unlock != 0)
mutex_exit(mtx);
return 0;
}
/*
* sleepq_reinsert:
*
* Move the position of the lwp in the sleep queue after a possible
* change of the lwp's effective priority.
*/
static void
sleepq_reinsert(sleepq_t *sq, lwp_t *l)
{
KASSERT(l->l_sleepq == sq);
if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_SORTED) == 0) {
return;
}
/*
* Don't let the sleep queue become empty, even briefly.
* cv_signal() and cv_broadcast() inspect it without the
* sleep queue lock held and need to see a non-empty queue
* head if there are waiters.
*/
if (LIST_FIRST(sq) == l && LIST_NEXT(l, l_sleepchain) == NULL) {
return;
}
LIST_REMOVE(l, l_sleepchain);
sleepq_insert(sq, l, l->l_syncobj);
}
/*
* sleepq_changepri:
*
* Adjust the priority of an LWP residing on a sleepq.
*/
void
sleepq_changepri(lwp_t *l, pri_t pri)
{
sleepq_t *sq = l->l_sleepq;
KASSERT(lwp_locked(l, NULL));
l->l_priority = pri;
sleepq_reinsert(sq, l);
}
/*
* sleepq_lendpri:
*
* Adjust the lended priority of an LWP residing on a sleepq.
*/
void
sleepq_lendpri(lwp_t *l, pri_t pri)
{
sleepq_t *sq = l->l_sleepq;
KASSERT(lwp_locked(l, NULL));
l->l_inheritedprio = pri;
l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
sleepq_reinsert(sq, l);
}
/* $NetBSD: sys_socket.c,v 1.81 2023/04/22 13:53:02 riastradh Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_socket.c 8.3 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_socket.c,v 1.81 2023/04/22 13:53:02 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/route.h>
static int soo_fpathconf(struct file *, int, register_t *);
static int soo_posix_fadvise(struct file *, off_t, off_t, int);
const struct fileops socketops = {
.fo_name = "socket",
.fo_read = soo_read,
.fo_write = soo_write,
.fo_ioctl = soo_ioctl,
.fo_fcntl = fnullop_fcntl,
.fo_poll = soo_poll,
.fo_stat = soo_stat,
.fo_close = soo_close,
.fo_kqfilter = soo_kqfilter,
.fo_restart = soo_restart,
.fo_fpathconf = soo_fpathconf,
.fo_posix_fadvise = soo_posix_fadvise,
};
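/*
 * Illustrative sketch (not part of the original source): roughly how the
 * generic descriptor layer reaches the fileops table above.  The exact
 * call made by the kernel's read path may differ; this only shows the
 * indirection that lands a read on a socket descriptor in soo_read().
 */
#if 0
static int
example_fo_read(file_t *fp, struct uio *uio, int flags)
{
	/* For a socket, fp->f_ops == &socketops, so fo_read == soo_read. */
	return (*fp->f_ops->fo_read)(fp, &fp->f_offset, uio, fp->f_cred,
	    flags);
}
#endif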
int (*ifioctl)(struct socket *, u_long, void *, struct lwp *) = (void *)eopnotsupp;
/* ARGSUSED */
int
soo_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct socket *so = fp->f_socket;
int error;
error = (*so->so_receive)(so, NULL, uio, NULL, NULL, NULL);
return error;
}
/* ARGSUSED */
int
soo_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct socket *so = fp->f_socket;
int error;
error = (*so->so_send)(so, NULL, uio, NULL, NULL, 0, curlwp);
return error;
}
int
soo_ioctl(file_t *fp, u_long cmd, void *data)
{
struct socket *so = fp->f_socket;
int error = 0;
switch (cmd) {
case FIONBIO:
solock(so);
if (*(int *)data)
so->so_state |= SS_NBIO;
else
so->so_state &= ~SS_NBIO;
sounlock(so);
break;
case FIOASYNC:
solock(so);
if (*(int *)data) {
so->so_rcv.sb_flags |= SB_ASYNC;
so->so_snd.sb_flags |= SB_ASYNC;
} else {
so->so_rcv.sb_flags &= ~SB_ASYNC;
so->so_snd.sb_flags &= ~SB_ASYNC;
}
sounlock(so);
break;
case FIONREAD:
*(int *)data = so->so_rcv.sb_cc;
break;
case FIONWRITE:
*(int *)data = so->so_snd.sb_cc;
break;
case FIONSPACE:
/*
* See the comment around sbspace()'s definition
* in sys/socketvar.h regarding the maximum counts
* to understand the following test. We detect overflow
* and return zero.
*/
solock(so);
if ((so->so_snd.sb_hiwat < so->so_snd.sb_cc) ||
(so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt))
*(int *)data = 0;
else
*(int *)data = sbspace(&so->so_snd);
sounlock(so);
break;
case SIOCSPGRP:
case FIOSETOWN:
case TIOCSPGRP:
error = fsetown(&so->so_pgid, cmd, data);
break;
case SIOCGPGRP:
case FIOGETOWN:
case TIOCGPGRP:
error = fgetown(so->so_pgid, cmd, data);
break;
case SIOCATMARK:
*(int *)data = (so->so_state&SS_RCVATMARK) != 0;
break;
case SIOCPEELOFF:
solock(so);
error = do_sys_peeloff(so, data);
sounlock(so);
break;
default:
/*
* Interface/routing/protocol specific ioctls:
* interface and routing ioctls should have a
* different entry since a socket is unnecessary for them.
*/
if (IOCGROUP(cmd) == 'i')
/*
* KERNEL_LOCK will be held later if if_ioctl() of the
* interface isn't MP-safe.
*/
error = ifioctl(so, cmd, data, curlwp);
else {
KERNEL_LOCK(1, NULL);
error = (*so->so_proto->pr_usrreqs->pr_ioctl)(so,
cmd, data, NULL);
KERNEL_UNLOCK_ONE(NULL);
}
break;
}
return error;
}
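/*
 * Illustrative sketch (not part of the original source): how userland
 * reaches the FIONREAD and FIONBIO cases handled by soo_ioctl() above.
 * This is ordinary application code, shown only as an example; error
 * handling is minimal.
 */
#if 0
/* userland example, not kernel code */
#include <sys/ioctl.h>
#include <stdio.h>

static void
show_socket_state(int s)
{
	int nread = 0, on = 1;

	if (ioctl(s, FIONREAD, &nread) == 0)	/* so_rcv.sb_cc above */
		printf("%d bytes queued for reading\n", nread);
	if (ioctl(s, FIONBIO, &on) == 0)	/* sets SS_NBIO above */
		printf("socket is now non-blocking\n");
}
#endif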
int
soo_poll(file_t *fp, int events)
{
return sopoll(fp->f_socket, events);
}
int
soo_stat(file_t *fp, struct stat *ub)
{
struct socket *so = fp->f_socket;
int error;
memset(ub, 0, sizeof(*ub));
ub->st_mode = S_IFSOCK;
solock(so);
error = (*so->so_proto->pr_usrreqs->pr_stat)(so, ub);
sounlock(so);
return error;
}
/* ARGSUSED */
int
soo_close(file_t *fp)
{
int error = 0;
if (fp->f_socket)
error = soclose(fp->f_socket);
fp->f_socket = NULL;
return error;
}
void
soo_restart(file_t *fp)
{
sorestart(fp->f_socket);
}
static int
soo_fpathconf(struct file *fp, int name, register_t *retval)
{
switch (name) {
case _PC_PIPE_BUF:
*retval = PIPE_BUF;
return 0;
default:
return EINVAL;
}
}
static int
soo_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice)
{
return ESPIPE;
}
/* $NetBSD: dbregs.c,v 1.15 2020/01/31 08:55:38 maxv Exp $ */
/*
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/types.h>
#include <sys/lwp.h>
#include <sys/pool.h>
#include <x86/cpufunc.h>
#include <x86/dbregs.h>
#include <uvm/uvm_prot.h>
#include <uvm/uvm_pmap.h>
#include <machine/pmap.h>
struct pool x86_dbregspl;
static struct dbreg initdbstate;
#define X86_BREAKPOINT_CONDITION_DETECTED ( \
X86_DR6_DR0_BREAKPOINT_CONDITION_DETECTED | \
X86_DR6_DR1_BREAKPOINT_CONDITION_DETECTED | \
X86_DR6_DR2_BREAKPOINT_CONDITION_DETECTED | \
X86_DR6_DR3_BREAKPOINT_CONDITION_DETECTED )
#define X86_GLOBAL_BREAKPOINT ( \
X86_DR7_GLOBAL_DR0_BREAKPOINT | \
X86_DR7_GLOBAL_DR1_BREAKPOINT | \
X86_DR7_GLOBAL_DR2_BREAKPOINT | \
X86_DR7_GLOBAL_DR3_BREAKPOINT )
void
x86_dbregs_init(void)
{
/* DR0-DR3 should always be 0 */
initdbstate.dr[0] = rdr0();
initdbstate.dr[1] = rdr1();
initdbstate.dr[2] = rdr2();
initdbstate.dr[3] = rdr3();
/* DR4-DR5 are reserved - skip */
/* DR6 and DR7 contain predefined nonzero bits */
initdbstate.dr[6] = rdr6();
initdbstate.dr[7] = rdr7();
/* DR8-DR15 are reserved - skip */
/*
* Explicitly reset some bits just in case they could be
* set by brave software/hardware before the kernel boot.
*/
initdbstate.dr[6] &= ~X86_BREAKPOINT_CONDITION_DETECTED;
initdbstate.dr[7] &= ~X86_DR7_GENERAL_DETECT_ENABLE;
pool_init(&x86_dbregspl, sizeof(struct dbreg), 16, 0, 0, "dbregs",
NULL, IPL_NONE);
}
static void
x86_dbregs_reset(void)
{
/*
* It's sufficient to just disable Debug Control Register (DR7).
* It will deactivate hardware watchpoints.
*/
ldr7(0);
/*
* However at some point we need to clear Debug Status Registers
* (DR6). The CPU will never do it automatically.
*
* Clear BREAKPOINT_CONDITION_DETECTED bits and ignore the rest.
*/
ldr6(rdr6() & ~X86_BREAKPOINT_CONDITION_DETECTED);
}
void
x86_dbregs_clear(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
struct dbreg *dbregs;
KASSERT(l == curlwp);
if (__predict_true(pcb->pcb_dbregs == NULL)) {
KASSERT((pcb->pcb_flags & PCB_DBREGS) == 0);
return;
}
dbregs = pcb->pcb_dbregs;
kpreempt_disable();
pcb->pcb_dbregs = NULL;
pcb->pcb_flags &= ~PCB_DBREGS;
x86_dbregs_reset();
kpreempt_enable();
pool_put(&x86_dbregspl, dbregs);
}
void
x86_dbregs_abandon(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
kpreempt_disable();
pcb->pcb_flags &= ~PCB_DBREGS;
x86_dbregs_reset();
kpreempt_enable();
}
void
x86_dbregs_read(struct lwp *l, struct dbreg *regs)
{
struct pcb *pcb = lwp_getpcb(l);
if (pcb->pcb_dbregs == NULL) {
pcb->pcb_dbregs = pool_get(&x86_dbregspl, PR_WAITOK);
memcpy(pcb->pcb_dbregs, &initdbstate, sizeof(initdbstate));
pcb->pcb_flags |= PCB_DBREGS;
}
memcpy(regs, pcb->pcb_dbregs, sizeof(*regs));
}
void
x86_dbregs_save(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
if (!(pcb->pcb_flags & PCB_DBREGS)) {
return;
}
KASSERT(pcb->pcb_dbregs != NULL);
pcb->pcb_dbregs->dr[0] = rdr0();
pcb->pcb_dbregs->dr[1] = rdr1();
pcb->pcb_dbregs->dr[2] = rdr2();
pcb->pcb_dbregs->dr[3] = rdr3();
pcb->pcb_dbregs->dr[6] = rdr6();
pcb->pcb_dbregs->dr[7] = rdr7();
}
void
x86_dbregs_restore(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
if (!(pcb->pcb_flags & PCB_DBREGS)) {
return;
}
KASSERT(pcb->pcb_dbregs != NULL);
ldr0(pcb->pcb_dbregs->dr[0]);
ldr1(pcb->pcb_dbregs->dr[1]);
ldr2(pcb->pcb_dbregs->dr[2]);
ldr3(pcb->pcb_dbregs->dr[3]);
ldr6(pcb->pcb_dbregs->dr[6]);
ldr7(pcb->pcb_dbregs->dr[7]);
}
void
x86_dbregs_store_dr6(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
KASSERT(l == curlwp);
KASSERT(pcb->pcb_dbregs != NULL);
pcb->pcb_dbregs->dr[6] = rdr6();
}
int
x86_dbregs_user_trap(void)
{
register_t dr7, dr6;
register_t bp;
dr7 = rdr7();
if ((dr7 & X86_GLOBAL_BREAKPOINT) == 0) {
/*
* All Global Breakpoint bits are zero, thus the trap couldn't
* have been caused by the hardware debug registers.
*/
return 0;
}
dr6 = rdr6();
bp = dr6 & X86_BREAKPOINT_CONDITION_DETECTED;
if (!bp) {
/*
* None of the breakpoint bits are set, meaning this
* trap was not caused by any of the debug registers.
*/
return 0;
}
/*
* At least one of the breakpoints was hit, check to see
* which ones and if any of them are user space addresses.
*/
if (bp & X86_DR6_DR0_BREAKPOINT_CONDITION_DETECTED)
if (rdr0() < (vaddr_t)VM_MAXUSER_ADDRESS)
return 1;
if (bp & X86_DR6_DR1_BREAKPOINT_CONDITION_DETECTED)
if (rdr1() < (vaddr_t)VM_MAXUSER_ADDRESS)
return 1;
if (bp & X86_DR6_DR2_BREAKPOINT_CONDITION_DETECTED)
if (rdr2() < (vaddr_t)VM_MAXUSER_ADDRESS)
return 1;
if (bp & X86_DR6_DR3_BREAKPOINT_CONDITION_DETECTED)
if (rdr3() < (vaddr_t)VM_MAXUSER_ADDRESS)
return 1;
return 0;
}
int
x86_dbregs_validate(const struct dbreg *regs)
{
size_t i;
/* Check that DR0-DR3 contain user-space address */
for (i = 0; i < X86_DBREGS; i++) {
if (regs->dr[i] >= (vaddr_t)VM_MAXUSER_ADDRESS)
return EINVAL;
}
#ifndef i386
if (regs->dr[6] & X86_DR6_MBZ) {
return EINVAL;
}
if (regs->dr[7] & X86_DR7_MBZ) {
return EINVAL;
}
#endif
if (regs->dr[7] & X86_DR7_GENERAL_DETECT_ENABLE) {
return EINVAL;
}
/*
* Skip checks for reserved registers (DR4-DR5, DR8-DR15).
*/
return 0;
}
void
x86_dbregs_write(struct lwp *l, const struct dbreg *regs)
{
struct pcb *pcb = lwp_getpcb(l);
if (pcb->pcb_dbregs == NULL) {
pcb->pcb_dbregs = pool_get(&x86_dbregspl, PR_WAITOK);
}
memcpy(pcb->pcb_dbregs, regs, sizeof(*regs));
pcb->pcb_flags |= PCB_DBREGS;
}
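/*
 * Illustrative sketch (not part of the original source): the expected
 * validate-then-write pattern for a debugger-style consumer of the
 * functions above.  The helper name set_lwp_watch_regs() is hypothetical.
 */
#if 0
static int
set_lwp_watch_regs(struct lwp *l, const struct dbreg *regs)
{
	int error;

	/* Reject kernel addresses and must-be-zero bits up front. */
	error = x86_dbregs_validate(regs);
	if (error)
		return error;

	/* Install; PCB_DBREGS makes the state follow the LWP on switch. */
	x86_dbregs_write(l, regs);
	return 0;
}
#endif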
/*
* Called with preemption disabled.
*/
void
x86_dbregs_switch(struct lwp *oldlwp, struct lwp *newlwp)
{
struct pcb *oldpcb, *newpcb;
bool olddb, newdb;
oldpcb = lwp_getpcb(oldlwp);
newpcb = lwp_getpcb(newlwp);
olddb = (oldpcb->pcb_flags & PCB_DBREGS) != 0;
newdb = (newpcb->pcb_flags & PCB_DBREGS) != 0;
if (__predict_true(!olddb && !newdb)) {
/* fast path */
return;
}
if (olddb) {
x86_dbregs_save(oldlwp);
}
if (newdb) {
x86_dbregs_restore(newlwp);
} else if (olddb) {
x86_dbregs_reset();
}
}
/* $NetBSD: ffs_inode.c,v 1.131 2020/07/31 04:07:30 chs Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.131 2020/07/31 04:07:30 chs Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/file.h>
#include <sys/fstrans.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/trace.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
static int ffs_indirtrunc(struct inode *, daddr_t, daddr_t, daddr_t, int,
int64_t *);
/*
* Update the access, modified, and inode change times as specified
* by the IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively.
* The IN_MODIFIED flag is used to specify that the inode needs to be
* updated but that the times have already been set. The access
* and modified times are taken from the second and third parameters;
* the inode change time is always taken from the current time. If
* the UPDATE_WAIT flag or the UPDATE_DIROP flag is set, then wait for the
* disk write of the inode to complete.
*/
int
ffs_update(struct vnode *vp, const struct timespec *acc,
const struct timespec *mod, int updflags)
{
struct fs *fs;
struct buf *bp;
struct inode *ip;
int error;
void *cp;
int waitfor, flags;
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (0);
ip = VTOI(vp);
FFS_ITIMES(ip, acc, mod, NULL);
if (updflags & UPDATE_CLOSE)
flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED);
else
flags = ip->i_flag & IN_MODIFIED;
if (flags == 0)
return (0);
fs = ip->i_fs;
if ((flags & IN_MODIFIED) != 0 &&
(vp->v_mount->mnt_flag & MNT_ASYNC) == 0) {
waitfor = updflags & UPDATE_WAIT;
if ((updflags & UPDATE_DIROP) != 0)
waitfor |= UPDATE_WAIT;
} else
waitfor = 0;
/*
* Ensure that uid and gid are correct. This is a temporary
* fix until fsck has been changed to do the update.
*/
if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */
fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */
ip->i_ffs1_ouid = ip->i_uid; /* XXX */
ip->i_ffs1_ogid = ip->i_gid; /* XXX */
} /* XXX */
error = bread(ip->i_devvp, FFS_FSBTODB(fs, ino_to_fsba(fs, ip->i_number)),
(int)fs->fs_bsize, B_MODIFY, &bp);
if (error) {
return (error);
}
ip->i_flag &= ~(IN_MODIFIED | IN_ACCESSED);
/* Keep unlinked inode list up to date */
KDASSERTMSG(DIP(ip, nlink) == ip->i_nlink,
"DIP(ip, nlink) [%d] == ip->i_nlink [%d]",
DIP(ip, nlink), ip->i_nlink);
if (ip->i_mode) {
if (ip->i_nlink > 0) {
UFS_WAPBL_UNREGISTER_INODE(ip->i_ump->um_mountp,
ip->i_number, ip->i_mode);
} else {
UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp,
ip->i_number, ip->i_mode);
}
}
if (fs->fs_magic == FS_UFS1_MAGIC) {
cp = (char *)bp->b_data +
(ino_to_fsbo(fs, ip->i_number) * DINODE1_SIZE);
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs))
ffs_dinode1_swap(ip->i_din.ffs1_din,
(struct ufs1_dinode *)cp);
else
#endif
memcpy(cp, ip->i_din.ffs1_din, DINODE1_SIZE);
} else {
cp = (char *)bp->b_data +
(ino_to_fsbo(fs, ip->i_number) * DINODE2_SIZE);
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs))
ffs_dinode2_swap(ip->i_din.ffs2_din,
(struct ufs2_dinode *)cp);
else
#endif
memcpy(cp, ip->i_din.ffs2_din, DINODE2_SIZE);
}
if (waitfor) {
return (bwrite(bp));
} else {
bdwrite(bp);
return (0);
}
}
#define SINGLE 0 /* index of single indirect block */
#define DOUBLE 1 /* index of double indirect block */
#define TRIPLE 2 /* index of triple indirect block */
/*
* Truncate the inode oip to at most length size, freeing the
* disk blocks.
*/
int
ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred)
{
daddr_t lastblock;
struct inode *oip = VTOI(ovp);
struct mount *omp = ovp->v_mount;
daddr_t bn, lastiblock[UFS_NIADDR], indir_lbn[UFS_NIADDR];
daddr_t blks[UFS_NDADDR + UFS_NIADDR], oldblks[UFS_NDADDR + UFS_NIADDR];
struct fs *fs;
int extblocks;
int offset, pgoffset, level;
int64_t blocksreleased = 0, datablocks;
int i, aflag, nblocks;
int error, allerror = 0;
off_t osize;
int sync;
struct ufsmount *ump = oip->i_ump;
void *dcookie;
long bsize;
bool wapbl = omp->mnt_wapbl != NULL;
UFS_WAPBL_JLOCK_ASSERT(ump->um_mountp);
if (ovp->v_type == VCHR || ovp->v_type == VBLK ||
ovp->v_type == VFIFO || ovp->v_type == VSOCK) {
KASSERT(oip->i_size == 0);
return 0;
}
if (length < 0)
return (EINVAL);
/*
* Historically clients did not have to specify which data
* they were truncating. So, if not specified, we assume
* traditional behavior, e.g., just the normal data.
*/
if ((ioflag & (IO_EXT | IO_NORMAL)) == 0)
ioflag |= IO_NORMAL;
fs = oip->i_fs;
#define i_din2 i_din.ffs2_din
extblocks = 0;
datablocks = DIP(oip, blocks);
if (fs->fs_magic == FS_UFS2_MAGIC && oip->i_din2->di_extsize > 0) {
extblocks = btodb(ffs_fragroundup(fs, oip->i_din2->di_extsize));
datablocks -= extblocks;
}
if ((ioflag & IO_EXT) && extblocks > 0) {
if (length != 0)
panic("ffs_truncate: partial trunc of extdata");
{
#ifdef QUOTA
(void) chkdq(oip, -extblocks, NOCRED, FORCE);
#endif
osize = oip->i_din2->di_extsize;
oip->i_din2->di_blocks -= extblocks;
oip->i_din2->di_extsize = 0;
for (i = 0; i < UFS_NXADDR; i++) {
binvalbuf(ovp, -1 - i);
oldblks[i] = oip->i_din2->di_extb[i];
oip->i_din2->di_extb[i] = 0;
}
oip->i_flag |= IN_CHANGE;
if ((error = ffs_update(ovp, NULL, NULL, 0)))
return (error);
for (i = 0; i < UFS_NXADDR; i++) {
if (oldblks[i] == 0)
continue;
bsize = ffs_sblksize(fs, osize, i);
if (wapbl) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(omp,
FFS_FSBTODB(fs, oldblks[i]), bsize, NULL);
if (error)
return error;
} else
ffs_blkfree(fs, oip->i_devvp, oldblks[i],
bsize, oip->i_number);
}
extblocks = 0;
}
}
if ((ioflag & IO_NORMAL) == 0)
return (0);
if (ovp->v_type == VLNK &&
(oip->i_size < ump->um_maxsymlinklen ||
(ump->um_maxsymlinklen == 0 && datablocks == 0))) {
KDASSERT(length == 0);
memset(SHORTLINK(oip), 0, (size_t)oip->i_size);
oip->i_size = 0;
DIP_ASSIGN(oip, size, 0);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (ffs_update(ovp, NULL, NULL, 0));
}
if (oip->i_size == length) {
/* still do a uvm_vnp_setsize() as writesize may be larger */
uvm_vnp_setsize(ovp, length);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (ffs_update(ovp, NULL, NULL, 0));
}
if (length > ump->um_maxfilesize)
return (EFBIG);
if ((oip->i_flags & SF_SNAPSHOT) != 0)
ffs_snapremove(ovp);
osize = oip->i_size;
aflag = ioflag & IO_SYNC ? B_SYNC : 0;
/*
* Lengthen the size of the file. We must ensure that the
* last byte of the file is allocated. Since the smallest
* value of osize is 0, length will be at least 1.
*/
if (osize < length) {
if (ffs_lblkno(fs, osize) < UFS_NDADDR &&
ffs_lblkno(fs, osize) != ffs_lblkno(fs, length) &&
ffs_blkroundup(fs, osize) != osize) {
off_t eob;
eob = ffs_blkroundup(fs, osize);
uvm_vnp_setwritesize(ovp, eob);
error = ufs_balloc_range(ovp, osize, eob - osize,
cred, aflag);
if (error) {
(void) ffs_truncate(ovp, osize,
ioflag & IO_SYNC, cred);
return error;
}
if (ioflag & IO_SYNC) {
rw_enter(ovp->v_uobj.vmobjlock, RW_WRITER);
VOP_PUTPAGES(ovp,
trunc_page(osize & fs->fs_bmask),
round_page(eob), PGO_CLEANIT | PGO_SYNCIO |
PGO_JOURNALLOCKED);
}
}
uvm_vnp_setwritesize(ovp, length);
error = ufs_balloc_range(ovp, length - 1, 1, cred, aflag);
if (error) {
(void) ffs_truncate(ovp, osize, ioflag & IO_SYNC, cred);
return (error);
}
uvm_vnp_setsize(ovp, length);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
KASSERT(ovp->v_size == oip->i_size);
return (ffs_update(ovp, NULL, NULL, 0));
}
/*
* When truncating a regular file down to a non-block-aligned size,
* we must zero the part of last block which is past the new EOF.
* We must synchronously flush the zeroed pages to disk
* since the new pages will be invalidated as soon as we
* inform the VM system of the new, smaller size.
* We must do this before acquiring the GLOCK, since fetching
* the pages will acquire the GLOCK internally.
* So there is a window where another thread could see a whole
* zeroed page past EOF, but that's life.
*/
offset = ffs_blkoff(fs, length);
pgoffset = length & PAGE_MASK;
if (ovp->v_type == VREG && (pgoffset != 0 || offset != 0) &&
osize > length) {
daddr_t lbn;
voff_t eoz;
int size;
if (offset != 0) {
error = ufs_balloc_range(ovp, length - 1, 1, cred,
aflag);
if (error)
return error;
}
lbn = ffs_lblkno(fs, length);
size = ffs_blksize(fs, oip, lbn);
eoz = MIN(MAX(ffs_lblktosize(fs, lbn) + size, round_page(pgoffset)),
osize);
ubc_zerorange(&ovp->v_uobj, length, eoz - length,
UBC_VNODE_FLAGS(ovp));
if (round_page(eoz) > round_page(length)) {
rw_enter(ovp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(ovp, round_page(length),
round_page(eoz),
PGO_CLEANIT | PGO_DEACTIVATE | PGO_JOURNALLOCKED |
((ioflag & IO_SYNC) ? PGO_SYNCIO : 0));
if (error)
return error;
}
}
genfs_node_wrlock(ovp);
oip->i_size = length;
DIP_ASSIGN(oip, size, length);
uvm_vnp_setsize(ovp, length);
/*
* Calculate index into inode's block list of
* last direct and indirect blocks (if any)
* which we want to keep. Lastblock is -1 when
* the file is truncated to 0.
*/
lastblock = ffs_lblkno(fs, length + fs->fs_bsize - 1) - 1;
lastiblock[SINGLE] = lastblock - UFS_NDADDR;
lastiblock[DOUBLE] = lastiblock[SINGLE] - FFS_NINDIR(fs);
lastiblock[TRIPLE] = lastiblock[DOUBLE] - FFS_NINDIR(fs) * FFS_NINDIR(fs);
nblocks = btodb(fs->fs_bsize);
/*
* Update file and block pointers on disk before we start freeing
* blocks. If we crash before free'ing blocks below, the blocks
* will be returned to the free list. lastiblock values are also
* normalized to -1 for calls to ffs_indirtrunc below.
*/
sync = 0;
for (level = TRIPLE; level >= SINGLE; level--) {
blks[UFS_NDADDR + level] = DIP(oip, ib[level]);
if (lastiblock[level] < 0 && blks[UFS_NDADDR + level] != 0) {
sync = 1;
DIP_ASSIGN(oip, ib[level], 0);
lastiblock[level] = -1;
}
}
for (i = 0; i < UFS_NDADDR; i++) {
blks[i] = DIP(oip, db[i]);
if (i > lastblock && blks[i] != 0) {
sync = 1;
DIP_ASSIGN(oip, db[i], 0);
}
}
oip->i_flag |= IN_CHANGE | IN_UPDATE;
if (sync) {
error = ffs_update(ovp, NULL, NULL, UPDATE_WAIT);
if (error && !allerror)
allerror = error;
}
/*
* Having written the new inode to disk, save its new configuration
* and put back the old block pointers long enough to process them.
* Note that we save the new block configuration so we can check it
* when we are done.
*/
for (i = 0; i < UFS_NDADDR; i++) {
bn = DIP(oip, db[i]);
DIP_ASSIGN(oip, db[i], blks[i]);
blks[i] = bn;
}
for (i = 0; i < UFS_NIADDR; i++) {
bn = DIP(oip, ib[i]);
DIP_ASSIGN(oip, ib[i], blks[UFS_NDADDR + i]);
blks[UFS_NDADDR + i] = bn;
}
oip->i_size = osize;
DIP_ASSIGN(oip, size, osize);
error = vtruncbuf(ovp, lastblock + 1, 0, 0);
if (error && !allerror)
allerror = error;
/*
* Indirect blocks first.
*/
indir_lbn[SINGLE] = -UFS_NDADDR;
indir_lbn[DOUBLE] = indir_lbn[SINGLE] - FFS_NINDIR(fs) - 1;
indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - FFS_NINDIR(fs) * FFS_NINDIR(fs) - 1;
for (level = TRIPLE; level >= SINGLE; level--) {
bn = ffs_getib(fs, oip, level);
if (bn != 0) {
if (lastiblock[level] < 0 &&
oip->i_ump->um_mountp->mnt_wapbl) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(
oip->i_ump->um_mountp,
FFS_FSBTODB(fs, bn), fs->fs_bsize,
&dcookie);
if (error)
goto out;
} else {
dcookie = NULL;
}
error = ffs_indirtrunc(oip, indir_lbn[level],
FFS_FSBTODB(fs, bn), lastiblock[level], level,
&blocksreleased);
if (error) {
if (dcookie) {
UFS_WAPBL_UNREGISTER_DEALLOCATION(
oip->i_ump->um_mountp, dcookie);
}
goto out;
}
if (lastiblock[level] < 0) {
if (!dcookie)
ffs_blkfree(fs, oip->i_devvp, bn,
fs->fs_bsize, oip->i_number);
DIP_ASSIGN(oip, ib[level], 0);
blocksreleased += nblocks;
}
}
if (lastiblock[level] >= 0)
goto done;
}
/*
* All whole direct blocks or frags.
*/
for (i = UFS_NDADDR - 1; i > lastblock; i--) {
bn = ffs_getdb(fs, oip, i);
if (bn == 0)
continue;
bsize = ffs_blksize(fs, oip, i);
if ((oip->i_ump->um_mountp->mnt_wapbl) &&
(ovp->v_type != VREG)) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(
oip->i_ump->um_mountp,
FFS_FSBTODB(fs, bn), bsize, NULL);
if (error)
goto out;
} else
ffs_blkfree(fs, oip->i_devvp, bn, bsize, oip->i_number);
DIP_ASSIGN(oip, db[i], 0);
blocksreleased += btodb(bsize);
}
if (lastblock < 0)
goto done;
/*
* Finally, look for a change in size of the
* last direct block; release any frags.
*/
bn = ffs_getdb(fs, oip, lastblock);
if (bn != 0) {
long oldspace, newspace;
/*
* Calculate amount of space we're giving
* back as old block size minus new block size.
*/
oldspace = ffs_blksize(fs, oip, lastblock);
oip->i_size = length;
DIP_ASSIGN(oip, size, length);
newspace = ffs_blksize(fs, oip, lastblock);
if (newspace == 0)
panic("itrunc: newspace");
if (oldspace - newspace > 0) {
/*
* Block number of space to be free'd is
* the old block # plus the number of frags
* required for the storage we're keeping.
*/
bn += ffs_numfrags(fs, newspace);
if ((oip->i_ump->um_mountp->mnt_wapbl) &&
(ovp->v_type != VREG)) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(
oip->i_ump->um_mountp, FFS_FSBTODB(fs, bn),
oldspace - newspace, NULL);
if (error)
goto out;
} else
ffs_blkfree(fs, oip->i_devvp, bn,
oldspace - newspace, oip->i_number);
blocksreleased += btodb(oldspace - newspace);
}
}
done:
for (level = SINGLE; level <= TRIPLE; level++)
KASSERTMSG((blks[UFS_NDADDR + level] == DIP(oip, ib[level])),
"itrunc1 blk mismatch: %jx != %jx",
(uintmax_t)blks[UFS_NDADDR + level],
(uintmax_t)DIP(oip, ib[level]));
for (i = 0; i < UFS_NDADDR; i++)
KASSERTMSG((blks[i] == DIP(oip, db[i])),
"itrunc2 blk mismatch: %jx != %jx",
(uintmax_t)blks[i], (uintmax_t)DIP(oip, db[i]));
KASSERTMSG((length != 0 || extblocks || LIST_EMPTY(&ovp->v_cleanblkhd)),
"itrunc3: zero length and nonempty cleanblkhd");
KASSERTMSG((length != 0 || extblocks || LIST_EMPTY(&ovp->v_dirtyblkhd)),
"itrunc3: zero length and nonempty dirtyblkhd");
out:
/*
* Set length back to old size if deallocation failed. Some indirect
* blocks were deallocated creating a hole, but that is okay.
*/
if (error == EAGAIN) {
if (!allerror)
allerror = error;
length = osize;
uvm_vnp_setsize(ovp, length);
}
/*
* Put back the real size.
*/
oip->i_size = length;
DIP_ASSIGN(oip, size, length);
DIP_ADD(oip, blocks, -blocksreleased);
genfs_node_unlock(ovp);
oip->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(ovp, NULL, NULL, 0);
#if defined(QUOTA) || defined(QUOTA2)
(void) chkdq(oip, -blocksreleased, NOCRED, 0);
#endif
KASSERT(ovp->v_type != VREG || ovp->v_size == oip->i_size);
return (allerror);
}
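/*
 * Worked example (illustrative, not part of the original source) of the
 * lastblock/lastiblock computation in ffs_truncate() above.  Assume,
 * purely for illustration, fs_bsize == 16384, FFS_NINDIR(fs) == 2048 and
 * UFS_NDADDR == 12, truncating to length == 1 MiB (1048576 bytes):
 *
 *	lastblock          = howmany(1048576, 16384) - 1 = 64 - 1 = 63
 *	lastiblock[SINGLE] = 63 - 12           = 51    (partially kept)
 *	lastiblock[DOUBLE] = 51 - 2048         = -1997 (freed entirely)
 *	lastiblock[TRIPLE] = -1997 - 2048*2048 < 0     (freed entirely)
 *
 * A negative lastiblock[] entry is what causes the corresponding
 * indirect pointer to be zeroed and its whole subtree released.
 */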
/*
* Release blocks associated with the inode ip and stored in the indirect
* block bn. Blocks are free'd in LIFO order up to (but not including)
* lastbn. If level is greater than SINGLE, the block is an indirect block
* and recursive calls to indirtrunc must be used to cleanse other indirect
* blocks.
*
* NB: triple indirect blocks are untested.
*/
static int
ffs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn,
int level, int64_t *countp)
{
int i;
struct buf *bp;
struct fs *fs = ip->i_fs;
int32_t *bap1 = NULL;
int64_t *bap2 = NULL;
struct vnode *vp;
daddr_t nb, nlbn, last;
char *copy = NULL;
int64_t factor;
int64_t nblocks;
int error = 0, allerror = 0;
const int needswap = UFS_FSNEEDSWAP(fs);
const int wapbl = (ip->i_ump->um_mountp->mnt_wapbl != NULL);
void *dcookie;
#define RBAP(ip, i) (((ip)->i_ump->um_fstype == UFS1) ? \
ufs_rw32(bap1[i], needswap) : ufs_rw64(bap2[i], needswap))
#define BAP_ASSIGN(ip, i, value) \
do { \
if ((ip)->i_ump->um_fstype == UFS1) \
bap1[i] = (value); \
else \
bap2[i] = (value); \
} while(0)
/*
* Calculate index in current block of last
* block to be kept. -1 indicates the entire
* block so we need not calculate the index.
*/
factor = 1;
for (i = SINGLE; i < level; i++)
factor *= FFS_NINDIR(fs);
last = lastbn;
if (lastbn > 0)
last /= factor;
nblocks = btodb(fs->fs_bsize);
/*
* Get buffer of block pointers, zero those entries corresponding
* to blocks to be free'd, and update the on-disk copy first. Since
* the double (triple) indirect block is freed before the single (double)
* indirect block, calls to bmap on these blocks will fail. However,
* we already have
* the on disk address, so we have to set the b_blkno field
* explicitly instead of letting bread do everything for us.
*/
vp = ITOV(ip);
error = ffs_getblk(vp, lbn, FFS_NOBLK, fs->fs_bsize, false, &bp);
if (error)
return error;
if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
/* Braces must be here in case trace evaluates to nothing. */
trace(TR_BREADHIT, pack(vp, fs->fs_bsize), lbn);
} else {
trace(TR_BREADMISS, pack(vp, fs->fs_bsize), lbn);
curlwp->l_ru.ru_inblock++; /* pay for read */
bp->b_flags |= B_READ;
bp->b_flags &= ~B_COWDONE; /* we change blkno below */
if (bp->b_bcount > bp->b_bufsize)
panic("ffs_indirtrunc: bad buffer size");
bp->b_blkno = dbn;
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
VOP_STRATEGY(vp, bp);
error = biowait(bp);
if (error == 0)
error = fscow_run(bp, true);
}
if (error) {
brelse(bp, 0);
return error;
}
/*
* Clear reference to blocks to be removed on disk, before actually
* reclaiming them, so that fsck is more likely to be able to recover
* the filesystem if system goes down during the truncate process.
* This assumes the truncate process would not fail, contrary
* to the wapbl case.
*/
if (ip->i_ump->um_fstype == UFS1)
bap1 = (int32_t *)bp->b_data;
else
bap2 = (int64_t *)bp->b_data;
if (lastbn >= 0 && !wapbl) {
copy = kmem_alloc(fs->fs_bsize, KM_SLEEP);
memcpy((void *)copy, bp->b_data, (u_int)fs->fs_bsize);
for (i = last + 1; i < FFS_NINDIR(fs); i++)
BAP_ASSIGN(ip, i, 0);
error = bwrite(bp);
if (error)
allerror = error;
if (ip->i_ump->um_fstype == UFS1)
bap1 = (int32_t *)copy;
else
bap2 = (int64_t *)copy;
}
/*
* Recursively free totally unused blocks.
*/
for (i = FFS_NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
i--, nlbn += factor) {
nb = RBAP(ip, i);
if (nb == 0)
continue;
if ((ip->i_ump->um_mountp->mnt_wapbl) && ((level > SINGLE) || (ITOV(ip)->v_type != VREG))) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(
ip->i_ump->um_mountp,
FFS_FSBTODB(fs, nb), fs->fs_bsize,
&dcookie);
if (error)
goto out;
} else {
dcookie = NULL;
}
if (level > SINGLE) {
error = ffs_indirtrunc(ip, nlbn, FFS_FSBTODB(fs, nb),
(daddr_t)-1, level - 1, countp);
if (error) {
if (dcookie) {
UFS_WAPBL_UNREGISTER_DEALLOCATION(
ip->i_ump->um_mountp, dcookie);
}
goto out;
}
}
if (!dcookie)
ffs_blkfree(fs, ip->i_devvp, nb, fs->fs_bsize,
ip->i_number);
BAP_ASSIGN(ip, i, 0);
*countp += nblocks;
}
/*
* Recursively free blocks on the now last partial indirect block.
*/
if (level > SINGLE && lastbn >= 0) {
last = lastbn % factor;
nb = RBAP(ip, i);
if (nb != 0) {
error = ffs_indirtrunc(ip, nlbn, FFS_FSBTODB(fs, nb),
last, level - 1, countp);
if (error)
goto out;
}
}
out:
if (error && !allerror)
allerror = error;
if (copy != NULL) {
kmem_free(copy, fs->fs_bsize);
} else if (lastbn < 0 && error == 0) {
/* all freed, release without writing back */
brelse(bp, BC_INVAL);
} else if (wapbl) {
/* only partially freed, write the updated block */
error = bwrite(bp);
if (!allerror)
allerror = error;
}
return (allerror);
}
void
ffs_itimes(struct inode *ip, const struct timespec *acc,
const struct timespec *mod, const struct timespec *cre)
{
struct timespec now;
if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) {
return;
}
vfs_timestamp(&now);
if (ip->i_flag & IN_ACCESS) {
if (acc == NULL)
acc = &now;
DIP_ASSIGN(ip, atime, acc->tv_sec);
DIP_ASSIGN(ip, atimensec, acc->tv_nsec);
}
if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) {
if ((ip->i_flags & SF_SNAPSHOT) == 0) {
if (mod == NULL)
mod = &now;
DIP_ASSIGN(ip, mtime, mod->tv_sec);
DIP_ASSIGN(ip, mtimensec, mod->tv_nsec);
}
ip->i_modrev++;
}
if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) {
if (cre == NULL)
cre = &now;
DIP_ASSIGN(ip, ctime, cre->tv_sec);
DIP_ASSIGN(ip, ctimensec, cre->tv_nsec);
}
if (ip->i_flag & (IN_ACCESS | IN_MODIFY))
ip->i_flag |= IN_ACCESSED;
if (ip->i_flag & (IN_UPDATE | IN_CHANGE))
ip->i_flag |= IN_MODIFIED;
ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY);
}
/* $NetBSD: genfs_rename.c,v 1.7 2021/10/20 13:29:06 thorpej Exp $ */
/*-
* Copyright (c) 2012 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Generic rename abstraction.
*
* Rename is unbelievably hairy. Try to use this if you can --
* otherwise you are practically guaranteed to get it wrong.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_rename.c,v 1.7 2021/10/20 13:29:06 thorpej Exp $");
#include <sys/param.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/types.h>
#include <miscfs/genfs/genfs.h>
/*
* Sample copypasta for implementing VOP_RENAME via genfs_rename.
* Don't change this template without carefully considering whether
* every other file system that already uses it needs to change too.
* That way, once we have changed all the file systems to use it, we
* can easily replace mumblefs_rename by mumblefs_sane_rename and
* eliminate the insane API altogether.
*/
/* begin sample copypasta */
#if 0
static const struct genfs_rename_ops mumblefs_genfs_rename_ops;
/*
* mumblefs_sane_rename: The hairiest vop, with the saner API.
*
* Arguments:
*
* . fdvp (from directory vnode),
* . fcnp (from component name),
* . tdvp (to directory vnode),
* . tcnp (to component name),
* . cred (credentials structure), and
* . posixly_correct (flag for behaviour if target & source link same file).
*
* fdvp and tdvp may be the same, and must be referenced and unlocked.
*/
static int
mumblefs_sane_rename(
struct vnode *fdvp, struct componentname *fcnp,
struct vnode *tdvp, struct componentname *tcnp,
kauth_cred_t cred, bool posixly_correct)
{
struct mumblefs_lookup_results fulr, tulr;
return genfs_sane_rename(&mumblefs_genfs_rename_ops,
fdvp, fcnp, &fulr, tdvp, tcnp, &tulr,
cred, posixly_correct);
}
/*
* mumblefs_rename: The hairiest vop, with the insanest API. Defer to
* genfs_insane_rename immediately.
*/
int
mumblefs_rename(void *v)
{
return genfs_insane_rename(v, &mumblefs_sane_rename);
}
#endif
/* end sample copypasta */
/*
* Forward declarations
*/
static int genfs_rename_enter(const struct genfs_rename_ops *, struct mount *,
kauth_cred_t,
struct vnode *, struct componentname *, void *, struct vnode **,
struct vnode *, struct componentname *, void *, struct vnode **);
static int genfs_rename_enter_common(const struct genfs_rename_ops *,
struct mount *, kauth_cred_t, struct vnode *,
struct componentname *, void *, struct vnode **,
struct componentname *, void *, struct vnode **);
static int genfs_rename_enter_separate(const struct genfs_rename_ops *,
struct mount *, kauth_cred_t,
struct vnode *, struct componentname *, void *, struct vnode **,
struct vnode *, struct componentname *, void *, struct vnode **);
static int genfs_rename_lock(const struct genfs_rename_ops *, struct mount *,
kauth_cred_t, int, int, int,
struct vnode *, struct componentname *, bool, void *, struct vnode **,
struct vnode *, struct componentname *, bool, void *, struct vnode **);
static void genfs_rename_exit(const struct genfs_rename_ops *, struct mount *,
struct vnode *, struct vnode *,
struct vnode *, struct vnode *);
static int genfs_rename_remove(const struct genfs_rename_ops *, struct mount *,
kauth_cred_t,
struct vnode *, struct componentname *, void *, struct vnode *, nlink_t *);
/*
* genfs_insane_rename: Generic implementation of the insane API for
* the rename vop.
*
* Arguments:
*
* . fdvp (from directory vnode),
* . fvp (from vnode),
* . fcnp (from component name),
* . tdvp (to directory vnode),
* . tvp (to vnode, or NULL), and
* . tcnp (to component name).
*
* Any pair of vnode parameters may have the same vnode.
*
* On entry,
*
* . fdvp, fvp, tdvp, and tvp are referenced,
* . fdvp and fvp are unlocked, and
* . tdvp and tvp (if nonnull) are locked.
*
* On exit,
*
* . fdvp, fvp, tdvp, and tvp (if nonnull) are unreferenced, and
* . tdvp and tvp (if nonnull) are unlocked.
*/
int
genfs_insane_rename(void *v,
int (*sane_rename)(struct vnode *fdvp, struct componentname *fcnp,
struct vnode *tdvp, struct componentname *tcnp,
kauth_cred_t cred, bool posixly_correct))
{
struct vop_rename_args /* {
struct vnode *a_fdvp;
struct vnode *a_fvp;
struct componentname *a_fcnp;
struct vnode *a_tdvp;
struct vnode *a_tvp;
struct componentname *a_tcnp;
} */ *ap = v;
struct vnode *fdvp = ap->a_fdvp;
struct vnode *fvp = ap->a_fvp;
struct componentname *fcnp = ap->a_fcnp;
struct vnode *tdvp = ap->a_tdvp;
struct vnode *tvp = ap->a_tvp;
struct componentname *tcnp = ap->a_tcnp;
kauth_cred_t cred;
int error;
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fcnp->cn_nameptr != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
KASSERT(tcnp->cn_nameptr != NULL);
/* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
/* KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
cred = fcnp->cn_cred;
/*
* XXX Want a better equality test. `tcnp->cn_cred == cred'
* hoses p2k because puffs transmits the creds separately and
* allocates distinct but equivalent structures for them.
*/
KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred));
/*
* Sanitize our world from the VFS insanity. Unlock the target
* directory and node, which are locked. Release the children,
* which are referenced, since we'll be looking them up again
* later.
*/
VOP_UNLOCK(tdvp);
if ((tvp != NULL) && (tvp != tdvp))
VOP_UNLOCK(tvp);
vrele(fvp);
if (tvp != NULL)
vrele(tvp);
error = (*sane_rename)(fdvp, fcnp, tdvp, tcnp, cred, false);
/*
* All done, whether with success or failure. Release the
* directory nodes now, as the caller expects from the VFS
* protocol.
*/
vrele(fdvp);
vrele(tdvp);
return error;
}
/*
* genfs_sane_rename: Generic implementation of the saner API for the
* rename vop. Handles ancestry checks, locking, and permissions
* checks. Caller is responsible for implementing the genfs rename
* operations.
*
* fdvp and tdvp must be referenced and unlocked.
*/
int
genfs_sane_rename(const struct genfs_rename_ops *ops,
struct vnode *fdvp, struct componentname *fcnp, void *fde,
struct vnode *tdvp, struct componentname *tcnp, void *tde,
kauth_cred_t cred, bool posixly_correct)
{
struct mount *mp;
struct vnode *fvp = NULL, *tvp = NULL;
nlink_t tvp_new_nlink = 0;
int error;
KASSERT(ops != NULL);
KASSERT(fdvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
/* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
/* KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == tdvp->v_mount);
KASSERT(fcnp != tcnp);
KASSERT(fcnp->cn_nameiop == DELETE);
KASSERT(tcnp->cn_nameiop == RENAME);
/* XXX Want a better equality test. */
KASSERT(kauth_cred_uidmatch(cred, fcnp->cn_cred));
KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred));
mp = fdvp->v_mount;
KASSERT(mp != NULL);
KASSERT(mp == tdvp->v_mount);
/* XXX How can we be sure this stays true? */
KASSERT((mp->mnt_flag & MNT_RDONLY) == 0);
/* Reject rename("x/..", ...) and rename(..., "x/..") early. */
if ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT)
return EINVAL; /* XXX EISDIR? */
error = genfs_rename_enter(ops, mp, cred,
fdvp, fcnp, fde, &fvp,
tdvp, tcnp, tde, &tvp);
if (error)
return error;
/*
* Check that everything is locked and looks right.
*/
KASSERT(fvp != NULL);
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
/*
* If the source and destination are the same object, we need
* only at most delete the source entry. We are guaranteed at
* this point that the entries are distinct.
*/
if (fvp == tvp) {
KASSERT(tvp != NULL);
if (fvp->v_type == VDIR)
/* XXX This shouldn't be possible. */
error = EINVAL;
else if (posixly_correct)
/* POSIX sez to leave them alone. */
error = 0;
else if ((fdvp == tdvp) && (fcnp->cn_namelen == tcnp->cn_namelen) &&
(memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr,
fcnp->cn_namelen) == 0))
/* Renaming an entry over itself does nothing. */
error = 0;
else {
/* XXX Can't use VOP_REMOVE because of locking. */
error = genfs_rename_remove(ops, mp, cred,
fdvp, fcnp, fde, fvp, &tvp_new_nlink);
VN_KNOTE(fdvp, NOTE_WRITE);
VN_KNOTE(fvp,
tvp_new_nlink == 0 ? NOTE_DELETE : NOTE_LINK);
}
goto out;
}
KASSERT(fvp != tvp);
KASSERT((fdvp != tdvp) ||
(fcnp->cn_namelen != tcnp->cn_namelen) ||
(memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen)
!= 0));
/*
* If the target exists, refuse to rename a directory over a
* non-directory or vice versa, or to clobber a non-empty
* directory.
*/
if (tvp != NULL) {
if (fvp->v_type == VDIR && tvp->v_type == VDIR)
error =
(ops->gro_directory_empty_p(mp, cred, tvp, tdvp)?
0 : ENOTEMPTY);
else if (fvp->v_type == VDIR && tvp->v_type != VDIR)
error = ENOTDIR;
else if (fvp->v_type != VDIR && tvp->v_type == VDIR)
error = EISDIR;
else
error = 0;
if (error)
goto out;
KASSERT((fvp->v_type == VDIR) == (tvp->v_type == VDIR));
}
/*
* Authorize the rename.
*/
error = ops->gro_rename_check_possible(mp, fdvp, fvp, tdvp, tvp);
if (error)
goto out;
error = ops->gro_rename_check_permitted(mp, cred, fdvp, fvp, tdvp, tvp);
error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, fvp, fdvp,
error);
error = kauth_authorize_vnode(cred, KAUTH_VNODE_RENAME, tvp, tdvp,
error);
if (error)
goto out;
/*
* Everything is hunky-dory. Shuffle the directory entries.
*/
error = ops->gro_rename(mp, cred,
fdvp, fcnp, fde, fvp,
tdvp, tcnp, tde, tvp,
&tvp_new_nlink);
if (error)
goto out;
/* Success! */
genfs_rename_knote(fdvp, fvp, tdvp, tvp, tvp_new_nlink);
out:
genfs_rename_exit(ops, mp, fdvp, fvp, tdvp, tvp);
return error;
}
/*
* genfs_rename_knote: Note events about the various vnodes in a
* rename. To be called by gro_rename on success. The only pair of
* vnodes that may be identical is {fdvp, tdvp}. tvp_new_nlink is
* the resulting link count of tvp.
*/
void
genfs_rename_knote(struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp, nlink_t tvp_new_nlink)
{
long fdvp_events, tdvp_events;
bool directory_p, reparent_p, replaced_p;
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
directory_p = (fvp->v_type == VDIR);
reparent_p = (fdvp != tdvp);
replaced_p = (tvp != NULL);
KASSERT((tvp == NULL) || (directory_p == (tvp->v_type == VDIR)));
fdvp_events = NOTE_WRITE;
if (directory_p && reparent_p)
fdvp_events |= NOTE_LINK;
VN_KNOTE(fdvp, fdvp_events);
VN_KNOTE(fvp, NOTE_RENAME);
if (reparent_p) {
tdvp_events = NOTE_WRITE;
if (!replaced_p) {
tdvp_events |= NOTE_EXTEND;
if (directory_p)
tdvp_events |= NOTE_LINK;
}
VN_KNOTE(tdvp, tdvp_events);
}
if (replaced_p)
VN_KNOTE(tvp, (tvp_new_nlink == 0 ? NOTE_DELETE : NOTE_LINK));
}
/*
* genfs_rename_cache_purge: Purge the name cache. To be called by
* gro_rename on success. The only pair of vnodes that may be
* identical is {fdvp, tdvp}.
*/
void
genfs_rename_cache_purge(struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp)
{
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
/*
* XXX What actually needs to be purged?
*/
cache_purge(fdvp);
if (fvp->v_type == VDIR)
cache_purge(fvp);
if (tdvp != fdvp)
cache_purge(tdvp);
if ((tvp != NULL) && (tvp->v_type == VDIR))
cache_purge(tvp);
}
/*
* genfs_rename_enter: Look up fcnp in fdvp, and store the lookup
* results in *fde_ret and the associated vnode in *fvp_ret; fail if
* not found. Look up tcnp in tdvp, and store the lookup results in
* *tde_ret and the associated vnode in *tvp_ret; store null instead if
* not found. Fail if anything has been mounted on any of the nodes
* involved.
*
* fdvp and tdvp must be referenced.
*
* On entry, nothing is locked.
*
* On success, everything is locked, and *fvp_ret, and *tvp_ret if
* nonnull, are referenced. The only pairs of vnodes that may be
* identical are {fdvp, tdvp} and {fvp, tvp}.
*
* On failure, everything remains as was.
*
* Locking everything including the source and target nodes is
* necessary to make sure that, e.g., link count updates are OK. The
* locking order is, in general, ancestor-first, matching the order you
* need to use to look up a descendant anyway.
*/
static int
genfs_rename_enter(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct componentname *fcnp,
void *fde_ret, struct vnode **fvp_ret,
struct vnode *tdvp, struct componentname *tcnp,
void *tde_ret, struct vnode **tvp_ret)
{
int error;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fvp_ret != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
KASSERT(tvp_ret != NULL);
KASSERT(fvp_ret != tvp_ret);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
if (fdvp == tdvp)
error = genfs_rename_enter_common(ops, mp, cred, fdvp,
fcnp, fde_ret, fvp_ret,
tcnp, tde_ret, tvp_ret);
else
error = genfs_rename_enter_separate(ops, mp, cred,
fdvp, fcnp, fde_ret, fvp_ret,
tdvp, tcnp, tde_ret, tvp_ret);
if (error)
return error;
KASSERT(*fvp_ret != NULL);
KASSERT(VOP_ISLOCKED(*fvp_ret) == LK_EXCLUSIVE);
KASSERT((*tvp_ret == NULL) || (VOP_ISLOCKED(*tvp_ret) == LK_EXCLUSIVE));
KASSERT(*fvp_ret != fdvp);
KASSERT(*fvp_ret != tdvp);
KASSERT(*tvp_ret != fdvp);
KASSERT(*tvp_ret != tdvp);
return 0;
}
/*
* genfs_rename_enter_common: Lock and look up with a common
* source/target directory.
*/
static int
genfs_rename_enter_common(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred, struct vnode *dvp,
struct componentname *fcnp,
void *fde_ret, struct vnode **fvp_ret,
struct componentname *tcnp,
void *tde_ret, struct vnode **tvp_ret)
{
struct vnode *fvp, *tvp;
int error;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fvp_ret != NULL);
KASSERT(tcnp != NULL);
KASSERT(tvp_ret != NULL);
KASSERT(dvp->v_type == VDIR);
KASSERT(dvp->v_mount == mp);
error = ops->gro_lock_directory(mp, dvp);
if (error)
goto fail0;
/* Did we lose a race with mount? */
if (dvp->v_mountedhere != NULL) {
error = EBUSY;
goto fail1;
}
KASSERT(fcnp->cn_nameiop == DELETE);
error = ops->gro_lookup(mp, dvp, fcnp, fde_ret, &fvp);
if (error)
goto fail1;
KASSERT(fvp != NULL);
/* Refuse to rename `.'. */
if (fvp == dvp) {
error = EINVAL;
goto fail2;
}
KASSERT(fvp != dvp);
KASSERT(tcnp->cn_nameiop == RENAME);
error = ops->gro_lookup(mp, dvp, tcnp, tde_ret, &tvp);
if (error == ENOENT) {
tvp = NULL;
} else if (error) {
goto fail2;
} else {
KASSERT(tvp != NULL);
/* Refuse to rename over `.'. */
if (tvp == dvp) {
error = EISDIR; /* XXX EINVAL? */
goto fail2;
}
}
KASSERT(tvp != dvp);
/*
* We've looked up both nodes. Now lock them and check them.
*/
vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(fvp->v_mount == mp);
/* Refuse to rename a mount point. */
if ((fvp->v_type == VDIR) && (fvp->v_mountedhere != NULL)) {
error = EBUSY;
goto fail3;
}
if ((tvp != NULL) && (tvp != fvp)) {
vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(tvp->v_mount == mp);
/* Refuse to rename over a mount point. */
if ((tvp->v_type == VDIR) && (tvp->v_mountedhere != NULL)) {
error = EBUSY;
goto fail4;
}
}
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
*fvp_ret = fvp;
*tvp_ret = tvp;
return 0;
fail4: if ((tvp != NULL) && (tvp != fvp))
VOP_UNLOCK(tvp);
fail3: VOP_UNLOCK(fvp);
if (tvp != NULL)
vrele(tvp);
fail2: vrele(fvp);
fail1: VOP_UNLOCK(dvp);
fail0: return error;
}
/*
* genfs_rename_enter_separate: Lock and look up with separate source
* and target directories.
*/
static int
genfs_rename_enter_separate(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct componentname *fcnp,
void *fde_ret, struct vnode **fvp_ret,
struct vnode *tdvp, struct componentname *tcnp,
void *tde_ret, struct vnode **tvp_ret)
{
struct vnode *intermediate_node;
struct vnode *fvp, *tvp;
int error;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fvp_ret != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
KASSERT(tvp_ret != NULL);
KASSERT(fdvp != tdvp);
KASSERT(fcnp != tcnp);
KASSERT(fcnp->cn_nameiop == DELETE);
KASSERT(tcnp->cn_nameiop == RENAME);
KASSERT(fvp_ret != tvp_ret);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
error = ops->gro_genealogy(mp, cred, fdvp, tdvp, &intermediate_node);
if (error)
return error;
/*
* intermediate_node == NULL means fdvp is not an ancestor of tdvp.
*/
if (intermediate_node == NULL)
error = genfs_rename_lock(ops, mp, cred,
ENOTEMPTY, EISDIR, EINVAL,
tdvp, tcnp, true, tde_ret, &tvp,
fdvp, fcnp, false, fde_ret, &fvp);
else
error = genfs_rename_lock(ops, mp, cred,
EINVAL, EISDIR, EINVAL,
fdvp, fcnp, false, fde_ret, &fvp,
tdvp, tcnp, true, tde_ret, &tvp);
if (error)
goto out;
KASSERT(fvp != NULL);
/*
* Reject rename("foo/bar", "foo/bar/baz/quux/zot").
*/
if (fvp == intermediate_node) {
genfs_rename_exit(ops, mp, fdvp, fvp, tdvp, tvp);
error = EINVAL;
goto out;
}
*fvp_ret = fvp;
*tvp_ret = tvp;
error = 0;
out: if (intermediate_node != NULL)
vrele(intermediate_node);
return error;
}
/*
* genfs_rename_lock: Lookup and lock it all. The lock order is:
*
* a_dvp -> a_vp -> b_dvp -> b_vp,
*
* except if a_vp is a nondirectory in which case the lock order is:
*
* a_dvp -> b_dvp -> b_vp -> a_vp,
*
* which can't violate ancestor->descendant because a_vp has no
* descendants in this case. This edge case is necessary because some
* file systems can only lookup/lock/unlock, and we can't hold a_vp
* locked when we lookup/lock/unlock b_vp if they turn out to be the
* same, and we can't find out that they're the same until after the
* lookup.
*
* b_dvp must not be an ancestor of a_dvp, although a_dvp may be an
* ancestor of b_dvp.
*
* Fail with overlap_error if node a is directory b. Neither
* componentname may be `.' or `..'.
*
* a_dvp and b_dvp must be referenced.
*
* On entry, a_dvp and b_dvp are unlocked.
*
* On success,
* . a_dvp and b_dvp are locked,
* . *a_dirent_ret is filled with a directory entry whose node is
* locked and referenced,
* . *a_vp_ret is filled with the corresponding vnode,
* . *b_dirent_ret is filled either with null or with a directory entry
* whose node is locked and referenced,
* . *b_vp_ret is filled either with null or with the corresponding vnode,
* and
* . the only pair of vnodes that may be identical is a_vp and b_vp.
*
* On failure, a_dvp and b_dvp are left unlocked, and *a_dirent_ret,
* *a_vp, *b_dirent_ret, and *b_vp are left alone.
*/
static int
genfs_rename_lock(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred,
int overlap_error, int a_dot_error, int b_dot_error,
struct vnode *a_dvp, struct componentname *a_cnp, bool a_missing_ok,
void *a_de_ret, struct vnode **a_vp_ret,
struct vnode *b_dvp, struct componentname *b_cnp, bool b_missing_ok,
void *b_de_ret, struct vnode **b_vp_ret)
{
struct vnode *a_vp, *b_vp;
int error;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(a_dvp != NULL);
KASSERT(a_cnp != NULL);
KASSERT(a_vp_ret != NULL);
KASSERT(b_dvp != NULL);
KASSERT(b_cnp != NULL);
KASSERT(b_vp_ret != NULL);
KASSERT(a_dvp != b_dvp);
KASSERT(a_vp_ret != b_vp_ret);
KASSERT(a_dvp->v_type == VDIR);
KASSERT(b_dvp->v_type == VDIR);
KASSERT(a_dvp->v_mount == mp);
KASSERT(b_dvp->v_mount == mp);
KASSERT(a_missing_ok != b_missing_ok);
/*
* 1. Lock a_dvp.
*/
error = ops->gro_lock_directory(mp, a_dvp);
if (error)
goto fail0;
/* Did we lose a race with mount? */
if (a_dvp->v_mountedhere != NULL) {
error = EBUSY;
goto fail1;
}
/*
* 2. Lookup a_vp. May lock/unlock a_vp.
*/
error = ops->gro_lookup(mp, a_dvp, a_cnp, a_de_ret, &a_vp);
if (error) {
if (a_missing_ok && (error == ENOENT))
a_vp = NULL;
else
goto fail1;
} else {
KASSERT(a_vp != NULL);
/* Refuse to rename (over) `.'. */
if (a_vp == a_dvp) {
error = a_dot_error;
goto fail2;
}
/* Reject rename("x", "x/y") or rename("x/y", "x"). */
if (a_vp == b_dvp) {
error = overlap_error;
goto fail2;
}
}
KASSERT(a_vp != a_dvp);
KASSERT(a_vp != b_dvp);
/*
* 3. Lock a_vp, if it is a directory.
*
* We already ruled out a_vp == a_dvp (i.e., a_cnp is `.'), so
* this is not locking against self, and we already ruled out
* a_vp == b_dvp, so this won't cause subsequent locking of
* b_dvp to lock against self.
*
* If a_vp is a nondirectory, we can't hold it when we lookup
* b_vp in case (a) the file system can only lookup/lock/unlock
* and (b) b_vp turns out to be the same file as a_vp due to
* hard links -- and we can't even detect that case until after
* we've looked up b_vp. Fortunately, if a_vp is a
* nondirectory, then it is a leaf, so we can safely lock it
* last.
*/
if (a_vp != NULL && a_vp->v_type == VDIR) {
vn_lock(a_vp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(a_vp->v_mount == mp);
/* Refuse to rename (over) a mount point. */
if (a_vp->v_mountedhere != NULL) {
error = EBUSY;
goto fail3;
}
}
/*
* 4. Lock b_dvp.
*/
error = ops->gro_lock_directory(mp, b_dvp);
if (error)
goto fail3;
/* Did we lose a race with mount? */
if (b_dvp->v_mountedhere != NULL) {
error = EBUSY;
goto fail4;
}
/*
* 5. Lookup b_vp. May lock/unlock b_vp.
*/
error = ops->gro_lookup(mp, b_dvp, b_cnp, b_de_ret, &b_vp);
if (error) {
if (b_missing_ok && (error == ENOENT))
b_vp = NULL;
else
goto fail4;
} else {
KASSERT(b_vp != NULL);
/* Refuse to rename (over) `.'. */
if (b_vp == b_dvp) {
error = b_dot_error;
goto fail5;
}
/*
* b_dvp must not be an ancestor of a_dvp, so if we
* find b_dvp/b_vp=a_dvp/a_vp something is wrong.
*/
if (b_vp == a_dvp) {
/*
* We have a directory hard link before us.
* XXX What error should this return? EDEADLK?
* Panic?
*/
error = EIO;
goto fail5;
}
}
KASSERT(b_vp != b_dvp);
KASSERT(b_vp != a_dvp);
/*
* 6. Lock a_vp, if it is a nondirectory.
*
* In this case a_vp is a leaf, so it is either equal to or
* incommensurate with b_vp, and so we can safely lock it at
* any point now.
*/
if (a_vp != NULL && a_vp->v_type != VDIR) {
vn_lock(a_vp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(a_vp->v_mount == mp);
/* (not a directory so can't have anything mounted here) */
}
/*
* 7. Lock b_vp, if it is not a_vp.
*
* b_vp and a_vp may be the same inode if they are hard links to
* one another.
*/
if ((b_vp != NULL) && (b_vp != a_vp)) {
vn_lock(b_vp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(b_vp->v_mount == mp);
/* Refuse to rename (over) a mount point. */
if ((b_vp->v_type == VDIR) && (b_vp->v_mountedhere != NULL)) {
error = EBUSY;
goto fail6;
}
}
KASSERT(VOP_ISLOCKED(a_dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(b_dvp) == LK_EXCLUSIVE);
KASSERT(a_missing_ok || (a_vp != NULL));
KASSERT(b_missing_ok || (b_vp != NULL));
KASSERT((a_vp == NULL) || (VOP_ISLOCKED(a_vp) == LK_EXCLUSIVE));
KASSERT((b_vp == NULL) || (VOP_ISLOCKED(b_vp) == LK_EXCLUSIVE));
*a_vp_ret = a_vp;
*b_vp_ret = b_vp;
return 0;
fail6: if ((b_vp != NULL) && (b_vp != a_vp))
VOP_UNLOCK(b_vp);
if (a_vp != NULL && a_vp->v_type != VDIR)
VOP_UNLOCK(a_vp);
fail5: if (b_vp != NULL)
vrele(b_vp);
fail4: VOP_UNLOCK(b_dvp);
fail3: if (a_vp != NULL && a_vp->v_type == VDIR)
VOP_UNLOCK(a_vp);
fail2: if (a_vp != NULL)
vrele(a_vp);
fail1: VOP_UNLOCK(a_dvp);
fail0: return error;
}
/*
* genfs_rename_exit: Unlock everything we locked for rename.
*
* fdvp and tdvp must be referenced.
*
* On entry, everything is locked, and fvp and tvp referenced.
*
* On exit, everything is unlocked, and fvp and tvp are released.
*/
static void
genfs_rename_exit(const struct genfs_rename_ops *ops,
struct mount *mp,
struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp)
{
(void)ops;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
if ((tvp != NULL) && (tvp != fvp))
VOP_UNLOCK(tvp);
VOP_UNLOCK(fvp);
if (tvp != NULL)
vrele(tvp);
if (tdvp != fdvp)
VOP_UNLOCK(tdvp);
vrele(fvp);
VOP_UNLOCK(fdvp);
}
/*
* genfs_rename_remove: Remove the entry for the non-directory vp with
* componentname cnp from the directory dvp, using the lookup results
* de. It is the responsibility of gro_remove to purge the name cache.
*
* Everything must be locked and referenced.
*/
static int
genfs_rename_remove(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred,
struct vnode *dvp, struct componentname *cnp, void *de, struct vnode *vp,
nlink_t *tvp_nlinkp)
{
int error;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(cnp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(dvp->v_mount == mp);
KASSERT(vp->v_mount == mp);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
error = ops->gro_remove_check_possible(mp, dvp, vp);
if (error)
return error;
error = ops->gro_remove_check_permitted(mp, cred, dvp, vp);
error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, vp, dvp,
error);
if (error)
return error;
error = ops->gro_remove(mp, cred, dvp, cnp, de, vp, tvp_nlinkp);
if (error)
return error;
return 0;
}
static int
genfs_ufslike_check_sticky(kauth_cred_t, mode_t, uid_t, struct vnode *, uid_t);
/*
* genfs_ufslike_rename_check_possible: Check whether a rename is
* possible independent of credentials, assuming UFS-like inode flag
* semantics. clobber_p is true iff the target node already exists.
*/
int
genfs_ufslike_rename_check_possible(
unsigned long fdflags, unsigned long fflags,
unsigned long tdflags, unsigned long tflags, bool clobber_p,
unsigned long immutable, unsigned long append)
{
if ((fdflags | fflags) & (immutable | append))
return EPERM;
if (tdflags & (immutable | (clobber_p? append : 0)))
return EPERM;
if (clobber_p && (tflags & (immutable | append)))
return EPERM;
return 0;
}
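/*
 * Illustrative sketch only (not part of genfs): how a UFS-like file
 * system might feed its on-disk inode flags into the check above.
 * The examplefs_* names and the example_inode fields are assumptions
 * made for this sketch; the immutable/append masks are the standard
 * chflags(2) bits from <sys/stat.h>.
 */
#if 0
static int
examplefs_rename_check_possible(const struct example_inode *fdip,
const struct example_inode *fip, const struct example_inode *tdip,
const struct example_inode *tip)
{
/* tip may be NULL when the target name does not yet exist. */
return genfs_ufslike_rename_check_possible(
fdip->i_flags, fip->i_flags,
tdip->i_flags, (tip != NULL? tip->i_flags : 0),
(tip != NULL),
SF_IMMUTABLE | UF_IMMUTABLE, SF_APPEND | UF_APPEND);
}
#endif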
/*
* genfs_ufslike_rename_check_permitted: Check whether a rename is
* permitted given our credentials, assuming UFS-like permission and
* ownership semantics.
*
* The only pair of vnodes that may be identical is {fdvp, tdvp}.
*
* Everything must be locked and referenced.
*/
int
genfs_ufslike_rename_check_permitted(kauth_cred_t cred,
struct vnode *fdvp, mode_t fdmode, uid_t fduid,
struct vnode *fvp, uid_t fuid,
struct vnode *tdvp, mode_t tdmode, uid_t tduid,
struct vnode *tvp, uid_t tuid)
{
int error;
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == fvp->v_mount);
KASSERT(fdvp->v_mount == tdvp->v_mount);
KASSERT((tvp == NULL) || (fdvp->v_mount == tvp->v_mount));
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
/*
* We need to remove or change an entry in the source directory.
*/
error = VOP_ACCESS(fdvp, VWRITE, cred);
if (error)
return error;
/*
* If we are changing directories, then we need to write to the
* target directory to add or change an entry. Also, if fvp is
* a directory, we need to write to it to change its `..'
* entry.
*/
if (fdvp != tdvp) {
error = VOP_ACCESS(tdvp, VWRITE, cred);
if (error)
return error;
if (fvp->v_type == VDIR) {
error = VOP_ACCESS(fvp, VWRITE, cred);
if (error)
return error;
}
}
error = genfs_ufslike_check_sticky(cred, fdmode, fduid, fvp, fuid);
if (error)
return error;
error = genfs_ufslike_check_sticky(cred, tdmode, tduid, tvp, tuid);
if (error)
return error;
return 0;
}
/*
* genfs_ufslike_remove_check_possible: Check whether a remove is
* possible independent of credentials, assuming UFS-like inode flag
* semantics.
*/
int
genfs_ufslike_remove_check_possible(unsigned long dflags, unsigned long flags,
unsigned long immutable, unsigned long append)
{
/*
* We want to delete the entry. If the directory is immutable,
* we can't write to it to delete the entry. If the directory
* is append-only, the only change we can make is to add
* entries, so we can't delete entries. If the node is
* immutable, we can't change the links to it, so we can't
* delete the entry. If the node is append-only...well, this
* is what UFS does.
*/
if ((dflags | flags) & (immutable | append))
return EPERM;
return 0;
}
/*
* genfs_ufslike_remove_check_permitted: Check whether a remove is
* permitted given our credentials, assuming UFS-like permission and
* ownership semantics.
*
* Everything must be locked and referenced.
*/
int
genfs_ufslike_remove_check_permitted(kauth_cred_t cred,
struct vnode *dvp, mode_t dmode, uid_t duid,
struct vnode *vp, uid_t uid)
{
int error;
KASSERT(dvp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(dvp->v_mount == vp->v_mount);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
/*
* We need to write to the directory to remove from it.
*/
error = VOP_ACCESS(dvp, VWRITE, cred);
if (error)
return error;
error = genfs_ufslike_check_sticky(cred, dmode, duid, vp, uid);
if (error)
return error;
return 0;
}
/*
* genfs_ufslike_check_sticky: Check whether a party with credentials
* cred may change an entry in a sticky directory, assuming UFS-like
* permission, ownership, and stickiness semantics: If the directory is
* sticky and the entry exists, the user must own either the directory
* or the entry's node in order to change the entry.
*
* Everything must be locked and referenced.
*/
int
genfs_ufslike_check_sticky(kauth_cred_t cred, mode_t dmode, uid_t duid,
struct vnode *vp, uid_t uid)
{
if ((dmode & S_ISTXT) && (vp != NULL))
return genfs_can_sticky(vp, cred, duid, uid);
return 0;
}
/* $NetBSD: ufs_bswap.h,v 1.23 2018/04/19 21:50:10 christos Exp $ */
/*
* Copyright (c) 1998 Manuel Bouyer.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef _UFS_UFS_BSWAP_H_
#define _UFS_UFS_BSWAP_H_
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#endif
#include <sys/bswap.h>
/* Macros to access UFS flags */
#ifdef FFS_EI
#define UFS_MPNEEDSWAP(ump) ((ump)->um_flags & UFS_NEEDSWAP)
#define UFS_FSNEEDSWAP(fs) ((fs)->fs_flags & FS_SWAPPED)
#define UFS_IPNEEDSWAP(ip) UFS_MPNEEDSWAP((ip)->i_ump)
#else
#define UFS_MPNEEDSWAP(ump) ((void)(ump), 0)
#define UFS_FSNEEDSWAP(fs) ((void)(fs), 0)
#define UFS_IPNEEDSWAP(ip) ((void)(ip), 0)
#endif
#if (!defined(_KERNEL) && !defined(NO_FFS_EI)) || defined(FFS_EI)
/* inlines for access to swapped data */
static __inline u_int16_t
ufs_rw16(uint16_t a, int ns)
{
return ((ns) ? bswap16(a) : (a));
}
static __inline u_int32_t
ufs_rw32(uint32_t a, int ns)
{
return ((ns) ? bswap32(a) : (a));
}
static __inline u_int64_t
ufs_rw64(uint64_t a, int ns)
{
return ((ns) ? bswap64(a) : (a));
}
#else
static __inline u_int16_t
ufs_rw16(uint16_t a, int ns)
{
return a;
}
static __inline u_int32_t
ufs_rw32(uint32_t a, int ns)
{
return a;
}
static __inline u_int64_t
ufs_rw64(uint64_t a, int ns)
{
return a;
}
#endif
#define ufs_add16(a, b, ns) \
(a) = ufs_rw16(ufs_rw16((a), (ns)) + (b), (ns))
#define ufs_add32(a, b, ns) \
(a) = ufs_rw32(ufs_rw32((a), (ns)) + (b), (ns))
#define ufs_add64(a, b, ns) \
(a) = ufs_rw64(ufs_rw64((a), (ns)) + (b), (ns))
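/*
 * Illustrative sketch only: typical use of the accessors above when
 * touching on-disk fields. The variables here are hypothetical; the
 * point is that reads go through ufs_rw*() and read-modify-write
 * updates go through ufs_add*(), with the "needswap" flag taken from
 * UFS_FSNEEDSWAP()/UFS_MPNEEDSWAP() for the superblock or mount.
 */
#if 0
static void
example_bswap_usage(struct fs *fs, uint32_t *ondisk_counter)
{
const int ns = UFS_FSNEEDSWAP(fs); /* nonzero if fs is byte-swapped */
uint32_t host_value;

/* Convert an on-disk 32-bit value to host byte order. */
host_value = ufs_rw32(*ondisk_counter, ns);

/* Increment the counter while keeping it in on-disk byte order. */
ufs_add32(*ondisk_counter, 1, ns);
(void)host_value;
}
#endif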
#endif /* !_UFS_UFS_BSWAP_H_ */
/* $NetBSD: uvm_anon.c,v 1.80 2020/10/25 00:05:26 chs Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_anon.c: uvm anon ops
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_anon.c,v 1.80 2020/10/25 00:05:26 chs Exp $");
#include "opt_uvmhist.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_swap.h>
#include <uvm/uvm_pdpolicy.h>
static struct pool_cache uvm_anon_cache;
static int uvm_anon_ctor(void *, void *, int);
void
uvm_anon_init(void)
{
pool_cache_bootstrap(&uvm_anon_cache, sizeof(struct vm_anon), 0, 0,
PR_LARGECACHE, "anonpl", NULL, IPL_NONE, uvm_anon_ctor,
NULL, NULL);
}
static int
uvm_anon_ctor(void *arg, void *object, int flags)
{
struct vm_anon *anon = object;
anon->an_ref = 0;
anon->an_lock = NULL;
anon->an_page = NULL;
#if defined(VMSWAP)
anon->an_swslot = 0;
#endif
return 0;
}
/*
* uvm_analloc: allocate a new anon.
*
* => anon will have no lock associated.
*/
struct vm_anon *
uvm_analloc(void)
{
struct vm_anon *anon;
anon = pool_cache_get(&uvm_anon_cache, PR_NOWAIT);
if (anon) {
KASSERT(anon->an_ref == 0);
KASSERT(anon->an_lock == NULL);
KASSERT(anon->an_page == NULL);
#if defined(VMSWAP)
KASSERT(anon->an_swslot == 0);
#endif
anon->an_ref = 1;
}
return anon;
}
/*
* uvm_anfree: free a single anon structure
*
* => anon must be removed from the amap (if anon was in an amap).
* => amap must be locked, if anon was owned by amap.
* => we may drop and re-acquire the lock here (to break loans).
*/
void
uvm_anfree(struct vm_anon *anon)
{
struct vm_page *pg = anon->an_page, *pg2 __diagused;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(anon=%#jx)", (uintptr_t)anon, 0,0,0);
KASSERT(anon->an_lock == NULL || rw_write_held(anon->an_lock));
KASSERT(anon->an_ref == 0);
/*
* Dispose of the page, if it is resident.
*/
if (__predict_true(pg != NULL)) {
KASSERT(anon->an_lock != NULL);
/*
* If there is a resident page and it is loaned, then anon
* may not own it. Call out to uvm_anon_lockloanpg() to
* identify and lock the real owner of the page.
*/
if (__predict_false(pg->loan_count != 0)) {
pg2 = uvm_anon_lockloanpg(anon);
KASSERT(pg2 == pg);
}
/*
* If the page is owned by a UVM object (now locked),
* then kill the loan on the page rather than free it,
* and release the object lock.
*/
if (__predict_false(pg->uobject != NULL)) {
mutex_enter(&pg->interlock);
KASSERT(pg->loan_count > 0);
pg->loan_count--;
pg->uanon = NULL;
mutex_exit(&pg->interlock);
rw_exit(pg->uobject->vmobjlock);
} else {
/*
* If page has no UVM object, then anon is the owner,
* and it is already locked.
*/
KASSERT((pg->flags & PG_RELEASED) == 0);
pmap_page_protect(pg, VM_PROT_NONE);
/*
* If the page is busy, mark it as PG_RELEASED, so
* that uvm_anon_release(9) would release it later.
*/
if (__predict_false((pg->flags & PG_BUSY) != 0)) {
pg->flags |= PG_RELEASED;
rw_obj_hold(anon->an_lock);
return;
}
uvm_pagefree(pg);
UVMHIST_LOG(maphist, "anon %#jx, page %#jx: "
"freed now!", (uintptr_t)anon, (uintptr_t)pg,
0, 0);
}
} else {
#if defined(VMSWAP)
if (anon->an_swslot > 0) {
/* This page is no longer only in swap. */
KASSERT(uvmexp.swpgonly > 0);
atomic_dec_uint(&uvmexp.swpgonly);
}
#endif
}
anon->an_lock = NULL;
/*
* Free any swap resources, leave a page replacement hint.
*/
uvm_anon_dropswap(anon);
uvmpdpol_anfree(anon);
UVMHIST_LOG(maphist,"<- done!",0,0,0,0);
pool_cache_put(&uvm_anon_cache, anon);
}
/*
* uvm_anon_lockloanpg: given a locked anon, lock its resident page owner.
*
* => anon is locked by caller
* => on return: anon is locked
* if there is a resident page:
* if it has a uobject, it is locked by us
* if it is ownerless, we take over as owner
* we return the resident page (it can change during
* this function)
* => note that the only time an anon has an ownerless resident page
* is if the page was loaned from a uvm_object and the uvm_object
* disowned it
* => this only needs to be called when you want to do an operation
* on an anon's resident page and that page has a non-zero loan
* count.
*/
struct vm_page *
uvm_anon_lockloanpg(struct vm_anon *anon)
{
struct vm_page *pg;
krw_t op;
KASSERT(rw_lock_held(anon->an_lock));
/*
* loop while we have a resident page that has a non-zero loan count.
* if we successfully get our lock, we will "break" the loop.
* note that the test for pg->loan_count is not protected -- this
* may produce false positive results. note that a false positive
* result may cause us to do more work than we need to, but it will
* not produce an incorrect result.
*/
while (((pg = anon->an_page) != NULL) && pg->loan_count != 0) {
mutex_enter(&pg->interlock);
if (pg->uobject) {
/*
* if we didn't get a lock (try lock failed), then we
* toggle our anon lock and try again
*/
if (!rw_tryenter(pg->uobject->vmobjlock, RW_WRITER)) {
/*
* someone locking the object has a chance to
* lock us right now
*
* XXX Better than yielding but inadequate.
*/
mutex_exit(&pg->interlock);
op = rw_lock_op(anon->an_lock);
rw_exit(anon->an_lock);
kpause("lkloanpg", false, 1, NULL);
rw_enter(anon->an_lock, op);
continue;
}
}
/*
* If page is un-owned i.e. the object dropped its ownership,
* then we have to take the ownership.
*/
if (pg->uobject == NULL && (pg->flags & PG_ANON) == 0) {
pg->flags |= PG_ANON;
pg->loan_count--;
}
mutex_exit(&pg->interlock);
break;
}
return pg;
}
#if defined(VMSWAP)
/*
* uvm_anon_pagein: fetch an anon's page.
*
* => anon must be locked, and is unlocked upon return.
* => returns true if pagein was aborted due to lack of memory.
*/
bool
uvm_anon_pagein(struct vm_amap *amap, struct vm_anon *anon)
{
struct vm_page *pg;
struct uvm_object *uobj;
KASSERT(rw_write_held(anon->an_lock));
KASSERT(anon->an_lock == amap->am_lock);
/*
* Get the page of the anon.
*/
switch (uvmfault_anonget(NULL, amap, anon)) {
case 0:
/* Success - we have the page. */
KASSERT(rw_write_held(anon->an_lock));
break;
case EIO:
case ERESTART:
/*
* Nothing more to do on errors. ERESTART means that the
* anon was freed.
*/
return false;
case ENOLCK:
panic("uvm_anon_pagein");
default:
return true;
}
/*
* Mark the page as dirty and clear its swslot.
*/
pg = anon->an_page;
uobj = pg->uobject;
if (anon->an_swslot > 0) {
uvm_swap_free(anon->an_swslot, 1);
}
anon->an_swslot = 0;
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
/*
* Deactivate the page (to put it on a page queue).
*/
uvm_pagelock(pg);
uvm_pagedeactivate(pg);
uvm_pageunlock(pg);
rw_exit(anon->an_lock);
if (uobj) {
rw_exit(uobj->vmobjlock);
}
return false;
}
/*
* uvm_anon_dropswap: release any swap resources from this anon.
*
* => anon must be locked or have a reference count of 0.
*/
void
uvm_anon_dropswap(struct vm_anon *anon)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (anon->an_swslot == 0)
return;
UVMHIST_LOG(maphist,"freeing swap for anon %#jx, paged to swslot %#jx",
(uintptr_t)anon, anon->an_swslot, 0, 0);
uvm_swap_free(anon->an_swslot, 1);
anon->an_swslot = 0;
}
#endif
/*
* uvm_anon_release: release an anon and its page.
*
* => anon should not have any references.
* => anon must be locked.
*/
void
uvm_anon_release(struct vm_anon *anon)
{
struct vm_page *pg = anon->an_page;
krwlock_t *lock;
KASSERT(rw_write_held(anon->an_lock));
KASSERT(pg != NULL);
KASSERT((pg->flags & PG_RELEASED) != 0);
KASSERT((pg->flags & PG_BUSY) != 0);
KASSERT(pg->uobject == NULL);
KASSERT(pg->uanon == anon);
KASSERT(pg->loan_count == 0);
KASSERT(anon->an_ref == 0);
if ((pg->flags & PG_PAGEOUT) != 0) {
pg->flags &= ~PG_PAGEOUT;
uvm_pageout_done(1);
}
uvm_pagefree(pg);
KASSERT(anon->an_page == NULL);
lock = anon->an_lock;
uvm_anfree(anon);
rw_exit(lock);
/* Note: extra reference is held for PG_RELEASED case. */
rw_obj_free(lock);
}
/* $NetBSD: ffs_alloc.c,v 1.172 2023/01/07 19:41:30 chs Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Marshall
* Kirk McKusick and Network Associates Laboratories, the Security
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
* research program
*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.172 2023/01/07 19:41:30 chs Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#include "opt_uvm_page_trkown.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/cprng.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/cprng.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#ifdef UVM_PAGE_TRKOWN
#include <uvm/uvm_object.h>
#include <uvm/uvm_page.h>
#endif
static daddr_t ffs_alloccg(struct inode *, u_int, daddr_t, int, int, int);
static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int, int);
static ino_t ffs_dirpref(struct inode *);
static daddr_t ffs_fragextend(struct inode *, u_int, daddr_t, int, int);
static void ffs_fserr(struct fs *, kauth_cred_t, const char *);
static daddr_t ffs_hashalloc(struct inode *, u_int, daddr_t, int, int, int,
daddr_t (*)(struct inode *, u_int, daddr_t, int, int, int));
static daddr_t ffs_nodealloccg(struct inode *, u_int, daddr_t, int, int, int);
static int32_t ffs_mapsearch(struct fs *, struct cg *,
daddr_t, int);
static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *,
daddr_t, long, bool);
static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t,
int, bool);
/* if 1, changes in optimization strategy are logged */
int ffs_log_changeopt = 0;
/* in ffs_tables.c */
extern const int inside[], around[];
extern const u_char * const fragtbl[];
/* Basic consistency check for block allocations */
static int
ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno,
long size, dev_t dev, ino_t inum)
{
if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0 ||
ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) > fs->fs_frag) {
panic("%s: bad size: dev = 0x%llx, bno = %" PRId64
" bsize = %d, size = %ld, fs = %s", func,
(long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
}
if (bno >= fs->fs_size) {
printf("%s: bad block %" PRId64 ", ino %llu\n", func, bno,
(unsigned long long)inum);
ffs_fserr(fs, NOCRED, "bad block");
return EINVAL;
}
return 0;
}
/*
* Allocate a block in the file system.
*
* The size of the requested block is given, which must be some
* multiple of fs_fsize and <= fs_bsize.
* A preference may be optionally specified. If a preference is given
* the following hierarchy is used to allocate a block:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate a block in the same cylinder group.
* 4) quadratically rehash into other cylinder groups, until an
* available block is located.
* If no block preference is given the following hierarchy is used
* to allocate a block:
* 1) allocate a block in the cylinder group that contains the
* inode for the file.
* 2) quadratically rehash into other cylinder groups, until an
* available block is located.
*
* => called with um_lock held
* => releases um_lock before returning
*/
int
ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size,
int flags, kauth_cred_t cred, daddr_t *bnp)
{
struct ufsmount *ump;
struct fs *fs;
daddr_t bno;
u_int cg;
#if defined(QUOTA) || defined(QUOTA2)
int error;
#endif
fs = ip->i_fs;
ump = ip->i_ump;
KASSERT(mutex_owned(&ump->um_lock));
#ifdef UVM_PAGE_TRKOWN
/*
* Sanity-check that allocations within the file size
* do not allow other threads to read the stale contents
* of newly allocated blocks.
* Usually pages will exist to cover the new allocation.
* There is an optimization in ffs_write() where we skip
* creating pages if several conditions are met:
* - the file must not be mapped (in any user address space).
* - the write must cover whole pages and whole blocks.
* If those conditions are not met then pages must exist and
* be locked by the current thread.
*/
struct vnode *vp = ITOV(ip);
if (vp->v_type == VREG && (flags & IO_EXT) == 0 &&
ffs_lblktosize(fs, (voff_t)lbn) < round_page(vp->v_size) &&
((vp->v_vflag & VV_MAPPED) != 0 || (size & PAGE_MASK) != 0 ||
ffs_blkoff(fs, size) != 0)) {
struct vm_page *pg __diagused;
struct uvm_object *uobj = &vp->v_uobj;
voff_t off = trunc_page(ffs_lblktosize(fs, lbn));
voff_t endoff = round_page(ffs_lblktosize(fs, lbn) + size);
rw_enter(uobj->vmobjlock, RW_WRITER);
while (off < endoff) {
pg = uvm_pagelookup(uobj, off);
KASSERT((pg != NULL && pg->owner_tag != NULL &&
pg->owner == curproc->p_pid &&
pg->lowner == curlwp->l_lid));
off += PAGE_SIZE;
}
rw_exit(uobj->vmobjlock);
}
#endif
*bnp = 0;
KASSERTMSG((cred != NOCRED), "missing credential");
KASSERTMSG(((u_int)size <= fs->fs_bsize),
"bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s",
(unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
KASSERTMSG((ffs_fragoff(fs, size) == 0),
"bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s",
(unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
goto nospace;
if (freespace(fs, fs->fs_minfree) <= 0 &&
kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
NULL, NULL) != 0)
goto nospace;
#if defined(QUOTA) || defined(QUOTA2)
mutex_exit(&ump->um_lock);
if ((error = chkdq(ip, btodb(size), cred, 0)) != 0)
return (error);
mutex_enter(&ump->um_lock);
#endif
if (bpref >= fs->fs_size)
bpref = 0;
if (bpref == 0)
cg = ino_to_cg(fs, ip->i_number);
else
cg = dtog(fs, bpref);
bno = ffs_hashalloc(ip, cg, bpref, size, 0, flags, ffs_alloccg);
if (bno > 0) {
DIP_ADD(ip, blocks, btodb(size));
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
*bnp = bno;
return (0);
}
#if defined(QUOTA) || defined(QUOTA2)
/*
* Restore user's disk quota because allocation failed.
*/
(void) chkdq(ip, -btodb(size), cred, FORCE);
#endif
if (flags & B_CONTIG) {
/*
* XXX ump->um_lock handling is "suspect" at best.
* For the case where ffs_hashalloc() fails early
* in the B_CONTIG case we reach here with um_lock
* already unlocked, so we can't release it again
* like in the normal error path. See kern/39206.
*
*
* Fail silently - it's up to our caller to report
* errors.
*/
return (ENOSPC);
}
nospace:
mutex_exit(&ump->um_lock);
ffs_fserr(fs, cred, "file system full");
uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
return (ENOSPC);
}
/*
* Reallocate a fragment to a bigger size
*
* The number and size of the old block is given, and a preference
* and new size is also specified. The allocator attempts to extend
* the original block. Failing that, the regular block allocator is
* invoked to get an appropriate block.
*
* => called with um_lock held
* => return with um_lock released
*/
int
ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bprev, daddr_t bpref,
int osize, int nsize, int flags, kauth_cred_t cred, struct buf **bpp,
daddr_t *blknop)
{
struct ufsmount *ump;
struct fs *fs;
struct buf *bp;
u_int cg, request;
int error;
daddr_t bno;
fs = ip->i_fs;
ump = ip->i_ump;
KASSERT(mutex_owned(&ump->um_lock));
#ifdef UVM_PAGE_TRKOWN
/*
* Sanity-check that allocations within the file size
* do not allow other threads to read the stale contents
* of newly allocated blocks.
* Unlike in ffs_alloc(), here pages must always exist
* for such allocations, because only the last block of a file
* can be a fragment and ffs_write() will reallocate the
* fragment to the new size using ufs_balloc_range(),
* which always creates pages to cover blocks it allocates.
*/
if (ITOV(ip)->v_type == VREG) {
struct vm_page *pg __diagused;
struct uvm_object *uobj = &ITOV(ip)->v_uobj;
voff_t off = trunc_page(ffs_lblktosize(fs, lbprev));
voff_t endoff = round_page(ffs_lblktosize(fs, lbprev) + osize);
rw_enter(uobj->vmobjlock, RW_WRITER);
while (off < endoff) {
pg = uvm_pagelookup(uobj, off);
KASSERT(pg->owner == curproc->p_pid &&
pg->lowner == curlwp->l_lid);
off += PAGE_SIZE;
}
rw_exit(uobj->vmobjlock);
}
#endif
KASSERTMSG((cred != NOCRED), "missing credential");
KASSERTMSG(((u_int)osize <= fs->fs_bsize),
"bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
fs->fs_fsmnt);
KASSERTMSG((ffs_fragoff(fs, osize) == 0),
"bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
fs->fs_fsmnt);
KASSERTMSG(((u_int)nsize <= fs->fs_bsize),
"bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
fs->fs_fsmnt);
KASSERTMSG((ffs_fragoff(fs, nsize) == 0),
"bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
fs->fs_fsmnt);
if (freespace(fs, fs->fs_minfree) <= 0 &&
kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
NULL, NULL) != 0) {
mutex_exit(&ump->um_lock);
goto nospace;
}
if (bprev == 0) {
panic("%s: bad bprev: dev = 0x%llx, bsize = %d, bprev = %"
PRId64 ", fs = %s", __func__,
(unsigned long long)ip->i_dev, fs->fs_bsize, bprev,
fs->fs_fsmnt);
}
mutex_exit(&ump->um_lock);
/*
* Allocate the extra space in the buffer.
*/
if (bpp != NULL &&
(error = bread(ITOV(ip), lbprev, osize, 0, &bp)) != 0) {
return (error);
}
#if defined(QUOTA) || defined(QUOTA2)
if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) {
if (bpp != NULL) {
brelse(bp, 0);
}
return (error);
}
#endif
/*
* Check for extension in the existing location.
*/
cg = dtog(fs, bprev);
mutex_enter(&ump->um_lock);
if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) {
DIP_ADD(ip, blocks, btodb(nsize - osize));
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (bpp != NULL) {
if (bp->b_blkno != FFS_FSBTODB(fs, bno)) {
panic("%s: bad blockno %#llx != %#llx",
__func__, (unsigned long long) bp->b_blkno,
(unsigned long long)FFS_FSBTODB(fs, bno));
}
allocbuf(bp, nsize, 1);
memset((char *)bp->b_data + osize, 0, nsize - osize);
mutex_enter(bp->b_objlock);
KASSERT(!cv_has_waiters(&bp->b_done));
bp->b_oflags |= BO_DONE;
mutex_exit(bp->b_objlock);
*bpp = bp;
}
if (blknop != NULL) {
*blknop = bno;
}
return (0);
}
/*
* Allocate a new disk location.
*/
if (bpref >= fs->fs_size)
bpref = 0;
switch ((int)fs->fs_optim) {
case FS_OPTSPACE:
/*
* Allocate an exact sized fragment. Although this makes
* best use of space, we will waste time relocating it if
* the file continues to grow. If the fragmentation is
* less than half of the minimum free reserve, we choose
* to begin optimizing for time.
*/
request = nsize;
if (fs->fs_minfree < 5 ||
fs->fs_cstotal.cs_nffree >
fs->fs_dsize * fs->fs_minfree / (2 * 100))
break;
if (ffs_log_changeopt) {
log(LOG_NOTICE,
"%s: optimization changed from SPACE to TIME\n",
fs->fs_fsmnt);
}
fs->fs_optim = FS_OPTTIME;
break;
case FS_OPTTIME:
/*
* At this point we have discovered a file that is trying to
* grow a small fragment to a larger fragment. To save time,
* we allocate a full sized block, then free the unused portion.
* If the file continues to grow, the `ffs_fragextend' call
* above will be able to grow it in place without further
* copying. If aberrant programs cause disk fragmentation to
* grow within 2% of the free reserve, we choose to begin
* optimizing for space.
*/
request = fs->fs_bsize;
if (fs->fs_cstotal.cs_nffree <
fs->fs_dsize * (fs->fs_minfree - 2) / 100)
break;
if (ffs_log_changeopt) {
log(LOG_NOTICE,
"%s: optimization changed from TIME to SPACE\n",
fs->fs_fsmnt);
}
fs->fs_optim = FS_OPTSPACE;
break;
default:
panic("%s: bad optim: dev = 0x%llx, optim = %d, fs = %s",
__func__, (unsigned long long)ip->i_dev, fs->fs_optim,
fs->fs_fsmnt);
/* NOTREACHED */
}
bno = ffs_hashalloc(ip, cg, bpref, request, nsize, 0, ffs_alloccg);
if (bno > 0) {
/*
* Use forced deallocation registration; we can't handle
* failure here. This is safe, as this path is hit at most
* once per write operation, when a fragment is extended to
* a longer fragment or to a full block.
*/
if ((ip->i_ump->um_mountp->mnt_wapbl) &&
(ITOV(ip)->v_type != VREG)) {
/* this should never fail */
error = UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(
ip->i_ump->um_mountp, FFS_FSBTODB(fs, bprev),
osize);
if (error)
panic("ffs_realloccg: dealloc registration failed");
} else {
ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
ip->i_number);
}
DIP_ADD(ip, blocks, btodb(nsize - osize));
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (bpp != NULL) {
bp->b_blkno = FFS_FSBTODB(fs, bno);
allocbuf(bp, nsize, 1);
memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
mutex_enter(bp->b_objlock);
KASSERT(!cv_has_waiters(&bp->b_done));
bp->b_oflags |= BO_DONE;
mutex_exit(bp->b_objlock);
*bpp = bp;
}
if (blknop != NULL) {
	*blknop = bno;
}
return (0);
}
mutex_exit(&ump->um_lock);
#if defined(QUOTA) || defined(QUOTA2)
/*
* Restore user's disk quota because allocation failed.
*/
(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
#endif
if (bpp != NULL) {
	brelse(bp, 0);
}
nospace:
/*
* no space available
*/
ffs_fserr(fs, cred, "file system full");
uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
return (ENOSPC);
}
/*
* Allocate an inode in the file system.
*
* If allocating a directory, use ffs_dirpref to select the inode.
* If allocating in a directory, the following hierarchy is followed:
* 1) allocate the preferred inode.
* 2) allocate an inode in the same cylinder group.
* 3) quadratically rehash into other cylinder groups, until an
* available inode is located.
* If no inode preference is given the following hierarchy is used
* to allocate an inode:
* 1) allocate an inode in cylinder group 0.
* 2) quadratically rehash into other cylinder groups, until an
* available inode is located.
*
* => um_lock not held upon entry or return
*/
int
ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, ino_t *inop)
{
struct ufsmount *ump;
struct inode *pip;
struct fs *fs;
ino_t ino, ipref;
u_int cg;
int error;
UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount);
pip = VTOI(pvp);
fs = pip->i_fs;
ump = pip->i_ump;
error = UFS_WAPBL_BEGIN(pvp->v_mount);
if (error) {
return error;
}
mutex_enter(&ump->um_lock);
if (fs->fs_cstotal.cs_nifree == 0)
goto noinodes;
if ((mode & IFMT) == IFDIR)
ipref = ffs_dirpref(pip);
else
ipref = pip->i_number;
if (ipref >= fs->fs_ncg * fs->fs_ipg)
ipref = 0;
cg = ino_to_cg(fs, ipref);
/*
* Track the number of dirs created one after another
* in the same cg without intervening files.
*/
if ((mode & IFMT) == IFDIR) {
if (fs->fs_contigdirs[cg] < 255)
fs->fs_contigdirs[cg]++;
} else {
if (fs->fs_contigdirs[cg] > 0)
fs->fs_contigdirs[cg]--;
}
ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 0, ffs_nodealloccg);
if (ino == 0)
goto noinodes;
UFS_WAPBL_END(pvp->v_mount);
*inop = ino;
return 0;
noinodes:
mutex_exit(&ump->um_lock);
UFS_WAPBL_END(pvp->v_mount);
ffs_fserr(fs, cred, "out of inodes");
uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
return ENOSPC;
}
/*
* Find a cylinder group in which to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files' inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
static ino_t
ffs_dirpref(struct inode *pip)
{
register struct fs *fs;
u_int cg, prefcg;
uint64_t dirsize, cgsize, curdsz;
u_int avgifree, avgbfree, avgndir;
u_int minifree, minbfree, maxndir;
u_int mincg, minndir;
u_int maxcontigdirs;
KASSERT(mutex_owned(&pip->i_ump->um_lock));
fs = pip->i_fs;
avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
/*
* Force allocation in another cg if creating a first level dir.
*/
if (ITOV(pip)->v_vflag & VV_ROOT) {
prefcg = cprng_fast32() % fs->fs_ncg;
mincg = prefcg;
minndir = fs->fs_ipg;
for (cg = prefcg; cg < fs->fs_ncg; cg++)
	if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
	    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
	    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
for (cg = 0; cg < prefcg; cg++)
	if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
	    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
	    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
return ((ino_t)(fs->fs_ipg * mincg));
}
/*
* Compute the various limits used for optimal
* allocation of a directory inode.
* Try cylinder groups with >75% avgifree and avgbfree.
* Avoid cylinder groups with no free blocks or inodes as that
* triggers an I/O-expensive cylinder group scan.
*/
maxndir = uimin(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
minifree = avgifree - avgifree / 4;
if (minifree < 1)
minifree = 1;
minbfree = avgbfree - avgbfree / 4;
if (minbfree < 1)
minbfree = 1;
cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg;
dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir;
if (avgndir != 0) {
	curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir;
if (dirsize < curdsz)
dirsize = curdsz;
}
if (cgsize < dirsize * 255)
	maxcontigdirs = (avgbfree * fs->fs_bsize) / dirsize;
else
maxcontigdirs = 255;
if (fs->fs_avgfpdir > 0)
maxcontigdirs = uimin(maxcontigdirs,
fs->fs_ipg / fs->fs_avgfpdir);
if (maxcontigdirs == 0)
maxcontigdirs = 1;
/*
* Limit number of dirs in one cg and reserve space for
* regular files, but only if we have no deficit in
* inodes or space.
*/
prefcg = ino_to_cg(fs, pip->i_number);
for (cg = prefcg; cg < fs->fs_ncg; cg++)
	if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
	    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
if (fs->fs_contigdirs[cg] < maxcontigdirs)
	return ((ino_t)(fs->fs_ipg * cg));
}
for (cg = 0; cg < prefcg; cg++)
	if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
	    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
if (fs->fs_contigdirs[cg] < maxcontigdirs)
	return ((ino_t)(fs->fs_ipg * cg));
}
/*
* This is a backstop when we are deficient in space.
*/
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
return ((ino_t)(fs->fs_ipg * cg));
for (cg = 0; cg < prefcg; cg++)
	if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
break;
return ((ino_t)(fs->fs_ipg * cg));
}
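/*
 * An illustrative sketch of the maxcontigdirs computation above; the
 * helper and parameter names are hypothetical.  The real code also
 * raises dirsize to the measured per-directory footprint when that is
 * larger, clamps the result against fs_ipg / fs_avgfpdir, and never
 * lets it drop below 1.
 */
#if 0
static u_int
example_maxcontigdirs(uint64_t avgbfree, uint64_t bsize,
    uint64_t avgfilesize, uint64_t avgfpdir, uint64_t cgsize)
{
	/* expected on-disk footprint of one average directory */
	uint64_t dirsize = avgfilesize * avgfpdir;

	if (cgsize < dirsize * 255) {
		/* a cg cannot hold 255 such dirs: scale by the free space */
		return (u_int)((avgbfree * bsize) / dirsize);
	}
	return 255;
}
#endif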
/*
* Select the desired position for the next block in a file. The file is
* logically divided into sections. The first section is composed of the
* direct blocks. Each additional section contains fs_maxbpg blocks.
*
* If no blocks have been allocated in the first section, the policy is to
* request a block in the same cylinder group as the inode that describes
* the file. If no blocks have been allocated in any other section, the
* policy is to place the section in a cylinder group with a greater than
* average number of free blocks. An appropriate cylinder group is found
* by using a rotor that sweeps the cylinder groups. When a new group of
* blocks is needed, the sweep begins in the cylinder group following the
* cylinder group from which the previous allocation was made. The sweep
* continues until a cylinder group with greater than the average number
* of free blocks is found. If the allocation is for the first block in an
* indirect block, the information on the previous allocation is unavailable;
* here a best guess is made based upon the logical block number being
* allocated.
*
* If a section is already partially allocated, the policy is to
* contiguously allocate fs_maxcontig blocks. The end of one of these
* contiguous blocks and the beginning of the next is laid out
* contiguously if possible.
*
* => um_lock held on entry and exit
*/
daddr_t
ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags,
int32_t *bap /* XXX ondisk32 */)
{
struct fs *fs;
u_int cg;
u_int avgbfree, startcg;
KASSERT(mutex_owned(&ip->i_ump->um_lock));
fs = ip->i_fs;
/*
* If allocating a contiguous file with B_CONTIG, use the hints
* in the inode extensions to return the desired block.
*
* For metadata (indirect blocks) return the address of where
* the first indirect block resides - we'll scan for the next
* available slot if we need to allocate more than one indirect
* block. For data, return the address of the actual block
* relative to the address of the first data block.
*/
if (flags & B_CONTIG) {
KASSERT(ip->i_ffs_first_data_blk != 0);
KASSERT(ip->i_ffs_first_indir_blk != 0);
if (flags & B_METAONLY)
return ip->i_ffs_first_indir_blk;
else
return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
}
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
cg = ino_to_cg(fs, ip->i_number);
return (cgbase(fs, cg) + fs->fs_frag);
}
/*
* Find a cylinder with greater than average number of
* unused data blocks.
*/
if (indx == 0 || bap[indx - 1] == 0)
startcg =
ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
else
startcg = dtog(fs,
ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
startcg %= fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
for (cg = startcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
return (cgbase(fs, cg) + fs->fs_frag);
}
for (cg = 0; cg < startcg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
return (cgbase(fs, cg) + fs->fs_frag);
}
return (0);
}
/*
* We just always try to lay things out contiguously.
*/
return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
}
daddr_t
ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags,
int64_t *bap)
{
struct fs *fs;
u_int cg;
u_int avgbfree, startcg;
KASSERT(mutex_owned(&ip->i_ump->um_lock));
fs = ip->i_fs;
/*
* If allocating a contiguous file with B_CONTIG, use the hints
* in the inode extensions to return the desired block.
*
* For metadata (indirect blocks) return the address of where
* the first indirect block resides - we'll scan for the next
* available slot if we need to allocate more than one indirect
* block. For data, return the address of the actual block
* relative to the address of the first data block.
*/
if (flags & B_CONTIG) {
KASSERT(ip->i_ffs_first_data_blk != 0);
KASSERT(ip->i_ffs_first_indir_blk != 0);
if (flags & B_METAONLY)
return ip->i_ffs_first_indir_blk;
else
return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
}
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
cg = ino_to_cg(fs, ip->i_number);
return (cgbase(fs, cg) + fs->fs_frag);
}
/*
* Find a cylinder with greater than average number of
* unused data blocks.
*/
if (indx == 0 || bap[indx - 1] == 0)
startcg =
ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
else
startcg = dtog(fs,
ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
startcg %= fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
for (cg = startcg; cg < fs->fs_ncg; cg++)
	if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
return (cgbase(fs, cg) + fs->fs_frag);
}
for (cg = 0; cg < startcg; cg++)
	if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
return (cgbase(fs, cg) + fs->fs_frag);
}
return (0);
}
/*
* We just always try to lay things out contiguously.
*/
return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
}
/*
* Implement the cylinder overflow algorithm.
*
* The policy implemented by this algorithm is:
* 1) allocate the block in its requested cylinder group.
* 2) quadratically rehash on the cylinder group number.
* 3) brute force search for a free block.
*
* => called with um_lock held
* => returns with um_lock released on success, held on failure
* (*allocator releases lock on success, retains lock on failure)
*/
/*VARARGS5*/
static daddr_t
ffs_hashalloc(struct inode *ip, u_int cg, daddr_t pref,
int size /* size for data blocks, mode for inodes */,
int realsize,
int flags,
daddr_t (*allocator)(struct inode *, u_int, daddr_t, int, int, int))
{
struct fs *fs;
daddr_t result;
u_int i, icg = cg;
fs = ip->i_fs;
/*
* 1: preferred cylinder group
*/
result = (*allocator)(ip, cg, pref, size, realsize, flags);
if (result)
return (result);
if (flags & B_CONTIG)
return (result);
/*
* 2: quadratic rehash
*/
for (i = 1; i < fs->fs_ncg; i *= 2) {
cg += i;
if (cg >= fs->fs_ncg)
cg -= fs->fs_ncg;
result = (*allocator)(ip, cg, 0, size, realsize, flags);
if (result)
return (result);
}
/*
* 3: brute force search
* Note that we start at i == 2, since 0 was checked initially,
* and 1 is always checked in the quadratic rehash.
*/
cg = (icg + 2) % fs->fs_ncg;
for (i = 2; i < fs->fs_ncg; i++) {
result = (*allocator)(ip, cg, 0, size, realsize, flags);
if (result)
return (result);
cg++;
if (cg == fs->fs_ncg)
cg = 0;
}
return (0);
}
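/*
 * An illustrative sketch of the probe order used above; the helper name
 * is hypothetical.  For a preferred cylinder group icg, the quadratic
 * rehash in step 2 visits icg+1, icg+3, icg+7, icg+15, ... (mod
 * fs_ncg); step 3 then walks the remaining groups linearly starting at
 * icg+2.
 */
#if 0
static u_int
example_rehash_probe(u_int icg, u_int ncg, u_int npasses)
{
	u_int cg = icg, i, n;

	/* accumulate the doubling offsets 1, 2, 4, ... for npasses steps */
	for (n = 0, i = 1; n < npasses && i < ncg; n++, i *= 2) {
		cg += i;
		if (cg >= ncg)
			cg -= ncg;
	}
	return cg;
}
#endif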
/*
* Determine whether a fragment can be extended.
*
* Check to see if the necessary fragments are available, and
* if they are, allocate them.
*
* => called with um_lock held
* => returns with um_lock released on success, held on failure
*/
static daddr_t
ffs_fragextend(struct inode *ip, u_int cg, daddr_t bprev, int osize, int nsize)
{
struct ufsmount *ump;
struct fs *fs;
struct cg *cgp;
struct buf *bp;
daddr_t bno;
int frags, bbase;
int i, error;
u_int8_t *blksfree;
fs = ip->i_fs;
ump = ip->i_ump;
KASSERT(mutex_owned(&ump->um_lock));
if (fs->fs_cs(fs, cg).cs_nffree < ffs_numfrags(fs, nsize - osize))
return (0);
frags = ffs_numfrags(fs, nsize);
bbase = ffs_fragnum(fs, bprev);
if (bbase > ffs_fragnum(fs, (bprev + frags - 1))) {
/* cannot extend across a block boundary */
return (0);
}
mutex_exit(&ump->um_lock);
error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, B_MODIFY, &bp);
if (error)
goto fail;
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
goto fail;
cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs));
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
(fs->fs_old_flags & FS_FLAGS_UPDATED))
cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs));
bno = dtogd(fs, bprev);
blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs));
for (i = ffs_numfrags(fs, osize); i < frags; i++)
	if (isclr(blksfree, bno + i))
goto fail;
/*
* The current fragment can be extended:
* deduct the count on the fragment being extended into,
* increase the count on the remaining fragment (if any),
* and allocate the extended piece.
*/
for (i = frags; i < fs->fs_frag - bbase; i++)
	if (isclr(blksfree, bno + i))
break;
ufs_add32(cgp->cg_frsum[i - ffs_numfrags(fs, osize)], -1,
    UFS_FSNEEDSWAP(fs));
if (i != frags)
	ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs));
mutex_enter(&ump->um_lock);
for (i = ffs_numfrags(fs, osize); i < frags; i++) {
clrbit(blksfree, bno + i);
ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs));
fs->fs_cstotal.cs_nffree--;
fs->fs_cs(fs, cg).cs_nffree--;
}
fs->fs_fmod = 1;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
bdwrite(bp);
return (bprev);
fail:
if (bp != NULL)
	brelse(bp, 0);
mutex_enter(&ump->um_lock);
return (0);
}
/*
* Determine whether a block can be allocated.
*
* Check to see if a block of the appropriate size is available,
* and if it is, allocate it.
*/
static daddr_t
ffs_alloccg(struct inode *ip, u_int cg, daddr_t bpref, int size, int realsize,
int flags)
{
struct ufsmount *ump;
struct fs *fs = ip->i_fs;
struct cg *cgp;
struct buf *bp;
int32_t bno;
daddr_t blkno;
int error, frags, allocsiz, i;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
ump = ip->i_ump;
KASSERT(mutex_owned(&ump->um_lock));
if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
return (0);
mutex_exit(&ump->um_lock);
error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, B_MODIFY, &bp);
if (error)
goto fail;
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap) || (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
goto fail;
cgp->cg_old_time = ufs_rw32(time_second, needswap);
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
(fs->fs_old_flags & FS_FLAGS_UPDATED))
cgp->cg_time = ufs_rw64(time_second, needswap);
if (size == fs->fs_bsize) {
mutex_enter(&ump->um_lock);
blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
/*
* If the size actually needed is smaller, free the extra blocks now.
* This is safe to call here, there is no outside reference
* to this block yet. It is not necessary to keep um_lock
* locked.
*/
if (realsize != 0 && realsize < size) {
	ffs_blkfree_common(ip->i_ump, ip->i_fs,
ip->i_devvp->v_rdev,
bp, blkno + ffs_numfrags(fs, realsize),
(long)(size - realsize), false);
}
bdwrite(bp);
return (blkno);
}
/*
* check to see if any fragments are already available
* allocsiz is the size which will be allocated, hacking
* it down to a smaller size if necessary
*/
blksfree = cg_blksfree(cgp, needswap);
frags = ffs_numfrags(fs, size);
for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
	if (cgp->cg_frsum[allocsiz] != 0)
break;
if (allocsiz == fs->fs_frag) {
/*
* no fragments were available, so a block will be
* allocated, and hacked up
*/
if (cgp->cg_cs.cs_nbfree == 0)
goto fail;
mutex_enter(&ump->um_lock);
blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
bno = dtogd(fs, blkno);
for (i = frags; i < fs->fs_frag; i++)
setbit(blksfree, bno + i);
i = fs->fs_frag - frags;
ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
fs->fs_cstotal.cs_nffree += i;
fs->fs_cs(fs, cg).cs_nffree += i;
fs->fs_fmod = 1;
ufs_add32(cgp->cg_frsum[i], 1, needswap);
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
bdwrite(bp);
return (blkno);
}
bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
#if 0
/*
* XXX fvdl mapsearch will panic, and never return -1
* also: returning NULL as daddr_t ?
*/
if (bno < 0)
goto fail;
#endif
for (i = 0; i < frags; i++)
clrbit(blksfree, bno + i);
mutex_enter(&ump->um_lock);
ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
fs->fs_cstotal.cs_nffree -= frags;
fs->fs_cs(fs, cg).cs_nffree -= frags;
fs->fs_fmod = 1;
ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap);
if (frags != allocsiz)
	ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap);
blkno = cgbase(fs, cg) + bno;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
bdwrite(bp);
return blkno;
fail:
if (bp != NULL)
	brelse(bp, 0);
mutex_enter(&ump->um_lock);
return (0);
}
/*
* Allocate a block in a cylinder group.
*
* This algorithm implements the following policy:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate the next available block on the block rotor for the
* specified cylinder group.
* Note that this routine only allocates fs_bsize blocks; these
* blocks may be fragmented by the routine that allocates them.
*/
static daddr_t
ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int realsize,
int flags)
{
struct fs *fs = ip->i_fs;
struct cg *cgp;
int cg;
daddr_t blkno;
int32_t bno;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(mutex_owned(&ip->i_ump->um_lock));
cgp = (struct cg *)bp->b_data;
blksfree = cg_blksfree(cgp, needswap);
if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
	bpref = ufs_rw32(cgp->cg_rotor, needswap);
} else {
bpref = ffs_blknum(fs, bpref);
bno = dtogd(fs, bpref);
/*
* if the requested block is available, use it
*/
if (ffs_isblock(fs, blksfree, ffs_fragstoblks(fs, bno)))
goto gotit;
/*
* if the requested data block isn't available and we are
* trying to allocate a contiguous file, return an error.
*/
if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG)
return (0);
}
/*
* Take the next available block in this cylinder group.
*/
bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
#if 0
/*
* XXX jdolecek ffs_mapsearch() succeeds or panics
*/
if (bno < 0)
return (0);
#endif
cgp->cg_rotor = ufs_rw32(bno, needswap);
gotit:
blkno = ffs_fragstoblks(fs, bno);
ffs_clrblock(fs, blksfree, blkno);
ffs_clusteracct(fs, cgp, blkno, -1);
ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
if ((fs->fs_magic == FS_UFS1_MAGIC) &&
((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
int cylno;
cylno = old_cbtocylno(fs, bno);
KASSERT(cylno >= 0);
KASSERT(cylno < fs->fs_old_ncyl);
KASSERT(old_cbtorpos(fs, bno) >= 0);
KASSERT(fs->fs_old_nrpos == 0 ||
    old_cbtorpos(fs, bno) < fs->fs_old_nrpos);
ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1,
needswap);
ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap);
}
fs->fs_fmod = 1;
cg = ufs_rw32(cgp->cg_cgx, needswap);
blkno = cgbase(fs, cg) + bno;
return (blkno);
}
/*
* Determine whether an inode can be allocated.
*
* Check to see if an inode is available, and if it is,
* allocate it using the following policy:
* 1) allocate the requested inode.
* 2) allocate the next available inode after the requested
* inode in the specified cylinder group.
*/
static daddr_t
ffs_nodealloccg(struct inode *ip, u_int cg, daddr_t ipref, int mode, int realsize,
int flags)
{
struct ufsmount *ump = ip->i_ump;
struct fs *fs = ip->i_fs;
struct cg *cgp;
struct buf *bp, *ibp;
u_int8_t *inosused;
int error, start, len, loc, map, i;
int32_t initediblk, maxiblk, irotor;
daddr_t nalloc;
struct ufs2_dinode *dp2;
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(mutex_owned(&ump->um_lock));
UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp);
if (fs->fs_cs(fs, cg).cs_nifree == 0)
return (0);
mutex_exit(&ump->um_lock);
ibp = NULL;
if (fs->fs_magic == FS_UFS2_MAGIC) {
initediblk = -1;
} else {
initediblk = fs->fs_ipg;
}
maxiblk = initediblk;
retry:
error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, B_MODIFY, &bp);
if (error)
goto fail;
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0)
goto fail;
if (ibp != NULL && initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
/* Another thread allocated more inodes so we retry the test. */
brelse(ibp, 0);
ibp = NULL;
}
/*
* Check to see if we need to initialize more inodes.
*/
if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) {
	initediblk = ufs_rw32(cgp->cg_initediblk, needswap);
maxiblk = initediblk;
nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap);
if (nalloc + FFS_INOPB(fs) > initediblk && initediblk < ufs_rw32(cgp->cg_niblk, needswap)) {
/*
* We have to release the cg buffer here to prevent
* a deadlock: reading the inode block may trigger a
* copy-on-write that uses this cg.
*/
brelse(bp, 0);
bp = NULL;
error = ffs_getblk(ip->i_devvp, FFS_FSBTODB(fs,
ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)),
FFS_NOBLK, fs->fs_bsize, false, &ibp);
if (error)
goto fail;
maxiblk += FFS_INOPB(fs);
goto retry;
}
}
cgp->cg_old_time = ufs_rw32(time_second, needswap);
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
    (fs->fs_old_flags & FS_FLAGS_UPDATED))
	cgp->cg_time = ufs_rw64(time_second, needswap);
inosused = cg_inosused(cgp, needswap);
if (ipref) {
ipref %= fs->fs_ipg;
/* safeguard to stay in (to be) allocated range */
if (ipref < maxiblk && isclr(inosused, ipref))
goto gotit;
}
irotor = ufs_rw32(cgp->cg_irotor, needswap);
KASSERTMSG(irotor < initediblk, "%s: allocation botch: cg=%d, irotor %d"
" out of bounds, initediblk=%d",
__func__, cg, irotor, initediblk);
start = irotor / NBBY;
len = howmany(maxiblk - irotor, NBBY);
loc = skpc(0xff, len, &inosused[start]);
if (loc == 0) {
len = start + 1;
start = 0;
loc = skpc(0xff, len, &inosused[0]);
if (loc == 0) {
panic("%s: map corrupted: cg=%d, irotor=%d, fs=%s",
__func__, cg, ufs_rw32(cgp->cg_irotor, needswap),
fs->fs_fsmnt);
/* NOTREACHED */
}
}
i = start + len - loc;
map = inosused[i] ^ 0xff;
if (map == 0) {
panic("%s: block not in map: fs=%s", __func__, fs->fs_fsmnt);
}
ipref = i * NBBY + ffs(map) - 1;
cgp->cg_irotor = ufs_rw32(ipref, needswap);
gotit:
KASSERTMSG(ipref < maxiblk, "%s: allocation botch: cg=%d attempt to "
"allocate inode index %d beyond max allocated index %d"
" of %d inodes/cg",
__func__, cg, (int)ipref, maxiblk, cgp->cg_niblk);
UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref,
mode);
/*
* Check to see if we need to initialize more inodes.
*/
if (ibp != NULL) {
	KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap));
memset(ibp->b_data, 0, fs->fs_bsize);
dp2 = (struct ufs2_dinode *)(ibp->b_data);
for (i = 0; i < FFS_INOPB(fs); i++) {
/*
* Don't bother to swap, it's supposed to be
* random, after all.
*/
dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1;
dp2++;
}
initediblk += FFS_INOPB(fs);
cgp->cg_initediblk = ufs_rw32(initediblk, needswap);
}
mutex_enter(&ump->um_lock);
ACTIVECG_CLR(fs, cg);
setbit(inosused, ipref);
ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap);
fs->fs_cstotal.cs_nifree--;
fs->fs_cs(fs, cg).cs_nifree--;
fs->fs_fmod = 1;
if ((mode & IFMT) == IFDIR) {
	ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap);
fs->fs_cstotal.cs_ndir++;
fs->fs_cs(fs, cg).cs_ndir++;
}
mutex_exit(&ump->um_lock);
if (ibp != NULL) {
bwrite(ibp);
bwrite(bp);
} else
bdwrite(bp);
return ((ino_t)(cg * fs->fs_ipg + ipref));
fail:
if (bp != NULL)
	brelse(bp, 0);
if (ibp != NULL)
	brelse(ibp, 0);
mutex_enter(&ump->um_lock);
return (0);
}
/*
* Allocate a block or fragment.
*
* The specified block or fragment is removed from the
* free map, possibly fragmenting a block in the process.
*
* This implementation should mirror ffs_blkfree().
*
* => um_lock not held on entry or exit
*/
int
ffs_blkalloc(struct inode *ip, daddr_t bno, long size)
{
int error;
error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size,
ip->i_dev, ip->i_uid);
if (error)
return error;
return ffs_blkalloc_ump(ip->i_ump, bno, size);
}
int
ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size)
{
struct fs *fs = ump->um_fs;
struct cg *cgp;
struct buf *bp;
int32_t fragno, cgbno;
int i, error, blk, frags, bbase;
u_int cg;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT((u_int)size <= fs->fs_bsize && ffs_fragoff(fs, size) == 0 &&
ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) <= fs->fs_frag);
KASSERT(bno < fs->fs_size);
cg = dtog(fs, bno);
error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, B_MODIFY, &bp);
if (error) {
return error;
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
brelse(bp, 0);
return EIO;
}
cgp->cg_old_time = ufs_rw32(time_second, needswap);
cgp->cg_time = ufs_rw64(time_second, needswap);
cgbno = dtogd(fs, bno);
blksfree = cg_blksfree(cgp, needswap);
mutex_enter(&ump->um_lock);
if (size == fs->fs_bsize) {
fragno = ffs_fragstoblks(fs, cgbno);
if (!ffs_isblock(fs, blksfree, fragno)) {
mutex_exit(&ump->um_lock);
brelse(bp, 0);
return EBUSY;
}
ffs_clrblock(fs, blksfree, fragno);
ffs_clusteracct(fs, cgp, fragno, -1);
ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, cg).cs_nbfree--;
} else {
bbase = cgbno - ffs_fragnum(fs, cgbno);
frags = ffs_numfrags(fs, size);
for (i = 0; i < frags; i++) {
if (isclr(blksfree, cgbno + i)) {
mutex_exit(&ump->um_lock);
brelse(bp, 0);
return EBUSY;
}
}
/*
* if a complete block is being split, account for it
*/
fragno = ffs_fragstoblks(fs, bbase);
if (ffs_isblock(fs, blksfree, fragno)) {
ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap);
fs->fs_cstotal.cs_nffree += fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag;
ffs_clusteracct(fs, cgp, fragno, -1);
ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, cg).cs_nbfree--;
}
/*
* decrement the counts associated with the old frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
/*
* allocate the fragment
*/
for (i = 0; i < frags; i++) {
clrbit(blksfree, cgbno + i);
}
ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap);
fs->fs_cstotal.cs_nffree -= i;
fs->fs_cs(fs, cg).cs_nffree -= i;
/*
* add back in counts associated with the new frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
}
fs->fs_fmod = 1;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
bdwrite(bp);
return 0;
}
/*
* Free a block or fragment.
*
* The specified block or fragment is placed back in the
* free map. If a fragment is deallocated, a possible
* block reassembly is checked.
*
* => um_lock not held on entry or exit
*/
static void
ffs_blkfree_cg(struct fs *fs, struct vnode *devvp, daddr_t bno, long size)
{
struct cg *cgp;
struct buf *bp;
struct ufsmount *ump;
daddr_t cgblkno;
int error;
u_int cg;
dev_t dev;
const bool devvp_is_snapshot = (devvp->v_type != VBLK);
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(!devvp_is_snapshot);
cg = dtog(fs, bno);
dev = devvp->v_rdev;
ump = VFSTOUFS(spec_node_getmountedfs(devvp));
KASSERT(fs == ump->um_fs);
cgblkno = FFS_FSBTODB(fs, cgtod(fs, cg));
error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
B_MODIFY, &bp);
if (error) {
return;
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
	brelse(bp, 0);
return;
}
ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
bdwrite(bp);
}
struct discardopdata {
struct work wk; /* must be first */
struct vnode *devvp;
daddr_t bno;
long size;
};
struct discarddata {
struct fs *fs;
struct discardopdata *entry;
long maxsize;
kmutex_t entrylk;
struct workqueue *wq;
int wqcnt, wqdraining;
kmutex_t wqlk;
kcondvar_t wqcv;
/* timer for flush? */
};
static void
ffs_blkfree_td(struct fs *fs, struct discardopdata *td)
{
struct mount *mp = spec_node_getmountedfs(td->devvp);
long todo;
int error;
while (td->size) {
todo = uimin(td->size,
ffs_lfragtosize(fs, (fs->fs_frag - ffs_fragnum(fs, td->bno))));
error = UFS_WAPBL_BEGIN(mp);
if (error) {
printf("ffs: failed to begin wapbl transaction"
" for discard: %d\n", error);
break;
}
ffs_blkfree_cg(fs, td->devvp, td->bno, todo);
UFS_WAPBL_END(mp);
td->bno += ffs_numfrags(fs, todo);
td->size -= todo;
}
}
static void
ffs_discardcb(struct work *wk, void *arg)
{
struct discardopdata *td = (void *)wk;
struct discarddata *ts = arg;
struct fs *fs = ts->fs;
off_t start, len;
#ifdef TRIMDEBUG
int error;
#endif
/* like FSBTODB but emits bytes; XXX move to fs.h */
#ifndef FFS_FSBTOBYTES
#define FFS_FSBTOBYTES(fs, b) ((b) << (fs)->fs_fshift)
#endif
start = FFS_FSBTOBYTES(fs, td->bno);
len = td->size;
vn_lock(td->devvp, LK_EXCLUSIVE | LK_RETRY);
#ifdef TRIMDEBUG
error =
#endif
VOP_FDISCARD(td->devvp, start, len);
VOP_UNLOCK(td->devvp);
#ifdef TRIMDEBUG
printf("trim(%" PRId64 ",%ld):%d\n", td->bno, td->size, error);
#endif
ffs_blkfree_td(fs, td);
kmem_free(td, sizeof(*td));
mutex_enter(&ts->wqlk);
ts->wqcnt--;
if (ts->wqdraining && !ts->wqcnt)
cv_signal(&ts->wqcv);
mutex_exit(&ts->wqlk);
}
void *
ffs_discard_init(struct vnode *devvp, struct fs *fs)
{
struct discarddata *ts;
int error;
ts = kmem_zalloc(sizeof (*ts), KM_SLEEP);
error = workqueue_create(&ts->wq, "trimwq", ffs_discardcb, ts,
PRI_USER, IPL_NONE, 0);
if (error) {
kmem_free(ts, sizeof (*ts));
return NULL;
}
mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE);
cv_init(&ts->wqcv, "trimwqcv");
ts->maxsize = 100*1024; /* XXX */
ts->fs = fs;
return ts;
}
void
ffs_discard_finish(void *vts, int flags)
{
struct discarddata *ts = vts;
struct discardopdata *td = NULL;
/* wait for workqueue to drain */
mutex_enter(&ts->wqlk);
if (ts->wqcnt) {
ts->wqdraining = 1;
cv_wait(&ts->wqcv, &ts->wqlk);
}
mutex_exit(&ts->wqlk);
mutex_enter(&ts->entrylk);
if (ts->entry) {
td = ts->entry;
ts->entry = NULL;
}
mutex_exit(&ts->entrylk);
if (td) {
/* XXX don't tell the disk, it's optional */
ffs_blkfree_td(ts->fs, td);
#ifdef TRIMDEBUG
printf("finish(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
kmem_free(td, sizeof(*td));
}
cv_destroy(&ts->wqcv);
mutex_destroy(&ts->entrylk);
mutex_destroy(&ts->wqlk);
workqueue_destroy(ts->wq);
kmem_free(ts, sizeof(*ts));
}
void
ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
ino_t inum)
{
struct ufsmount *ump;
int error;
dev_t dev;
struct discarddata *ts;
struct discardopdata *td;
dev = devvp->v_rdev;
ump = VFSTOUFS(spec_node_getmountedfs(devvp));
if (ffs_snapblkfree(fs, devvp, bno, size, inum))
return;
error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
if (error)
return;
if (!ump->um_discarddata) {
ffs_blkfree_cg(fs, devvp, bno, size);
return;
}
#ifdef TRIMDEBUG
printf("blkfree(%" PRId64 ",%ld)\n", bno, size);
#endif
ts = ump->um_discarddata;
td = NULL;
mutex_enter(&ts->entrylk);
if (ts->entry) {
td = ts->entry;
/* ffs deallocs backwards, check for prepend only */
if (td->bno == bno + ffs_numfrags(fs, size) && td->size + size <= ts->maxsize) {
td->bno = bno;
td->size += size;
if (td->size < ts->maxsize) {
#ifdef TRIMDEBUG
printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
mutex_exit(&ts->entrylk);
return;
}
size = 0; /* mark done */
}
ts->entry = NULL;
}
mutex_exit(&ts->entrylk);
if (td) {
#ifdef TRIMDEBUG
printf("enq old(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
mutex_enter(&ts->wqlk);
ts->wqcnt++;
mutex_exit(&ts->wqlk);
workqueue_enqueue(ts->wq, &td->wk, NULL);
}
if (!size)
return;
td = kmem_alloc(sizeof(*td), KM_SLEEP);
td->devvp = devvp;
td->bno = bno;
td->size = size;
if (td->size < ts->maxsize) { /* XXX always the case */
mutex_enter(&ts->entrylk);
if (!ts->entry) { /* possible race? */
#ifdef TRIMDEBUG
printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
ts->entry = td;
td = NULL;
}
mutex_exit(&ts->entrylk);
}
if (td) {
#ifdef TRIMDEBUG
printf("enq new(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
mutex_enter(&ts->wqlk);
ts->wqcnt++;
mutex_exit(&ts->wqlk);
workqueue_enqueue(ts->wq, &td->wk, NULL);
}
}
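/*
 * An illustrative sketch of the coalescing test used above; the helper
 * name is hypothetical.  Because ffs releases a file's blocks from the
 * end towards the start, a newly freed range can only ever abut the
 * front of the pending discard entry, so a single prepend check is
 * sufficient.
 */
#if 0
static bool
example_can_prepend(const struct discardopdata *pending, daddr_t bno,
    long size, long maxsize, struct fs *fs)
{
	return pending->bno == bno + ffs_numfrags(fs, size) &&
	    pending->size + size <= maxsize;
}
#endif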
/*
* Free a block or fragment from a snapshot cg copy.
*
* The specified block or fragment is placed back in the
* free map. If a fragment is deallocated, a possible
* block reassembly is checked.
*
* => um_lock not held on entry or exit
*/
void
ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
ino_t inum)
{
struct cg *cgp;
struct buf *bp;
struct ufsmount *ump;
daddr_t cgblkno;
int error, cg;
dev_t dev;
const bool devvp_is_snapshot = (devvp->v_type != VBLK);
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(devvp_is_snapshot);
cg = dtog(fs, bno);
dev = VTOI(devvp)->i_devvp->v_rdev;
ump = VFSTOUFS(devvp->v_mount);
cgblkno = ffs_fragstoblks(fs, cgtod(fs, cg));
error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
if (error)
return;
error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
B_MODIFY, &bp);
if (error) {
return;
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
brelse(bp, 0);
return;
}
ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
bdwrite(bp);
}
static void
ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot)
{
struct cg *cgp;
int32_t fragno, cgbno;
int i, blk, frags, bbase;
u_int cg;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
cg = dtog(fs, bno);
cgp = (struct cg *)bp->b_data;
cgp->cg_old_time = ufs_rw32(time_second, needswap);
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
(fs->fs_old_flags & FS_FLAGS_UPDATED))
cgp->cg_time = ufs_rw64(time_second, needswap);
cgbno = dtogd(fs, bno);
blksfree = cg_blksfree(cgp, needswap);
mutex_enter(&ump->um_lock);
if (size == fs->fs_bsize) {
fragno = ffs_fragstoblks(fs, cgbno);
if (!ffs_isfreeblock(fs, blksfree, fragno)) {
if (devvp_is_snapshot) {
mutex_exit(&ump->um_lock);
return;
}
panic("%s: freeing free block: dev = 0x%llx, block = %"
PRId64 ", fs = %s", __func__,
(unsigned long long)dev, bno, fs->fs_fsmnt);
}
ffs_setblock(fs, blksfree, fragno);
ffs_clusteracct(fs, cgp, fragno, 1);
ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
if ((fs->fs_magic == FS_UFS1_MAGIC) &&
((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
i = old_cbtocylno(fs, cgbno);
KASSERT(i >= 0);
KASSERT(i < fs->fs_old_ncyl);
KASSERT(old_cbtorpos(fs, cgbno) >= 0);
KASSERT(fs->fs_old_nrpos == 0 ||
    old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1,
needswap);
ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
}
} else {
bbase = cgbno - ffs_fragnum(fs, cgbno);
/*
* decrement the counts associated with the old frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
/*
* deallocate the fragment
*/
frags = ffs_numfrags(fs, size);
for (i = 0; i < frags; i++) {
if (isset(blksfree, cgbno + i)) {
panic("%s: freeing free frag: "
"dev = 0x%llx, block = %" PRId64
", fs = %s", __func__,
(unsigned long long)dev, bno + i,
fs->fs_fsmnt);
}
setbit(blksfree, cgbno + i);
}
ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
fs->fs_cstotal.cs_nffree += i;
fs->fs_cs(fs, cg).cs_nffree += i;
/*
* add back in counts associated with the new frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
/*
* if a complete block has been reassembled, account for it
*/
fragno = ffs_fragstoblks(fs, bbase);
if (ffs_isblock(fs, blksfree, fragno)) {
	ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
fs->fs_cstotal.cs_nffree -= fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
ffs_clusteracct(fs, cgp, fragno, 1);
ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
if ((fs->fs_magic == FS_UFS1_MAGIC) &&
((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
i = old_cbtocylno(fs, bbase);
KASSERT(i >= 0);
KASSERT(i < fs->fs_old_ncyl);
KASSERT(old_cbtorpos(fs, bbase) >= 0);
KASSERT(fs->fs_old_nrpos == 0 ||
    old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
bbase)], 1, needswap);
ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
}
}
}
fs->fs_fmod = 1;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
}
/*
* Free an inode.
*/
int
ffs_vfree(struct vnode *vp, ino_t ino, int mode)
{
return ffs_freefile(vp->v_mount, ino, mode);
}
/*
* Do the actual free operation.
* The specified inode is placed back in the free map.
*
* => um_lock not held on entry or exit
*/
int
ffs_freefile(struct mount *mp, ino_t ino, int mode)
{
struct ufsmount *ump = VFSTOUFS(mp);
struct fs *fs = ump->um_fs;
struct vnode *devvp;
struct cg *cgp;
struct buf *bp;
int error;
u_int cg;
daddr_t cgbno;
dev_t dev;
const int needswap = UFS_FSNEEDSWAP(fs);
cg = ino_to_cg(fs, ino);
devvp = ump->um_devvp;
dev = devvp->v_rdev;
cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
if (ino >= fs->fs_ipg * fs->fs_ncg)
panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
(long long)dev, (unsigned long long)ino, fs->fs_fsmnt);
error = bread(devvp, cgbno, (int)fs->fs_cgsize,
B_MODIFY, &bp);
if (error) {
return (error);
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
	brelse(bp, 0);
return (0);
}
ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);
bdwrite(bp);
return 0;
}
int
ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
{
struct ufsmount *ump;
struct cg *cgp;
struct buf *bp;
int error, cg;
daddr_t cgbno;
dev_t dev;
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(devvp->v_type != VBLK);
cg = ino_to_cg(fs, ino);
dev = VTOI(devvp)->i_devvp->v_rdev;
ump = VFSTOUFS(devvp->v_mount);
cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
if (ino >= fs->fs_ipg * fs->fs_ncg)
panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
(unsigned long long)dev, (unsigned long long)ino,
fs->fs_fsmnt);
error = bread(devvp, cgbno, (int)fs->fs_cgsize,
B_MODIFY, &bp);
if (error) {
return (error);
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
brelse(bp, 0);
return (0);
}
ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);
bdwrite(bp);
return 0;
}
static void
ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot)
{
u_int cg;
struct cg *cgp;
u_int8_t *inosused;
const int needswap = UFS_FSNEEDSWAP(fs);
ino_t cgino;
cg = ino_to_cg(fs, ino);
cgp = (struct cg *)bp->b_data;
cgp->cg_old_time = ufs_rw32(time_second, needswap);
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
    (fs->fs_old_flags & FS_FLAGS_UPDATED))
	cgp->cg_time = ufs_rw64(time_second, needswap);
inosused = cg_inosused(cgp, needswap);
cgino = ino % fs->fs_ipg;
if (isclr(inosused, cgino)) {
printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
(unsigned long long)dev, (unsigned long long)ino,
fs->fs_fsmnt);
if (fs->fs_ronly == 0)
panic("%s: freeing free inode", __func__);
}
clrbit(inosused, cgino);
if (!devvp_is_snapshot)
	UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp, ino, mode);
if (cgino < ufs_rw32(cgp->cg_irotor, needswap))
	cgp->cg_irotor = ufs_rw32(cgino, needswap);
ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
mutex_enter(&ump->um_lock);
fs->fs_cstotal.cs_nifree++;
fs->fs_cs(fs, cg).cs_nifree++;
if ((mode & IFMT) == IFDIR) {
	ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
fs->fs_cstotal.cs_ndir--;
fs->fs_cs(fs, cg).cs_ndir--;
}
fs->fs_fmod = 1;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
}
/*
* Check to see if a file is free.
*/
int
ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
{
struct cg *cgp;
struct buf *bp;
daddr_t cgbno;
int ret;
u_int cg;
u_int8_t *inosused;
const bool devvp_is_snapshot = (devvp->v_type != VBLK);
KASSERT(devvp_is_snapshot);
cg = ino_to_cg(fs, ino);
if (devvp_is_snapshot)
cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
else
cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
if (ino >= fs->fs_ipg * fs->fs_ncg)
return 1;
if (bread(devvp, cgbno, (int)fs->fs_cgsize, 0, &bp)) {
return 1;
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
brelse(bp, 0);
return 1;
}
inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
ino %= fs->fs_ipg;
ret = isclr(inosused, ino);
brelse(bp, 0);
return ret;
}
/*
* Find a block of the specified size in the specified cylinder group.
*
* It is a panic if a request is made to find a block when
* none are available.
*/
static int32_t
ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
{
int32_t bno;
int start, len, loc, i;
int blk, field, subfield, pos;
int ostart, olen;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
/* KASSERT(mutex_owned(&ump->um_lock)); */
/*
* find the fragment by searching through the free block
* map for an appropriate bit pattern
*/
if (bpref)
start = dtogd(fs, bpref) / NBBY;
else
start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
blksfree = cg_blksfree(cgp, needswap);
len = howmany(fs->fs_fpg, NBBY) - start;
ostart = start;
olen = len;
loc = scanc((u_int)len,
(const u_char *)&blksfree[start],
(const u_char *)fragtbl[fs->fs_frag],
(1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
if (loc == 0) {
len = start + 1;
start = 0;
loc = scanc((u_int)len,
(const u_char *)&blksfree[0],
(const u_char *)fragtbl[fs->fs_frag],
(1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
if (loc == 0) {
panic("%s: map corrupted: start=%d, len=%d, "
"fs = %s, offset=%d/%ld, cg %d", __func__,
ostart, olen, fs->fs_fsmnt,
ufs_rw32(cgp->cg_freeoff, needswap),
(long)blksfree - (long)cgp, cgp->cg_cgx);
/* NOTREACHED */
}
}
bno = (start + len - loc) * NBBY;
cgp->cg_frotor = ufs_rw32(bno, needswap);
/*
* found the byte in the map
* sift through the bits to find the selected frag
*/
for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
blk = blkmap(fs, blksfree, bno);
blk <<= 1;
field = around[allocsiz];
subfield = inside[allocsiz];
for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
if ((blk & field) == subfield)
return (bno + pos);
field <<= 1;
subfield <<= 1;
}
}
panic("%s: block not in map: bno=%d, fs=%s", __func__,
bno, fs->fs_fsmnt);
/* return (-1); */
}
/*
* Fserr prints the name of a file system with an error diagnostic.
*
* The form of the error message is:
* fs: error message
*/
static void
ffs_fserr(struct fs *fs, kauth_cred_t cred, const char *cp)
{
KASSERT(cred != NULL);
if (cred == NOCRED || cred == FSCRED) {
log(LOG_ERR, "pid %d, command %s, on %s: %s\n",
curproc->p_pid, curproc->p_comm,
fs->fs_fsmnt, cp);
} else {
log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
kauth_cred_getuid(cred), curproc->p_pid, curproc->p_comm,
fs->fs_fsmnt, cp);
}
}
/* $NetBSD: in_var.h,v 1.103 2022/11/19 08:00:51 yamt Exp $ */
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Public Access Networks Corporation ("Panix"). It was developed under
* contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1985, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_var.h 8.2 (Berkeley) 1/9/95
*/
#ifndef _NETINET_IN_VAR_H_
#define _NETINET_IN_VAR_H_
#include <sys/queue.h>
#define IN_IFF_TENTATIVE 0x01 /* tentative address */
#define IN_IFF_DUPLICATED 0x02 /* DAD detected duplicate */
#define IN_IFF_DETACHED 0x04 /* may be detached from the link */
#define IN_IFF_TRYTENTATIVE 0x08 /* intent to try DAD */
#define IN_IFFBITS \
"\020\1TENTATIVE\2DUPLICATED\3DETACHED\4TRYTENTATIVE"
/* do not input/output */
#define IN_IFF_NOTREADY \
(IN_IFF_TRYTENTATIVE | IN_IFF_TENTATIVE | IN_IFF_DUPLICATED)
/*
* Interface address, Internet version. One of these structures
* is allocated for each interface with an Internet address.
* The ifaddr structure contains the protocol-independent part
* of the structure and is assumed to be first.
*/
struct in_ifaddr {
struct ifaddr ia_ifa; /* protocol-independent info */
#define ia_ifp ia_ifa.ifa_ifp
#define ia_flags ia_ifa.ifa_flags
/* ia_{,sub}net{,mask} in host order */
u_int32_t ia_net; /* network number of interface */
u_int32_t ia_netmask; /* mask of net part */
u_int32_t ia_subnet; /* subnet number, including net */
u_int32_t ia_subnetmask; /* mask of subnet part */
struct in_addr ia_netbroadcast; /* to recognize net broadcasts */
LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */
TAILQ_ENTRY(in_ifaddr) ia_list; /* list of internet addresses */
struct sockaddr_in ia_addr; /* reserve space for interface name */
struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */
#define ia_broadaddr ia_dstaddr
struct sockaddr_in ia_sockmask; /* reserve space for general netmask */
LIST_HEAD(, in_multi) ia_multiaddrs; /* list of multicast addresses */
struct in_multi *ia_allhosts; /* multicast address record for
the allhosts multicast group */
uint16_t ia_idsalt; /* ip_id salt for this ia */
int ia4_flags; /* address flags */
void (*ia_dad_start) (struct ifaddr *); /* DAD start function */
void (*ia_dad_stop) (struct ifaddr *); /* DAD stop function */
time_t ia_dad_defended; /* last time of DAD defence */
#ifdef _KERNEL
struct pslist_entry ia_hash_pslist_entry;
struct pslist_entry ia_pslist_entry;
#endif
};
struct in_nbrinfo {
char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */
struct in_addr addr; /* IPv4 address of the neighbor */
long asked; /* number of queries already sent for this addr */
int state; /* reachability state */
int expire; /* lifetime for NDP state transition */
};
#ifdef _KERNEL
static __inline void
ia4_acquire(struct in_ifaddr *ia, struct psref *psref)
{
KASSERT(ia != NULL);
ifa_acquire(&ia->ia_ifa, psref);
}
static __inline void
ia4_release(struct in_ifaddr *ia, struct psref *psref)
{
if (ia == NULL)
return;
ifa_release(&ia->ia_ifa, psref);
}
#endif
struct in_aliasreq {
char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */
struct sockaddr_in ifra_addr;
struct sockaddr_in ifra_dstaddr;
#define ifra_broadaddr ifra_dstaddr
struct sockaddr_in ifra_mask;
};
/*
* Given a pointer to an in_ifaddr (ifaddr),
* return a pointer to the addr as a sockaddr_in.
*/
#define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr))
#ifdef _KERNEL
/* Note: 61, 127, 251, 509, 1021, 2039 are good. */
#ifndef IN_IFADDR_HASH_SIZE
#define IN_IFADDR_HASH_SIZE 509
#endif
/*
* This is a bit unconventional, and wastes a little bit of space, but
* because we want a very even hash function we don't use & in_ifaddrhash
* here, but rather % the hash size, which should obviously be prime.
*/
#define IN_IFADDR_HASH(x) in_ifaddrhashtbl[(u_long)(x) % IN_IFADDR_HASH_SIZE]
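/*
 * An illustrative sketch of the bucket selection above; the helper name
 * is hypothetical.  Reducing the address modulo the prime table size
 * lets the high-order (network) bits influence the bucket, which a
 * power-of-two mask on the low-order bits would not.
 */
#if 0
static u_long
example_in_ifaddr_bucket(u_long addr)
{
	return addr % IN_IFADDR_HASH_SIZE;
}
#endif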
LIST_HEAD(in_ifaddrhashhead, in_ifaddr); /* Type of the hash head */
TAILQ_HEAD(in_ifaddrhead, in_ifaddr); /* Type of the list head */
extern u_long in_ifaddrhash; /* size of hash table - 1 */
extern struct in_ifaddrhashhead *in_ifaddrhashtbl; /* Hash table head */
extern struct in_ifaddrhead in_ifaddrhead; /* List head (in ip_input) */
extern pserialize_t in_ifaddrhash_psz;
extern struct pslist_head *in_ifaddrhashtbl_pslist;
extern u_long in_ifaddrhash_pslist;
extern struct pslist_head in_ifaddrhead_pslist;
#define IN_IFADDR_HASH_PSLIST(x) \
in_ifaddrhashtbl_pslist[(u_long)(x) % IN_IFADDR_HASH_SIZE]
#define IN_ADDRHASH_READER_FOREACH(__ia, __addr) \
PSLIST_READER_FOREACH((__ia), &IN_IFADDR_HASH_PSLIST(__addr), \
struct in_ifaddr, ia_hash_pslist_entry)
#define IN_ADDRHASH_WRITER_INSERT_HEAD(__ia) \
PSLIST_WRITER_INSERT_HEAD( \
&IN_IFADDR_HASH_PSLIST((__ia)->ia_addr.sin_addr.s_addr), \
(__ia), ia_hash_pslist_entry)
#define IN_ADDRHASH_WRITER_REMOVE(__ia) \
PSLIST_WRITER_REMOVE((__ia), ia_hash_pslist_entry)
#define IN_ADDRHASH_ENTRY_INIT(__ia) \
PSLIST_ENTRY_INIT((__ia), ia_hash_pslist_entry);
#define IN_ADDRHASH_ENTRY_DESTROY(__ia) \
PSLIST_ENTRY_DESTROY((__ia), ia_hash_pslist_entry);
#define IN_ADDRHASH_READER_NEXT(__ia) \
PSLIST_READER_NEXT((__ia), struct in_ifaddr, ia_hash_pslist_entry)
#define IN_ADDRLIST_ENTRY_INIT(__ia) \
PSLIST_ENTRY_INIT((__ia), ia_pslist_entry)
#define IN_ADDRLIST_ENTRY_DESTROY(__ia) \
PSLIST_ENTRY_DESTROY((__ia), ia_pslist_entry);
#define IN_ADDRLIST_READER_EMPTY() \
(PSLIST_READER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \
ia_pslist_entry) == NULL)
#define IN_ADDRLIST_READER_FIRST() \
PSLIST_READER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \
ia_pslist_entry)
#define IN_ADDRLIST_READER_NEXT(__ia) \
PSLIST_READER_NEXT((__ia), struct in_ifaddr, ia_pslist_entry)
#define IN_ADDRLIST_READER_FOREACH(__ia) \
PSLIST_READER_FOREACH((__ia), &in_ifaddrhead_pslist, \
struct in_ifaddr, ia_pslist_entry)
#define IN_ADDRLIST_WRITER_INSERT_HEAD(__ia) \
PSLIST_WRITER_INSERT_HEAD(&in_ifaddrhead_pslist, (__ia), \
ia_pslist_entry)
#define IN_ADDRLIST_WRITER_REMOVE(__ia) \
PSLIST_WRITER_REMOVE((__ia), ia_pslist_entry)
#define IN_ADDRLIST_WRITER_FOREACH(__ia) \
PSLIST_WRITER_FOREACH((__ia), &in_ifaddrhead_pslist, \
struct in_ifaddr, ia_pslist_entry)
#define IN_ADDRLIST_WRITER_FIRST() \
PSLIST_WRITER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \
ia_pslist_entry)
#define IN_ADDRLIST_WRITER_NEXT(__ia) \
PSLIST_WRITER_NEXT((__ia), struct in_ifaddr, ia_pslist_entry)
#define IN_ADDRLIST_WRITER_INSERT_AFTER(__ia, __new) \
PSLIST_WRITER_INSERT_AFTER((__ia), (__new), ia_pslist_entry)
#define IN_ADDRLIST_WRITER_EMPTY() \
(PSLIST_WRITER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \
ia_pslist_entry) == NULL)
#define IN_ADDRLIST_WRITER_INSERT_TAIL(__new) \
do { \
if (IN_ADDRLIST_WRITER_EMPTY()) { \
IN_ADDRLIST_WRITER_INSERT_HEAD((__new)); \
} else { \
struct in_ifaddr *__ia; \
IN_ADDRLIST_WRITER_FOREACH(__ia) { \
if (IN_ADDRLIST_WRITER_NEXT(__ia) == NULL) { \
IN_ADDRLIST_WRITER_INSERT_AFTER(__ia,\
(__new)); \
break; \
} \
} \
} \
} while (0)
extern const int inetctlerrmap[];
/*
* Find whether an internet address (in_addr) belongs to one
* of our interfaces (in_ifaddr). NULL if the address isn't ours.
*/
static __inline struct in_ifaddr *
in_get_ia(struct in_addr addr)
{
struct in_ifaddr *ia;
IN_ADDRHASH_READER_FOREACH(ia, addr.s_addr) {
if (in_hosteq(ia->ia_addr.sin_addr, addr))
break;
}
return ia;
}
static __inline struct in_ifaddr *
in_get_ia_psref(struct in_addr addr, struct psref *psref)
{
struct in_ifaddr *ia;
int s;
s = pserialize_read_enter();
ia = in_get_ia(addr);
if (ia != NULL)
ia4_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
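/*
 * An illustrative usage sketch; the function, the error handling and
 * the EADDRNOTAVAIL return are hypothetical (struct psref comes from
 * <sys/psref.h>).  The pattern is: look the address up under
 * pserialize, take a passive reference, use the ifaddr, then release.
 */
#if 0
static int
example_use_local_addr(struct in_addr addr)
{
	struct in_ifaddr *ia;
	struct psref psref;

	ia = in_get_ia_psref(addr, &psref);
	if (ia == NULL)
		return EADDRNOTAVAIL;	/* the address is not ours */

	/* ... ia->ia_ifp and ia->ia_addr may be used safely here ... */

	ia4_release(ia, &psref);
	return 0;
}
#endif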
/*
* Find whether an internet address (in_addr) belongs to a specified
* interface. NULL if the address isn't ours.
*/
static __inline struct in_ifaddr *
in_get_ia_on_iface(struct in_addr addr, struct ifnet *ifp)
{
struct in_ifaddr *ia;
IN_ADDRHASH_READER_FOREACH(ia, addr.s_addr) {
if (in_hosteq(ia->ia_addr.sin_addr, addr) &&
ia->ia_ifp == ifp)
break;
}
return ia;
}
static __inline struct in_ifaddr *
in_get_ia_on_iface_psref(struct in_addr addr, struct ifnet *ifp, struct psref *psref)
{
struct in_ifaddr *ia;
int s;
s = pserialize_read_enter();
ia = in_get_ia_on_iface(addr, ifp);
if (ia != NULL)
ia4_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
/*
* Find an internet address structure (in_ifaddr) corresponding
* to a given interface (ifnet structure).
*/
static __inline struct in_ifaddr *
in_get_ia_from_ifp(struct ifnet *ifp)
{
struct ifaddr *ifa;
IFADDR_READER_FOREACH(ifa, ifp) {
	if (ifa->ifa_addr->sa_family == AF_INET)
break;
}
return ifatoia(ifa);
}
static __inline struct in_ifaddr *
in_get_ia_from_ifp_psref(struct ifnet *ifp, struct psref *psref)
{
struct in_ifaddr *ia;
int s;
s = pserialize_read_enter();
ia = in_get_ia_from_ifp(ifp);
if (ia != NULL)
ia4_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
#include <netinet/in_selsrc.h>
/*
* IPv4 per-interface state.
*/
struct in_ifinfo {
struct lltable *ii_llt; /* ARP state */
struct in_ifsysctl *ii_selsrc;
#ifdef MBUFTRACE
struct mowner ii_mowner;
#endif
};
#endif /* _KERNEL */
/*
* Internet multicast address structure. There is one of these for each IP
* multicast group to which this host belongs on a given network interface.
* They are kept in a linked list, rooted in the interface's in_ifaddr
* structure.
*/
struct router_info;
struct in_multi {
LIST_ENTRY(in_multi) inm_list; /* list of multicast addresses */
struct router_info *inm_rti; /* router version info */
struct ifnet *inm_ifp; /* back pointer to ifnet */
struct in_addr inm_addr; /* IP multicast address */
u_int inm_refcount; /* no. membership claims by sockets */
u_int inm_timer; /* IGMP membership report timer */
u_int inm_state; /* state of membership */
};
#ifdef _KERNEL
#include <net/pktqueue.h>
#include <sys/cprng.h>
extern pktqueue_t *ip_pktq;
extern int ip_dad_count; /* Duplicate Address Detection probes */
static inline bool
ip_dad_enabled(void)
{
#if NARP > 0
return ip_dad_count > 0;
#else
return false;
#endif
}
#if defined(INET) && NARP > 0
extern int arp_debug;
#define ARPLOGADDR(a) IN_PRINT(_ipbuf, a)
#define ARPLOG(level, fmt, args...) \
do { \
char _ipbuf[INET_ADDRSTRLEN]; \
(void)_ipbuf; \
if (arp_debug) \
log(level, "%s: " fmt, __func__, ##args); \
} while (/*CONSTCOND*/0)
#else
#define ARPLOG(level, fmt, args...)
#endif
/*
* Structure used by functions below to remember position when stepping
* through all of the in_multi records.
*/
struct in_multistep {
int i_n;
struct in_multi *i_inm;
};
bool in_multi_group(struct in_addr, struct ifnet *, int);
struct in_multi *in_first_multi(struct in_multistep *);
struct in_multi *in_next_multi(struct in_multistep *);
struct in_multi *in_lookup_multi(struct in_addr, struct ifnet *);
struct in_multi *in_addmulti(struct in_addr *, struct ifnet *);
void in_delmulti(struct in_multi *);
void in_multi_lock(int);
void in_multi_unlock(void);
int in_multi_lock_held(void);
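/*
* Sketch of stepping through all in_multi records with the in_multistep
* cursor described above.  (The RW_READER lock argument and the exact
* locking discipline are assumptions for illustration.)
*
*	struct in_multistep step;
*	struct in_multi *inm;
*
*	in_multi_lock(RW_READER);
*	for (inm = in_first_multi(&step); inm != NULL;
*	     inm = in_next_multi(&step)) {
*		... examine inm ...
*	}
*	in_multi_unlock();
*/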
struct ifaddr;
int in_ifinit(struct ifnet *, struct in_ifaddr *,
const struct sockaddr_in *, const struct sockaddr_in *, int);
void in_savemkludge(struct in_ifaddr *);
void in_restoremkludge(struct in_ifaddr *, struct ifnet *);
void in_purgemkludge(struct ifnet *);
void in_setmaxmtu(void);
int in_control(struct socket *, u_long, void *, struct ifnet *);
void in_purgeaddr(struct ifaddr *);
void in_purgeif(struct ifnet *);
void in_addrhash_insert(struct in_ifaddr *);
void in_addrhash_remove(struct in_ifaddr *);
int ipflow_fastforward(struct mbuf *);
extern uint16_t ip_id;
extern int ip_do_randomid;
static __inline uint16_t
ip_randomid(void)
{
uint16_t id = (uint16_t)cprng_fast32();
return id ? id : 1;
}
/*
* ip_newid_range: "allocate" num contiguous IP IDs.
*
* => Return the first ID.
*/
static __inline uint16_t
ip_newid_range(const struct in_ifaddr *ia, u_int num)
{
uint16_t id;
if (ip_do_randomid) {
/* XXX ignore num */
return ip_randomid();
}
/* Never allow an IP ID of 0 (detect wrap). */
if ((uint16_t)(ip_id + num) < ip_id) {
ip_id = 1;
}
id = htons(ip_id);
ip_id += num;
return id;
}
static __inline uint16_t
ip_newid(const struct in_ifaddr *ia)
{
return ip_newid_range(ia, 1);
}
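/*
* Typical use when filling in an IPv4 header (a sketch; the in_ifaddr
* argument is unused by the implementation above, so it may be NULL):
*
*	struct ip *ip = mtod(m, struct ip *);
*	ip->ip_id = ip_newid(ia);
*
* The sequential path already returns the identifier in network byte
* order (htons above); the random path is byte-order agnostic.
*/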
#ifdef SYSCTLFN_PROTO
int sysctl_inpcblist(SYSCTLFN_PROTO);
#endif
#define LLTABLE(ifp) \
((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt
#endif /* !_KERNEL */
/* INET6 stuff */
#include <netinet6/in6_var.h>
#endif /* !_NETINET_IN_VAR_H_ */
/* $NetBSD: tty_conf.c,v 1.57 2021/08/09 20:49:10 andvar Exp $ */
/*-
* Copyright (c) 2005, 2007 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tty_conf.c 8.5 (Berkeley) 1/9/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tty_conf.c,v 1.57 2021/08/09 20:49:10 andvar Exp $");
#define TTY_ALLOW_PRIVATE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/mutex.h>
#include <sys/queue.h>
static struct linesw termios_disc = {
.l_name = "termios",
.l_open = ttylopen,
.l_close = ttylclose,
.l_read = ttread,
.l_write = ttwrite,
.l_ioctl = ttynullioctl,
.l_rint = ttyinput,
.l_start = ttstart,
.l_modem = ttymodem,
.l_poll = ttpoll
};
/*
* This is for the benefit of old BSD TTY compatibility, but since it is
* identical to termios (except for the name), don't bother conditionalizing
* it.
*/
static struct linesw ntty_disc = { /* old NTTYDISC */
.l_name = "ntty",
.l_open = ttylopen,
.l_close = ttylclose,
.l_read = ttread,
.l_write = ttwrite,
.l_ioctl = ttynullioctl,
.l_rint = ttyinput,
.l_start = ttstart,
.l_modem = ttymodem,
.l_poll = ttpoll
};
static LIST_HEAD(, linesw) ttyldisc_list = LIST_HEAD_INITIALIZER(ttyldisc_head);
/*
* Note: We don't bother refcounting termios_disc and ntty_disc; they can't
* be removed from the list, and termios_disc is likely to have very many
* references (could we overflow the count?).
*/
#define TTYLDISC_ISSTATIC(disc) \
((disc) == &termios_disc || (disc) == &ntty_disc)
#define TTYLDISC_HOLD(disc) \
do { \
if (! TTYLDISC_ISSTATIC(disc)) { \
KASSERT((disc)->l_refcnt != UINT_MAX); \
(disc)->l_refcnt++; \
} \
} while (/*CONSTCOND*/0)
#define TTYLDISC_RELE(disc) \
do { \
if (! TTYLDISC_ISSTATIC(disc)) { \
KASSERT((disc)->l_refcnt != 0); \
(disc)->l_refcnt--; \
} \
} while (/*CONSTCOND*/0)
#define TTYLDISC_ISINUSE(disc) \
(TTYLDISC_ISSTATIC(disc) || (disc)->l_refcnt != 0)
/*
* Do-nothing version of the line-discipline-specific
* ioctl routine.
*/
/*ARGSUSED*/
int
ttynullioctl(struct tty *tp, u_long cmd, void *data, int flags, struct lwp *l)
{
return (EPASSTHROUGH);
}
/*
* Line-discipline-specific poll routine that
* always returns an error.
*/
/*ARGSUSED*/
int
ttyerrpoll(struct tty *tp, int events, struct lwp *l)
{
return (POLLERR);
}
void
ttyldisc_init(void)
{
if (ttyldisc_attach(&termios_disc) != 0)
panic("ttyldisc_init: termios_disc");
if (ttyldisc_attach(&ntty_disc) != 0)
panic("ttyldisc_init: ntty_disc");
}
static struct linesw *
ttyldisc_lookup_locked(const char *name)
{
struct linesw *disc;
LIST_FOREACH(disc, &ttyldisc_list, l_list) {
if (strcmp(name, disc->l_name) == 0)
return (disc);
}
return (NULL);
}
/*
* Look up a line discipline by its name. Caller holds a reference on
* the returned line discipline.
*/
struct linesw *
ttyldisc_lookup(const char *name)
{
struct linesw *disc;
mutex_spin_enter(&tty_lock);
disc = ttyldisc_lookup_locked(name);
if (disc != NULL)
TTYLDISC_HOLD(disc);
mutex_spin_exit(&tty_lock);
return (disc);
}
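/*
* Usage sketch (names illustrative): look up a discipline, switch the
* tty over to it, then drop the reference when it is no longer needed:
*
*	struct linesw *lp = ttyldisc_lookup("ppp");
*	if (lp == NULL)
*		return ENXIO;
*	... attach lp to the tty ...
*	ttyldisc_release(lp);
*/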
/*
* Look up a line discipline by its legacy number. Caller holds a
* reference on the returned line discipline.
*/
struct linesw *
ttyldisc_lookup_bynum(int num)
{
struct linesw *disc;
mutex_spin_enter(&tty_lock);
LIST_FOREACH(disc, &ttyldisc_list, l_list) {
if (disc->l_no == num) {
TTYLDISC_HOLD(disc);
mutex_spin_exit(&tty_lock);
return (disc);
}
}
mutex_spin_exit(&tty_lock);
return (NULL);
}
/*
* Release a reference on a line discipline previously added by
* ttyldisc_lookup() or ttyldisc_lookup_bynum().
*/
void
ttyldisc_release(struct linesw *disc)
{
if (disc == NULL)
return;
mutex_spin_enter(&tty_lock);
TTYLDISC_RELE(disc);
mutex_spin_exit(&tty_lock);
}
#define TTYLDISC_LEGACY_NUMBER_MIN 10
#define TTYLDISC_LEGACY_NUMBER_MAX INT_MAX
static void
ttyldisc_assign_legacy_number(struct linesw *disc)
{
static const struct {
const char *name;
int num;
} table[] = {
{ "termios", TTYDISC },
{ "ntty", 2 /* XXX old NTTYDISC */ },
{ "tablet", TABLDISC },
{ "slip", SLIPDISC },
{ "ppp", PPPDISC },
{ "strip", STRIPDISC },
{ "hdlc", HDLCDISC },
{ NULL, 0 }
};
struct linesw *ldisc;
int i;
for (i = 0; table[i].name != NULL; i++) {
if (strcmp(disc->l_name, table[i].name) == 0) {
disc->l_no = table[i].num;
return;
}
}
disc->l_no = TTYLDISC_LEGACY_NUMBER_MIN;
LIST_FOREACH(ldisc, &ttyldisc_list, l_list) {
if (disc->l_no == ldisc->l_no) {
KASSERT(disc->l_no < TTYLDISC_LEGACY_NUMBER_MAX);
disc->l_no++;
}
}
}
/*
* Register a line discipline.
*/
int
ttyldisc_attach(struct linesw *disc)
{
KASSERT(disc->l_name != NULL);
KASSERT(disc->l_open != NULL);
KASSERT(disc->l_close != NULL);
KASSERT(disc->l_read != NULL);
KASSERT(disc->l_write != NULL);
KASSERT(disc->l_ioctl != NULL);
KASSERT(disc->l_rint != NULL);
KASSERT(disc->l_start != NULL);
KASSERT(disc->l_modem != NULL);
KASSERT(disc->l_poll != NULL);
/* You are not allowed to exceed TTLINEDNAMELEN */
if (strlen(disc->l_name) >= TTLINEDNAMELEN)
return (ENAMETOOLONG);
mutex_spin_enter(&tty_lock);
if (ttyldisc_lookup_locked(disc->l_name) != NULL) {
mutex_spin_exit(&tty_lock);
return (EEXIST);
}
ttyldisc_assign_legacy_number(disc);
LIST_INSERT_HEAD(&ttyldisc_list, disc, l_list);
mutex_spin_exit(&tty_lock);
return (0);
}
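/*
* Registration sketch for a loadable line discipline (the "foo" name
* and foo* handlers are hypothetical; the tt* defaults are the ones
* used by the static disciplines above):
*
*	static struct linesw foo_disc = {
*		.l_name = "foo",
*		.l_open = fooopen,	.l_close = fooclose,
*		.l_read = ttread,	.l_write = ttwrite,
*		.l_ioctl = ttynullioctl, .l_rint = fooinput,
*		.l_start = ttstart,	.l_modem = ttymodem,
*		.l_poll = ttpoll,
*	};
*
*	error = ttyldisc_attach(&foo_disc);
*	...
*	error = ttyldisc_detach(&foo_disc);	(EBUSY while still in use)
*/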
/*
* Remove a line discipline.
*/
int
ttyldisc_detach(struct linesw *disc)
{
#ifdef DIAGNOSTIC
struct linesw *ldisc = ttyldisc_lookup(disc->l_name);
KASSERT(ldisc != NULL);
KASSERT(ldisc == disc);
ttyldisc_release(ldisc);
#endif
mutex_spin_enter(&tty_lock);
if (TTYLDISC_ISINUSE(disc)) {
mutex_spin_exit(&tty_lock);
return (EBUSY);
}
LIST_REMOVE(disc, l_list);
mutex_spin_exit(&tty_lock);
return (0);
}
/*
* Return the default line discipline.
*/
struct linesw *
ttyldisc_default(void)
{
return (&termios_disc);
}
/* $NetBSD: popcount32.c,v 1.5 2015/05/29 19:39:41 matt Exp $ */
/*-
* Copyright (c) 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Joerg Sonnenberger.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__RCSID("$NetBSD: popcount32.c,v 1.5 2015/05/29 19:39:41 matt Exp $");
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <limits.h>
#include <stdint.h>
#include <strings.h>
#else
#include <lib/libkern/libkern.h>
#include <machine/limits.h>
#endif
#ifndef popcount32 // might be a builtin
/*
* This is a hybrid bit-counting algorithm, combining parallel counting
* with a multiplication. The idea is to sum up the bits in each byte,
* so that the final accumulation can be done with a single multiplication.
* If the platform has a slow multiplication instruction, that step can be
* replaced by the commented-out shift/add version below.
*/
unsigned int
popcount32(uint32_t v)
{
unsigned int c;
v = v - ((v >> 1) & 0x55555555U);
v = (v & 0x33333333U) + ((v >> 2) & 0x33333333U);
v = (v + (v >> 4)) & 0x0f0f0f0fU;
c = (v * 0x01010101U) >> 24;
/*
* v = (v >> 16) + v;
* v = (v >> 8) + v;
* c = v & 255;
*/
return c;
}
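/*
* Worked example: for v = 0xf0f00f0f each byte has four bits set, so
* after the three masking steps v holds the per-byte counts 0x04040404,
* and the multiply by 0x01010101 accumulates 4+4+4+4 = 16 into the top
* byte, which the final shift by 24 extracts.
*/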
#if UINT_MAX == 0xffffffffU
__strong_alias(popcount, popcount32)
#endif
#if ULONG_MAX == 0xffffffffU
__strong_alias(popcountl, popcount32)
#endif
#endif /* !popcount32 */
/* $NetBSD: pslist.h,v 1.7 2019/12/01 15:28:19 riastradh Exp $ */
/*-
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_PSLIST_H
#define _SYS_PSLIST_H
#include <sys/param.h>
#include <sys/atomic.h>
struct pslist_head;
struct pslist_entry;
struct pslist_head {
struct pslist_entry *plh_first;
};
struct pslist_entry {
struct pslist_entry **ple_prevp;
struct pslist_entry *ple_next;
};
#ifdef _KERNEL
#define _PSLIST_ASSERT KASSERT
#else
#include <assert.h>
#define _PSLIST_ASSERT assert
#endif
#define _PSLIST_POISON ((void *)1ul)
/*
* Initialization. Allowed only when the caller has exclusive access,
* excluding writers and readers.
*/
static __inline void
pslist_init(struct pslist_head *head)
{
head->plh_first = NULL; /* not yet published, so no atomic */
}
static __inline void
pslist_destroy(struct pslist_head *head __diagused)
{
_PSLIST_ASSERT(head->plh_first == NULL);
}
static __inline void
pslist_entry_init(struct pslist_entry *entry)
{
entry->ple_next = NULL;
entry->ple_prevp = NULL;
}
static __inline void
pslist_entry_destroy(struct pslist_entry *entry)
{
_PSLIST_ASSERT(entry->ple_prevp == NULL);
/*
* Poison the next entry. If we used NULL here, then readers
* would think they were simply at the end of the list.
* Instead, cause readers to crash.
*/
atomic_store_relaxed(&entry->ple_next, _PSLIST_POISON);
}
/*
* Writer operations. Caller must exclude other writers, but not
* necessarily readers.
*
* Writes to initialize a new entry must precede its publication by
* writing to plh_first / ple_next / *ple_prevp.
*
* The ple_prevp field is serialized by the caller's exclusive lock and
* not read by readers, and hence its ordering relative to the internal
* memory barriers is inconsequential.
*/
static __inline void
pslist_writer_insert_head(struct pslist_head *head, struct pslist_entry *new)
{
_PSLIST_ASSERT(head->plh_first == NULL ||
head->plh_first->ple_prevp == &head->plh_first);
_PSLIST_ASSERT(new->ple_next == NULL);
_PSLIST_ASSERT(new->ple_prevp == NULL);
new->ple_prevp = &head->plh_first;
new->ple_next = head->plh_first; /* not yet published, so no atomic */
if (head->plh_first != NULL)
head->plh_first->ple_prevp = &new->ple_next;
atomic_store_release(&head->plh_first, new);
}
static __inline void
pslist_writer_insert_before(struct pslist_entry *entry,
struct pslist_entry *new)
{
_PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON);
_PSLIST_ASSERT(entry->ple_prevp != NULL);
_PSLIST_ASSERT(*entry->ple_prevp == entry);
_PSLIST_ASSERT(new->ple_next == NULL);
_PSLIST_ASSERT(new->ple_prevp == NULL);
new->ple_prevp = entry->ple_prevp;
new->ple_next = entry; /* not yet published, so no atomic */
/*
* Pairs with atomic_load_consume in pslist_reader_first or
* pslist_reader_next.
*/
atomic_store_release(entry->ple_prevp, new);
entry->ple_prevp = &new->ple_next;
}
static __inline void
pslist_writer_insert_after(struct pslist_entry *entry,
struct pslist_entry *new)
{
_PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON);
_PSLIST_ASSERT(entry->ple_prevp != NULL);
_PSLIST_ASSERT(*entry->ple_prevp == entry);
_PSLIST_ASSERT(new->ple_next == NULL);
_PSLIST_ASSERT(new->ple_prevp == NULL);
new->ple_prevp = &entry->ple_next;
new->ple_next = entry->ple_next; /* not yet published, so no atomic */
if (new->ple_next != NULL)
new->ple_next->ple_prevp = &new->ple_next;
/* Pairs with atomic_load_consume in pslist_reader_next. */
atomic_store_release(&entry->ple_next, new);
}
static __inline void
pslist_writer_remove(struct pslist_entry *entry)
{
_PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON);
_PSLIST_ASSERT(entry->ple_prevp != NULL);
_PSLIST_ASSERT(*entry->ple_prevp == entry);
if (entry->ple_next != NULL)
entry->ple_next->ple_prevp = entry->ple_prevp;
/*
* No need for atomic_store_release because there's no
* initialization that this must happen after -- the store
* transitions from a good state with the entry to a good state
* without the entry, both of which are valid for readers to
* witness.
*/
atomic_store_relaxed(entry->ple_prevp, entry->ple_next);
entry->ple_prevp = NULL;
/*
* Leave entry->ple_next intact so that any extant readers can
* continue iterating through the list. The caller must then
* wait for readers to drain, e.g. with pserialize_perform,
* before destroying and reusing the entry.
*/
}
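/*
* Sketch of the full removal protocol described above (the pserialize
* handle psz and the containing object obj are illustrative):
*
*	pslist_writer_remove(&obj->entry);
*	pserialize_perform(psz);	   <- wait for readers to drain
*	pslist_entry_destroy(&obj->entry);
*	kmem_free(obj, sizeof(*obj));
*/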
static __inline struct pslist_entry *
pslist_writer_first(const struct pslist_head *head)
{
return head->plh_first;
}
static __inline struct pslist_entry *
pslist_writer_next(const struct pslist_entry *entry)
{
_PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON);
return entry->ple_next;
}
static __inline void *
_pslist_writer_first_container(const struct pslist_head *head,
const ptrdiff_t offset)
{
struct pslist_entry *first = head->plh_first;
return (first == NULL ? NULL : (char *)first - offset);
}
static __inline void *
_pslist_writer_next_container(const struct pslist_entry *entry,
const ptrdiff_t offset)
{
struct pslist_entry *next = entry->ple_next;
_PSLIST_ASSERT(next != _PSLIST_POISON);
return (next == NULL ? NULL : (char *)next - offset);
}
/*
* Reader operations. Caller must block pserialize_perform or
* equivalent and be bound to a CPU. Only plh_first/ple_next may be
* read, and only with consuming memory order so that data-dependent
* loads happen afterward.
*/
static __inline struct pslist_entry *
pslist_reader_first(const struct pslist_head *head)
{
/*
* Pairs with atomic_store_release in pslist_writer_insert_head
* or pslist_writer_insert_before.
*/
return atomic_load_consume(&head->plh_first);
}
static __inline struct pslist_entry *
pslist_reader_next(const struct pslist_entry *entry)
{
/*
* Pairs with atomic_store_release in
* pslist_writer_insert_before or pslist_writer_insert_after.
*/
struct pslist_entry *next = atomic_load_consume(&entry->ple_next);
_PSLIST_ASSERT(next != _PSLIST_POISON);
return next;
}
static __inline void *
_pslist_reader_first_container(const struct pslist_head *head,
const ptrdiff_t offset)
{
struct pslist_entry *first = pslist_reader_first(head);
if (first == NULL)
return NULL;
return (char *)first - offset;
}
static __inline void *
_pslist_reader_next_container(const struct pslist_entry *entry,
const ptrdiff_t offset)
{
struct pslist_entry *next = pslist_reader_next(entry);
if (next == NULL)
return NULL;
return (char *)next - offset;
}
/*
* Type-safe macros for convenience.
*/
#if defined(__COVERITY__) || defined(__LGTM_BOT__)
#define _PSLIST_VALIDATE_PTRS(P, Q) 0
#define _PSLIST_VALIDATE_CONTAINER(P, T, F) 0
#else
#define _PSLIST_VALIDATE_PTRS(P, Q) \
(0 * sizeof((P) - (Q)) * sizeof(*(P)) * sizeof(*(Q)))
#define _PSLIST_VALIDATE_CONTAINER(P, T, F) \
(0 * sizeof((P) - &((T *)(((char *)(P)) - offsetof(T, F)))->F))
#endif
#define PSLIST_INITIALIZER { .plh_first = NULL }
#define PSLIST_ENTRY_INITIALIZER { .ple_next = NULL, .ple_prevp = NULL }
#define PSLIST_INIT(H) pslist_init((H))
#define PSLIST_DESTROY(H) pslist_destroy((H))
#define PSLIST_ENTRY_INIT(E, F) pslist_entry_init(&(E)->F)
#define PSLIST_ENTRY_DESTROY(E, F) pslist_entry_destroy(&(E)->F)
#define PSLIST_WRITER_INSERT_HEAD(H, V, F) \
pslist_writer_insert_head((H), &(V)->F)
#define PSLIST_WRITER_INSERT_BEFORE(E, N, F) \
pslist_writer_insert_before(&(E)->F + _PSLIST_VALIDATE_PTRS(E, N), \
&(N)->F)
#define PSLIST_WRITER_INSERT_AFTER(E, N, F) \
pslist_writer_insert_after(&(E)->F + _PSLIST_VALIDATE_PTRS(E, N), \
&(N)->F)
#define PSLIST_WRITER_REMOVE(E, F) \
pslist_writer_remove(&(E)->F)
#define PSLIST_WRITER_FIRST(H, T, F) \
((T *)(_pslist_writer_first_container((H), offsetof(T, F))) + \
_PSLIST_VALIDATE_CONTAINER(pslist_writer_first(H), T, F))
#define PSLIST_WRITER_NEXT(V, T, F) \
((T *)(_pslist_writer_next_container(&(V)->F, offsetof(T, F))) + \
_PSLIST_VALIDATE_CONTAINER(pslist_writer_next(&(V)->F), T, F))
#define PSLIST_WRITER_FOREACH(V, H, T, F) \
for ((V) = PSLIST_WRITER_FIRST((H), T, F); \
(V) != NULL; \
(V) = PSLIST_WRITER_NEXT((V), T, F))
#define PSLIST_READER_FIRST(H, T, F) \
((T *)(_pslist_reader_first_container((H), offsetof(T, F))) + \
_PSLIST_VALIDATE_CONTAINER(pslist_reader_first(H), T, F))
#define PSLIST_READER_NEXT(V, T, F) \
((T *)(_pslist_reader_next_container(&(V)->F, offsetof(T, F))) + \
_PSLIST_VALIDATE_CONTAINER(pslist_reader_next(&(V)->F), T, F))
#define PSLIST_READER_FOREACH(V, H, T, F) \
for ((V) = PSLIST_READER_FIRST((H), T, F); \
(V) != NULL; \
(V) = PSLIST_READER_NEXT((V), T, F))
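/*
* Usage sketch for the type-safe macros (struct and field names are
* hypothetical; the writer lock and pserialize object are assumed to
* be supplied by the caller):
*
*	struct frotz {
*		struct pslist_entry f_entry;
*		int f_value;
*	};
*	struct pslist_head frotz_list = PSLIST_INITIALIZER;
*
* Writer, with other writers excluded:
*
*	PSLIST_ENTRY_INIT(f, f_entry);
*	PSLIST_WRITER_INSERT_HEAD(&frotz_list, f, f_entry);
*
* Reader, inside a pserialize read section:
*
*	s = pserialize_read_enter();
*	PSLIST_READER_FOREACH(f, &frotz_list, struct frotz, f_entry) {
*		... read f->f_value ...
*	}
*	pserialize_read_exit(s);
*/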
#endif /* _SYS_PSLIST_H */
/* $NetBSD: raw_ip6.c,v 1.184 2024/02/24 21:41:13 mlelstv Exp $ */
/* $KAME: raw_ip6.c,v 1.82 2001/07/23 18:57:56 jinmei Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_ip.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: raw_ip6.c,v 1.184 2024/02/24 21:41:13 mlelstv Exp $");
#ifdef _KERNEL_OPT
#include "opt_ipsec.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/net_stats.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/ip6_mroute.h>
#include <netinet/icmp6.h>
#include <netinet6/icmp6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/scope6_var.h>
#include <netinet6/raw_ip6.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif
#include "faith.h"
#if defined(NFAITH) && 0 < NFAITH
#include <net/if_faith.h>
#endif
extern struct inpcbtable rawcbtable;
struct inpcbtable raw6cbtable;
#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa))
/*
* Raw interface to IP6 protocol.
*/
static percpu_t *rip6stat_percpu;
#define RIP6_STATINC(x) _NET_STATINC(rip6stat_percpu, x)
static void sysctl_net_inet6_raw6_setup(struct sysctllog **);
/*
* Initialize raw connection block queue.
*/
void
rip6_init(void)
{
sysctl_net_inet6_raw6_setup(NULL);
in6pcb_init(&raw6cbtable, 1, 1);
rip6stat_percpu = percpu_alloc(sizeof(uint64_t) * RIP6_NSTATS);
}
static void
rip6_sbappendaddr(struct inpcb *last, struct ip6_hdr *ip6,
const struct sockaddr *sa, int hlen, struct mbuf *n)
{
struct mbuf *opts = NULL;
if (last->inp_flags & IN6P_CONTROLOPTS ||
SOOPT_TIMESTAMP(last->inp_socket->so_options))
ip6_savecontrol(last, &opts, ip6, n);
m_adj(n, hlen);
if (sbappendaddr(&last->inp_socket->so_rcv, sa, n, opts) == 0) {
soroverflow(last->inp_socket);
m_freem(n);
if (opts)
m_freem(opts);
RIP6_STATINC(RIP6_STAT_FULLSOCK);
} else {
sorwakeup(last->inp_socket);
}
}
/*
* Set up generic address and protocol structures
* for the raw_input routine, then pass them along with
* the mbuf chain.
*/
int
rip6_input(struct mbuf **mp, int *offp, int proto)
{
struct mbuf *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct inpcb *inp;
struct inpcb *last = NULL;
struct sockaddr_in6 rip6src;
struct mbuf *n;
RIP6_STATINC(RIP6_STAT_IPACKETS);
#if defined(NFAITH) && 0 < NFAITH
if (faithprefix(&ip6->ip6_dst)) {
/* send icmp6 host unreach? */
m_freem(m);
return IPPROTO_DONE;
}
#endif
sockaddr_in6_init(&rip6src, &ip6->ip6_src, 0, 0, 0);
if (sa6_recoverscope(&rip6src) != 0) {
/* XXX: should be impossible. */
m_freem(m);
return IPPROTO_DONE;
}
TAILQ_FOREACH(inp, &raw6cbtable.inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET6)
continue;
if (in6p_ip6(inp).ip6_nxt &&
in6p_ip6(inp).ip6_nxt != proto)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) &&
!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &ip6->ip6_dst))
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)) &&
!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &ip6->ip6_src))
continue;
if (in6p_cksum(inp) != -1) {
RIP6_STATINC(RIP6_STAT_ISUM);
/*
* Although in6_cksum() does not need the position of
* the checksum field for verification, enforce that it
* is located within the packet. Userland has given
* a checksum offset; a packet too short for that offset is
* invalid. Avoid overflow with the user-supplied offset.
*/
if (m->m_pkthdr.len < *offp + 2 ||
m->m_pkthdr.len - *offp - 2 < in6p_cksum(inp) ||
in6_cksum(m, proto, *offp,
m->m_pkthdr.len - *offp)) {
RIP6_STATINC(RIP6_STAT_BADSUM);
continue;
}
}
if (last == NULL) {
;
}
#ifdef IPSEC
else if (ipsec_used && ipsec_in_reject(m, last)) {
/* do not inject data into pcb */
}
#endif
else if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) {
rip6_sbappendaddr(last, ip6, sin6tosa(&rip6src),
*offp, n);
}
last = inp;
}
#ifdef IPSEC
if (ipsec_used && last && ipsec_in_reject(m, last)) {
m_freem(m);
IP6_STATDEC(IP6_STAT_DELIVERED);
/* do not inject data into pcb */
} else
#endif
if (last != NULL) {
rip6_sbappendaddr(last, ip6, sin6tosa(&rip6src), *offp, m);
} else {
RIP6_STATINC(RIP6_STAT_NOSOCK);
if (m->m_flags & M_MCAST)
RIP6_STATINC(RIP6_STAT_NOSOCKMCAST);
if (proto == IPPROTO_NONE)
m_freem(m);
else {
int s;
struct ifnet *rcvif = m_get_rcvif(m, &s);
const int prvnxt = ip6_get_prevhdr(m, *offp);
in6_ifstat_inc(rcvif, ifs6_in_protounknown);
m_put_rcvif(rcvif, &s);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_NEXTHEADER,
prvnxt);
}
IP6_STATDEC(IP6_STAT_DELIVERED);
}
return IPPROTO_DONE;
}
void *
rip6_ctlinput(int cmd, const struct sockaddr *sa, void *d)
{
struct ip6_hdr *ip6;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void *cmdarg;
void (*notify)(struct inpcb *, int) = in6pcb_rtchange;
int nxt;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
if (PRC_IS_REDIRECT(cmd))
notify = in6pcb_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (cmd == PRC_MSGSIZE)
; /* special code is present, see below */
else if (inet6ctlerrmap[cmd] == 0)
return NULL;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
ip6 = ip6cp->ip6c_ip6;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
nxt = ip6cp->ip6c_nxt;
} else {
ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
nxt = -1;
}
if (ip6 && cmd == PRC_MSGSIZE) {
const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa;
int valid = 0;
struct inpcb *inp;
/*
* Check to see if we have a valid raw IPv6 socket
* corresponding to the address in the ICMPv6 message
* payload, and whether the protocol (ip6_nxt) matches the socket.
* XXX chase extension headers, or pass final nxt value
* from icmp6_notify_error()
*/
inp = NULL;
inp = in6pcb_lookup(&raw6cbtable, &sa6->sin6_addr, 0,
(const struct in6_addr *)&sa6_src->sin6_addr, 0, 0, 0);
#if 0
if (!inp) {
/*
* As the use of sendto(2) is fairly popular,
* we may want to allow non-connected pcb too.
* But it could be too weak against attacks...
* We should at least check if the local
* address (= s) is really ours.
*/
inp = in6pcb_lookup_bound(&raw6cbtable,
&sa6->sin6_addr, 0, 0);
}
#endif
if (inp && in6p_ip6(inp).ip6_nxt &&
in6p_ip6(inp).ip6_nxt == nxt)
valid++;
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
/*
* Regardless of whether we called icmp6_mtudisc_update(),
* we need to call in6pcb_notify() to report the path MTU
* change to userland (RFC 3542), because some
* unconnected sockets may share the same destination
* and want to know the path MTU.
*/
}
(void) in6pcb_notify(&raw6cbtable, sa, 0,
sin6tocsa(sa6_src), 0, cmd, cmdarg, notify);
return NULL;
}
/*
* Generate the IPv6 header and pass the packet to ip6_output.
* Tack on any options the user may have set up with a control call.
*/
int
rip6_output(struct mbuf *m, struct socket * const so,
struct sockaddr_in6 * const dstsock, struct mbuf * const control)
{
struct in6_addr *dst;
struct ip6_hdr *ip6;
struct inpcb *inp;
u_int plen = m->m_pkthdr.len;
int error = 0;
struct ip6_pktopts opt, *optp = NULL;
struct ifnet *oifp = NULL;
int type, code; /* for ICMPv6 output statistics only */
int scope_ambiguous = 0;
int bound = curlwp_bind();
struct psref psref;
inp = sotoinpcb(so);
dst = &dstsock->sin6_addr;
if (control) {
if ((error = ip6_setpktopts(control, &opt,
in6p_outputopts(inp),
kauth_cred_get(), so->so_proto->pr_protocol)) != 0) {
goto bad;
}
optp = &opt;
} else
optp = in6p_outputopts(inp);
/*
* Check and convert scope zone ID into internal form.
* XXX: we may still need to determine the zone later.
*/
if (!(so->so_state & SS_ISCONNECTED)) {
if (dstsock->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(dstsock, ip6_use_defzone)) != 0)
goto bad;
}
/*
* For an ICMPv6 packet, we should know its type and code
* to update statistics.
*/
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
struct icmp6_hdr *icmp6;
if (m->m_len < sizeof(struct icmp6_hdr) &&
(m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) {
error = ENOBUFS;
goto bad;
}
icmp6 = mtod(m, struct icmp6_hdr *);
type = icmp6->icmp6_type;
code = icmp6->icmp6_code;
} else {
type = 0;
code = 0;
}
M_PREPEND(m, sizeof(*ip6), M_DONTWAIT);
if (!m) {
error = ENOBUFS;
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* The next header might not be ICMPv6, but use its pseudo-header anyway.
*/
ip6->ip6_dst = *dst;
/*
* Source address selection.
*/
error = in6_selectsrc(dstsock, optp, in6p_moptions(inp),
&inp->inp_route, &in6p_laddr(inp), &oifp, &psref, &ip6->ip6_src);
if (error != 0)
goto bad;
if (oifp && scope_ambiguous) {
/*
* The application should provide a proper zone ID, or the use of
* default zone IDs should be enabled. Unfortunately, some
* applications do not behave as they should, so we need a
* workaround. Even if an appropriate ID is not determined
* (when it is required), determine the zone ID based on the
* outgoing interface if we can determine that interface.
*/
error = in6_setscope(&dstsock->sin6_addr, oifp, NULL);
if (error != 0)
goto bad;
}
ip6->ip6_dst = dstsock->sin6_addr;
/* fill in the rest of the IPv6 header fields */
ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6_plen will be filled in by ip6_output, so don't fill it in here. */
ip6->ip6_nxt = in6p_ip6(inp).ip6_nxt;
ip6->ip6_hlim = in6pcb_selecthlim(inp, oifp);
if_put(oifp, &psref);
oifp = NULL;
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 ||
in6p_cksum(inp) != -1) {
const uint8_t nxt = ip6->ip6_nxt;
int off;
u_int16_t sum;
/* compute checksum */
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
off = offsetof(struct icmp6_hdr, icmp6_cksum);
else
off = in6p_cksum(inp);
if (plen < 2 || plen - 2 < off) {
error = EINVAL;
goto bad;
}
off += sizeof(struct ip6_hdr);
sum = 0;
m = m_copyback_cow(m, off, sizeof(sum), (void *)&sum,
M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
sum = in6_cksum(m, nxt, sizeof(*ip6), plen);
m = m_copyback_cow(m, off, sizeof(sum), (void *)&sum,
M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
}
{
struct ifnet *ret_oifp = NULL;
error = ip6_output(m, optp, &inp->inp_route, 0,
in6p_moptions(inp), inp, &ret_oifp);
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
if (ret_oifp)
icmp6_ifoutstat_inc(ret_oifp, type, code);
ICMP6_STATINC(ICMP6_STAT_OUTHIST + type);
} else
RIP6_STATINC(RIP6_STAT_OPACKETS);
}
goto freectl;
bad:
if (m)
m_freem(m);
freectl:
if (control) {
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
if_put(oifp, &psref);
curlwp_bindx(bound);
return error;
}
/*
* Raw IPv6 socket option processing.
*/
int
rip6_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int error = 0;
if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER) {
int optval;
/* need to fiddle w/ opt(IPPROTO_IPV6, IPV6_CHECKSUM)? */
if (op == PRCO_GETOPT) {
optval = 1;
error = sockopt_set(sopt, &optval, sizeof(optval));
} else if (op == PRCO_SETOPT) {
error = sockopt_getint(sopt, &optval);
if (error)
goto out;
if (optval == 0)
error = EINVAL;
}
goto out;
} else if (sopt->sopt_level != IPPROTO_IPV6)
return ip6_ctloutput(op, so, sopt);
switch (sopt->sopt_name) {
case MRT6_INIT:
case MRT6_DONE:
case MRT6_ADD_MIF:
case MRT6_DEL_MIF:
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
case MRT6_PIM:
if (op == PRCO_SETOPT)
error = ip6_mrouter_set(so, sopt);
else if (op == PRCO_GETOPT)
error = ip6_mrouter_get(so, sopt);
else
error = EINVAL;
break;
case IPV6_CHECKSUM:
return ip6_raw_ctloutput(op, so, sopt);
default:
return ip6_ctloutput(op, so, sopt);
}
out:
return error;
}
extern u_long rip6_sendspace;
extern u_long rip6_recvspace;
int
rip6_attach(struct socket *so, int proto)
{
struct inpcb *inp;
int s, error;
KASSERT(sotoinpcb(so) == NULL);
sosetlock(so);
error = kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_RAWSOCK,
KAUTH_ARG(AF_INET6),
KAUTH_ARG(SOCK_RAW),
KAUTH_ARG(so->so_proto->pr_protocol));
if (error) {
return error;
}
s = splsoftnet();
error = soreserve(so, rip6_sendspace, rip6_recvspace);
if (error) {
splx(s);
return error;
}
if ((error = inpcb_create(so, &raw6cbtable)) != 0) {
splx(s);
return error;
}
splx(s);
inp = sotoinpcb(so);
in6p_ip6(inp).ip6_nxt = proto;
in6p_cksum(inp) = -1;
in6p_icmp6filt(inp) = kmem_alloc(sizeof(struct icmp6_filter), KM_SLEEP);
ICMP6_FILTER_SETPASSALL(in6p_icmp6filt(inp));
KASSERT(solocked(so));
return error;
}
static void
rip6_detach(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
KASSERT(solocked(so));
KASSERT(inp != NULL);
if (so == ip6_mrouter) {
ip6_mrouter_done();
}
/* xxx: RSVP */
if (in6p_icmp6filt(inp) != NULL) {
kmem_free(in6p_icmp6filt(inp), sizeof(struct icmp6_filter));
in6p_icmp6filt(inp) = NULL;
}
inpcb_destroy(inp);
}
static int
rip6_accept(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_bind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
struct ifaddr *ifa = NULL;
int error = 0;
int s;
KASSERT(solocked(so));
KASSERT(inp != NULL);
KASSERT(nam != NULL);
if (addr->sin6_len != sizeof(*addr))
return EINVAL;
if (IFNET_READER_EMPTY() || addr->sin6_family != AF_INET6)
return EADDRNOTAVAIL;
if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0)
return error;
/*
* We don't support mapped addresses here; they would confuse
* users, so reject them.
*/
if (IN6_IS_ADDR_V4MAPPED(&addr->sin6_addr))
return EADDRNOTAVAIL;
s = pserialize_read_enter();
if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) &&
(ifa = ifa_ifwithaddr(sin6tosa(addr))) == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
if (ifa && (ifatoia6(ifa))->ia6_flags &
(IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED)) {
error = EADDRNOTAVAIL;
goto out;
}
in6p_laddr(inp) = addr->sin6_addr;
error = 0;
out:
pserialize_read_exit(s);
return error;
}
static int
rip6_listen(struct socket *so, struct lwp *l)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_connect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
struct in6_addr in6a;
struct ifnet *ifp = NULL;
int scope_ambiguous = 0;
int error = 0;
struct psref psref;
int bound;
KASSERT(solocked(so));
KASSERT(inp != NULL);
KASSERT(nam != NULL);
if (IFNET_READER_EMPTY())
return EADDRNOTAVAIL;
if (addr->sin6_family != AF_INET6)
return EAFNOSUPPORT;
if (addr->sin6_len != sizeof(*addr))
return EINVAL;
/*
* The application should provide a proper zone ID, or the use of
* default zone IDs should be enabled. Unfortunately, some
* applications do not behave as they should, so we need a
* workaround. Even if an appropriate ID is not determined,
* we'll see if we can determine the outgoing interface. If we
* can, determine the zone ID based on the interface below.
*/
if (addr->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0)
return error;
bound = curlwp_bind();
/* Source address selection. XXX: need pcblookup? */
error = in6_selectsrc(addr, in6p_outputopts(inp),
in6p_moptions(inp), &inp->inp_route,
&in6p_laddr(inp), &ifp, &psref, &in6a);
if (error != 0)
goto out;
/* XXX: see above */
if (ifp && scope_ambiguous &&
(error = in6_setscope(&addr->sin6_addr, ifp, NULL)) != 0) {
goto out;
}
in6p_laddr(inp) = in6a;
in6p_faddr(inp) = addr->sin6_addr;
soisconnected(so);
out:
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
static int
rip6_connect2(struct socket *so, struct socket *so2)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_disconnect(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
KASSERT(solocked(so));
KASSERT(inp != NULL);
if ((so->so_state & SS_ISCONNECTED) == 0)
return ENOTCONN;
in6p_faddr(inp) = in6addr_any;
so->so_state &= ~SS_ISCONNECTED; /* XXX */
return 0;
}
static int
rip6_shutdown(struct socket *so)
{
KASSERT(solocked(so));
/*
* Mark the connection as being incapable of further output.
*/
socantsendmore(so);
return 0;
}
static int
rip6_abort(struct socket *so)
{
KASSERT(solocked(so));
soisdisconnected(so);
rip6_detach(so);
return 0;
}
static int
rip6_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp)
{
return in6_control(so, cmd, nam, ifp);
}
static int
rip6_stat(struct socket *so, struct stat *ub)
{
KASSERT(solocked(so));
/* stat: don't bother with a blocksize */
return 0;
}
static int
rip6_peeraddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotoinpcb(so) != NULL);
KASSERT(nam != NULL);
in6pcb_fetch_peeraddr(sotoinpcb(so), (struct sockaddr_in6 *)nam);
return 0;
}
static int
rip6_sockaddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotoinpcb(so) != NULL);
KASSERT(nam != NULL);
in6pcb_fetch_sockaddr(sotoinpcb(so), (struct sockaddr_in6 *)nam);
return 0;
}
static int
rip6_rcvd(struct socket *so, int flags, struct lwp *l)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_recvoob(struct socket *so, struct mbuf *m, int flags)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in6 tmp;
struct sockaddr_in6 *dst;
int error = 0;
KASSERT(solocked(so));
KASSERT(inp != NULL);
KASSERT(m != NULL);
/*
* Ship a packet out. The appropriate raw output
* routine handles any messaging necessary.
*/
/* always copy sockaddr to avoid overwrites */
if (so->so_state & SS_ISCONNECTED) {
if (nam) {
error = EISCONN;
goto release;
}
/* XXX */
sockaddr_in6_init(&tmp, &in6p_faddr(inp), 0, 0, 0);
dst = &tmp;
} else {
if (nam == NULL) {
error = ENOTCONN;
goto release;
}
tmp = *(struct sockaddr_in6 *)nam;
dst = &tmp;
if (dst->sin6_family != AF_INET6) {
error = EAFNOSUPPORT;
goto release;
}
if (dst->sin6_len != sizeof(*dst)) {
error = EINVAL;
goto release;
}
}
error = rip6_output(m, so, dst, control);
m = NULL;
release:
if (m)
m_freem(m);
return error;
}
static int
rip6_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control)
{
KASSERT(solocked(so));
m_freem(m);
m_freem(control);
return EOPNOTSUPP;
}
static int
rip6_purgeif(struct socket *so, struct ifnet *ifp)
{
mutex_enter(softnet_lock);
in6pcb_purgeif0(&raw6cbtable, ifp);
#ifdef NET_MPSAFE
mutex_exit(softnet_lock);
#endif
in6_purgeif(ifp);
#ifdef NET_MPSAFE
mutex_enter(softnet_lock);
#endif
in6pcb_purgeif(&raw6cbtable, ifp);
mutex_exit(softnet_lock);
return 0;
}
static int
sysctl_net_inet6_raw6_stats(SYSCTLFN_ARGS)
{
return (NETSTAT_SYSCTL(rip6stat_percpu, RIP6_NSTATS));
}
static void
sysctl_net_inet6_raw6_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "raw6",
SYSCTL_DESCR("Raw IPv6 settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_RAW, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "pcblist",
SYSCTL_DESCR("Raw IPv6 control block list"),
sysctl_inpcblist, 0, &raw6cbtable, 0,
CTL_NET, PF_INET6, IPPROTO_RAW,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("Raw IPv6 statistics"),
sysctl_net_inet6_raw6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_RAW, RAW6CTL_STATS,
CTL_EOL);
}
PR_WRAP_USRREQS(rip6)
#define rip6_attach rip6_attach_wrapper
#define rip6_detach rip6_detach_wrapper
#define rip6_accept rip6_accept_wrapper
#define rip6_bind rip6_bind_wrapper
#define rip6_listen rip6_listen_wrapper
#define rip6_connect rip6_connect_wrapper
#define rip6_connect2 rip6_connect2_wrapper
#define rip6_disconnect rip6_disconnect_wrapper
#define rip6_shutdown rip6_shutdown_wrapper
#define rip6_abort rip6_abort_wrapper
#define rip6_ioctl rip6_ioctl_wrapper
#define rip6_stat rip6_stat_wrapper
#define rip6_peeraddr rip6_peeraddr_wrapper
#define rip6_sockaddr rip6_sockaddr_wrapper
#define rip6_rcvd rip6_rcvd_wrapper
#define rip6_recvoob rip6_recvoob_wrapper
#define rip6_send rip6_send_wrapper
#define rip6_sendoob rip6_sendoob_wrapper
#define rip6_purgeif rip6_purgeif_wrapper
const struct pr_usrreqs rip6_usrreqs = {
.pr_attach = rip6_attach,
.pr_detach = rip6_detach,
.pr_accept = rip6_accept,
.pr_bind = rip6_bind,
.pr_listen = rip6_listen,
.pr_connect = rip6_connect,
.pr_connect2 = rip6_connect2,
.pr_disconnect = rip6_disconnect,
.pr_shutdown = rip6_shutdown,
.pr_abort = rip6_abort,
.pr_ioctl = rip6_ioctl,
.pr_stat = rip6_stat,
.pr_peeraddr = rip6_peeraddr,
.pr_sockaddr = rip6_sockaddr,
.pr_rcvd = rip6_rcvd,
.pr_recvoob = rip6_recvoob,
.pr_send = rip6_send,
.pr_sendoob = rip6_sendoob,
.pr_purgeif = rip6_purgeif,
};
/* $NetBSD: fdesc_vfsops.c,v 1.96 2020/04/13 19:23:18 ad Exp $ */
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)fdesc_vfsops.c 8.10 (Berkeley) 5/14/95
*
* #Id: fdesc_vfsops.c,v 1.9 1993/04/06 15:28:33 jsp Exp #
*/
/*
* /dev/fd Filesystem
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: fdesc_vfsops.c,v 1.96 2020/04/13 19:23:18 ad Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/filedesc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/dirent.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/fdesc/fdesc.h>
MODULE(MODULE_CLASS_VFS, fdesc, NULL);
VFS_PROTOS(fdesc);
/*
* Mount the per-process file descriptors (/dev/fd)
*/
int
fdesc_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
int error = 0, ix;
struct vnode *rvp;
if (mp->mnt_flag & MNT_GETARGS) {
*data_len = 0;
return 0;
}
/*
* Update is a no-op
*/
if (mp->mnt_flag & MNT_UPDATE)
return (EOPNOTSUPP);
ix = FD_ROOT;
error = vcache_get(mp, &ix, sizeof(ix), &rvp);
if (error)
return error;
mp->mnt_stat.f_namemax = FDESC_MAXNAMLEN;
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_data = rvp;
vfs_getnewfsid(mp);
error = set_statvfs_info(path, UIO_USERSPACE, "fdesc", UIO_SYSSPACE,
mp->mnt_op->vfs_name, mp, l);
return error;
}
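/*
* Illustrative ways to mount the filesystem (a sketch; the shell
* command is the conventional one and the C call assumes the standard
* five-argument mount(2) signature):
*
*	mount -t fdesc fdesc /dev/fd
*
* or, from C:
*
*	mount(MOUNT_FDESC, "/dev/fd", 0, NULL, 0);
*/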
int
fdesc_start(struct mount *mp, int flags)
{
return (0);
}
int
fdesc_unmount(struct mount *mp, int mntflags)
{
int error;
int flags = 0;
struct vnode *rtvp = mp->mnt_data;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if (vrefcnt(rtvp) > 1 && (mntflags & MNT_FORCE) == 0)
return (EBUSY);
if ((error = vflush(mp, rtvp, flags)) != 0)
return (error);
/*
* Blow it away for future re-use
*/
vgone(rtvp);
mp->mnt_data = NULL;
return (0);
}
int
fdesc_root(struct mount *mp, int lktype, struct vnode **vpp)
{
struct vnode *vp;
/*
* Return locked reference to root.
*/
vp = mp->mnt_data;
vref(vp);
vn_lock(vp, lktype | LK_RETRY);
*vpp = vp;
return (0);
}
/*ARGSUSED*/
int
fdesc_sync(struct mount *mp, int waitfor,
kauth_cred_t uc)
{
return (0);
}
/*
* Fdesc flat namespace lookup.
* Currently unsupported.
*/
int
fdesc_vget(struct mount *mp, ino_t ino, int lktype,
struct vnode **vpp)
{
return (EOPNOTSUPP);
}
int
fdesc_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
int ix;
struct fdescnode *fd;
KASSERT(key_len == sizeof(ix));
memcpy(&ix, key, key_len);
fd = kmem_alloc(sizeof(struct fdescnode), KM_SLEEP);
fd->fd_fd = -1;
fd->fd_link = NULL;
fd->fd_ix = ix;
fd->fd_vnode = vp;
vp->v_tag = VT_FDESC;
vp->v_op = fdesc_vnodeop_p;
vp->v_data = fd;
switch (ix) {
case FD_ROOT:
fd->fd_type = Froot;
vp->v_type = VDIR;
vp->v_vflag |= VV_ROOT;
break;
case FD_DEVFD:
fd->fd_type = Fdevfd;
vp->v_type = VDIR;
break;
case FD_CTTY:
fd->fd_type = Fctty;
vp->v_type = VCHR;
break;
case FD_STDIN:
fd->fd_type = Flink;
fd->fd_link = "fd/0";
vp->v_type = VLNK;
break;
case FD_STDOUT:
fd->fd_type = Flink;
fd->fd_link = "fd/1";
vp->v_type = VLNK;
break;
case FD_STDERR:
fd->fd_type = Flink;
fd->fd_link = "fd/2";
vp->v_type = VLNK;
break;
default:
KASSERT(ix >= FD_DESC);
fd->fd_type = Fdesc;
fd->fd_fd = ix - FD_DESC;
vp->v_type = VNON;
break;
}
uvm_vnp_setsize(vp, 0);
*new_key = &fd->fd_ix;
return 0;
}
extern const struct vnodeopv_desc fdesc_vnodeop_opv_desc;
const struct vnodeopv_desc * const fdesc_vnodeopv_descs[] = {
&fdesc_vnodeop_opv_desc,
NULL,
};
struct vfsops fdesc_vfsops = {
.vfs_name = MOUNT_FDESC,
.vfs_min_mount_data = 0,
.vfs_mount = fdesc_mount,
.vfs_start = fdesc_start,
.vfs_unmount = fdesc_unmount,
.vfs_root = fdesc_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = genfs_statvfs,
.vfs_sync = fdesc_sync,
.vfs_vget = fdesc_vget,
.vfs_loadvnode = fdesc_loadvnode,
.vfs_fhtovp = (void *)eopnotsupp,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = fdesc_init,
.vfs_done = fdesc_done,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = fdesc_vnodeopv_descs
};
SYSCTL_SETUP(fdesc_sysctl_setup, "fdesc sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "fdesc",
SYSCTL_DESCR("File-descriptor file system"),
NULL, 0, NULL, 0,
CTL_VFS, 7, CTL_EOL);
/*
* XXX the "7" above could be dynamic, thereby eliminating one
* more instance of the "number to vfs" mapping problem, but
* "7" is the order as taken from sys/mount.h
*/
}
static int
fdesc_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&fdesc_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&fdesc_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/* $NetBSD: sco_upper.c,v 1.16 2014/08/05 07:55:32 rtr Exp $ */
/*-
* Copyright (c) 2006 Itronix Inc.
* All rights reserved.
*
* Written by Iain Hibbert for Itronix Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of Itronix Inc. may not be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sco_upper.c,v 1.16 2014/08/05 07:55:32 rtr Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <netbt/bluetooth.h>
#include <netbt/hci.h>
#include <netbt/sco.h>
/****************************************************************************
*
* SCO - Upper Protocol API
*/
struct sco_pcb_list sco_pcb = LIST_HEAD_INITIALIZER(sco_pcb);
/*
* sco_attach_pcb(handle, proto, upper)
*
* Attach a new instance of SCO pcb to handle
*/
int
sco_attach_pcb(struct sco_pcb **handle,
const struct btproto *proto, void *upper)
{
struct sco_pcb *pcb;
KASSERT(handle != NULL);
KASSERT(proto != NULL);
KASSERT(upper != NULL);
pcb = malloc(sizeof(struct sco_pcb), M_BLUETOOTH,
M_NOWAIT | M_ZERO);
if (pcb == NULL)
return ENOMEM;
pcb->sp_proto = proto;
pcb->sp_upper = upper;
LIST_INSERT_HEAD(&sco_pcb, pcb, sp_next);
*handle = pcb;
return 0;
}
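/*
* Sketch of how an upper layer typically drives this API (the
* sco_proto_ops table and the socket cookie "so" are hypothetical):
*
*	struct sco_pcb *pcb;
*
*	error = sco_attach_pcb(&pcb, &sco_proto_ops, so);
*	if (error == 0)
*		error = sco_bind_pcb(pcb, &local_sa);
*	if (error == 0)
*		error = sco_connect_pcb(pcb, &remote_sa);
*	...
*	sco_detach_pcb(&pcb);
*/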
/*
* sco_bind_pcb(pcb, sockaddr)
*
* Bind SCO pcb to local address
*/
int
sco_bind_pcb(struct sco_pcb *pcb, struct sockaddr_bt *addr)
{
if (pcb->sp_link != NULL || pcb->sp_flags & SP_LISTENING)
return EINVAL;
bdaddr_copy(&pcb->sp_laddr, &addr->bt_bdaddr);
return 0;
}
/*
* sco_sockaddr_pcb(pcb, sockaddr)
*
* Copy local address of PCB to sockaddr
*/
int
sco_sockaddr_pcb(struct sco_pcb *pcb, struct sockaddr_bt *addr)
{
memset(addr, 0, sizeof(struct sockaddr_bt));
addr->bt_len = sizeof(struct sockaddr_bt);
addr->bt_family = AF_BLUETOOTH;
bdaddr_copy(&addr->bt_bdaddr, &pcb->sp_laddr);
return 0;
}
/*
* sco_connect_pcb(pcb, sockaddr)
*
* Initiate a SCO connection to the destination address.
*/
int
sco_connect_pcb(struct sco_pcb *pcb, struct sockaddr_bt *dest)
{
hci_add_sco_con_cp cp;
struct hci_unit *unit;
struct hci_link *acl, *sco;
int err;
if (pcb->sp_flags & SP_LISTENING)
return EINVAL;
bdaddr_copy(&pcb->sp_raddr, &dest->bt_bdaddr);
if (bdaddr_any(&pcb->sp_raddr))
return EDESTADDRREQ;
if (bdaddr_any(&pcb->sp_laddr)) {
err = hci_route_lookup(&pcb->sp_laddr, &pcb->sp_raddr);
if (err)
return err;
}
unit = hci_unit_lookup(&pcb->sp_laddr);
if (unit == NULL)
return ENETDOWN;
/*
* We must have an already open ACL connection before we open the SCO
* connection, and since SCO connections don't happen on their own we
* will not open one; the application wanting this should have opened
* it previously.
*/
acl = hci_link_lookup_bdaddr(unit, &pcb->sp_raddr, HCI_LINK_ACL);
if (acl == NULL || acl->hl_state != HCI_LINK_OPEN)
return EHOSTUNREACH;
sco = hci_link_alloc(unit, &pcb->sp_raddr, HCI_LINK_SCO);
if (sco == NULL)
return ENOMEM;
sco->hl_link = hci_acl_open(unit, &pcb->sp_raddr);
KASSERT(sco->hl_link == acl);
cp.con_handle = htole16(acl->hl_handle);
cp.pkt_type = htole16(0x00e0); /* HV1, HV2, HV3 */
err = hci_send_cmd(unit, HCI_CMD_ADD_SCO_CON, &cp, sizeof(cp));
if (err) {
hci_link_free(sco, err);
return err;
}
sco->hl_sco = pcb;
pcb->sp_link = sco;
pcb->sp_mtu = unit->hci_max_sco_size;
return 0;
}
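/*
 * A note on the packet type mask above (for reference; bit values as
 * believed to be defined by the Bluetooth HCI specification and
 * netbt/hci.h): 0x00e0 is the OR of the HV1 (0x0020), HV2 (0x0040) and
 * HV3 (0x0080) SCO packet type bits, so the controller is free to use
 * any of the three packet types for the new connection.
 */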
/*
* sco_peeraddr_pcb(pcb, sockaddr)
*
* Copy remote address of SCO pcb to sockaddr
*/
int
sco_peeraddr_pcb(struct sco_pcb *pcb, struct sockaddr_bt *addr)
{
memset(addr, 0, sizeof(struct sockaddr_bt));
addr->bt_len = sizeof(struct sockaddr_bt);
addr->bt_family = AF_BLUETOOTH;
bdaddr_copy(&addr->bt_bdaddr, &pcb->sp_raddr);
return 0;
}
/*
* sco_disconnect_pcb(pcb, linger)
*
* Initiate disconnection of connected SCO pcb
*/
int
sco_disconnect_pcb(struct sco_pcb *pcb, int linger)
{
hci_discon_cp cp;
struct hci_link *sco;
int err;
sco = pcb->sp_link;
if (sco == NULL)
return EINVAL;
cp.con_handle = htole16(sco->hl_handle);
cp.reason = 0x13; /* "Remote User Terminated Connection" */
err = hci_send_cmd(sco->hl_unit, HCI_CMD_DISCONNECT, &cp, sizeof(cp));
if (err || linger == 0) {
sco->hl_sco = NULL;
pcb->sp_link = NULL;
hci_link_free(sco, err);
}
return err;
}
/*
* sco_detach_pcb(handle)
*
* Detach SCO pcb from handle and clear up
*/
void
sco_detach_pcb(struct sco_pcb **handle)
{
struct sco_pcb *pcb;
KASSERT(handle != NULL);
pcb = *handle;
*handle = NULL;
if (pcb->sp_link != NULL) {
sco_disconnect_pcb(pcb, 0);
pcb->sp_link = NULL;
}
LIST_REMOVE(pcb, sp_next);
free(pcb, M_BLUETOOTH);
}
/*
* sco_listen_pcb(pcb)
*
* Mark pcb as a listener.
*/
int
sco_listen_pcb(struct sco_pcb *pcb)
{
if (pcb->sp_link != NULL)
return EINVAL;
pcb->sp_flags |= SP_LISTENING;
return 0;
}
/*
* sco_send_pcb(pcb, mbuf)
*
* Send data on SCO pcb.
*
* Gross hackage: we just output the packet directly onto the unit queue.
* This will work fine for one channel per unit, but for more channels it
* really needs fixing. We set the context so that when the packet is sent,
* we can drop a record from the socket buffer.
*/
int
sco_send_pcb(struct sco_pcb *pcb, struct mbuf *m)
{
hci_scodata_hdr_t *hdr;
int plen;
if (pcb->sp_link == NULL) {
m_freem(m);
return EINVAL;
}
plen = m->m_pkthdr.len;
DPRINTFN(10, "%d bytes\n", plen);
/*
* This is a temporary limitation, as USB devices cannot
* handle SCO packet sizes that are not an integer number
* of Isochronous frames. See ubt(4)
*/
if (plen != pcb->sp_mtu) {
m_freem(m);
return EMSGSIZE;
}
M_PREPEND(m, sizeof(hci_scodata_hdr_t), M_DONTWAIT);
if (m == NULL)
return ENOMEM;
hdr = mtod(m, hci_scodata_hdr_t *);
hdr->type = HCI_SCO_DATA_PKT;
hdr->con_handle = htole16(pcb->sp_link->hl_handle);
hdr->length = plen;
pcb->sp_pending++;
M_SETCTX(m, pcb->sp_link);
hci_output_sco(pcb->sp_link->hl_unit, m);
return 0;
}
/*
* sco_setopt(pcb, sopt)
*
* Set SCO pcb options
*/
int
sco_setopt(struct sco_pcb *pcb, const struct sockopt *sopt)
{
int err = 0;
switch (sopt->sopt_name) {
default:
err = ENOPROTOOPT;
break;
}
return err;
}
/*
* sco_getopt(pcb, sopt)
*
* Get SCO pcb options
*/
int
sco_getopt(struct sco_pcb *pcb, struct sockopt *sopt)
{
switch (sopt->sopt_name) {
case SO_SCO_MTU:
return sockopt_set(sopt, &pcb->sp_mtu, sizeof(uint16_t));
case SO_SCO_HANDLE:
if (pcb->sp_link)
return sockopt_set(sopt,
&pcb->sp_link->hl_handle, sizeof(uint16_t));
return ENOTCONN;
default:
break;
}
return ENOPROTOOPT;
}
/* $NetBSD: tcp_output.c,v 1.219 2023/09/13 15:54:28 bouyer Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*-
* Copyright (c) 1997, 1998, 2001, 2005, 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
* This code is derived from software contributed to The NetBSD Foundation
* by Rui Paulo.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.219 2023/09/13 15:54:28 bouyer Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_tcp_debug.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#ifdef TCP_SIGNATURE
#include <sys/md5.h>
#endif
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/nd6.h>
#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#endif
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>
#include <netinet/tcp_congctl.h>
#include <netinet/tcp_debug.h>
#include <netinet/in_offload.h>
#include <netinet6/in6_offload.h>
/*
* Knob to enable Congestion Window Monitoring, and control
* the burst size it allows. Default burst is 4 packets, per
* the Internet draft.
*/
int tcp_cwm = 0;
int tcp_cwm_burstsize = 4;
int tcp_do_autosndbuf = 1;
int tcp_autosndbuf_inc = 8 * 1024;
int tcp_autosndbuf_max = 256 * 1024;
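/*
 * For reference (a sketch, not authoritative): these knobs are normally
 * tuned at run time through sysctl(8) rather than edited here, e.g.
 * something along the lines of
 *
 *	sysctl -w net.inet.tcp.cwm=1
 *	sysctl -w net.inet.tcp.cwm_burstsize=4
 *
 * The exact MIB names are created elsewhere (the TCP sysctl setup code)
 * and may differ; check "sysctl -a | grep tcp" on the running system.
 */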
#ifdef TCP_OUTPUT_COUNTERS
#include <sys/device.h>
extern struct evcnt tcp_output_bigheader;
extern struct evcnt tcp_output_predict_hit;
extern struct evcnt tcp_output_predict_miss;
extern struct evcnt tcp_output_copysmall;
extern struct evcnt tcp_output_copybig;
extern struct evcnt tcp_output_refbig;
#define TCP_OUTPUT_COUNTER_INCR(ev) (ev)->ev_count++
#else
#define TCP_OUTPUT_COUNTER_INCR(ev) /* nothing */
#endif /* TCP_OUTPUT_COUNTERS */
static int
tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep,
bool *alwaysfragp)
{
struct inpcb *inp = tp->t_inpcb;
struct socket *so = NULL;
struct rtentry *rt;
struct ifnet *ifp;
int size;
int hdrlen;
int optlen;
*alwaysfragp = false;
size = tcp_mssdflt;
switch (tp->t_family) {
case AF_INET:
hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
break;
#ifdef INET6
case AF_INET6:
hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
break;
#endif
default:
hdrlen = 1; /* prevent zero sized segments */
goto out;
}
rt = inpcb_rtentry(inp);
so = inp->inp_socket;
if (rt == NULL) {
goto out;
}
ifp = rt->rt_ifp;
if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != 0) {
#ifdef INET6
if (inp->inp_af == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
/*
* RFC2460 section 5, last paragraph: if path MTU is
* smaller than 1280, use 1280 as packet size and
* attach fragment header.
*/
size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag);
*alwaysfragp = true;
} else
size = rt->rt_rmx.rmx_mtu - hdrlen;
#else
size = rt->rt_rmx.rmx_mtu - hdrlen;
#endif
} else if (ifp->if_flags & IFF_LOOPBACK)
size = ifp->if_mtu - hdrlen;
else if (inp->inp_af == AF_INET && tp->t_mtudisc)
size = ifp->if_mtu - hdrlen;
else if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp)))
size = ifp->if_mtu - hdrlen;
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
/* mapped addr case */
struct in_addr d;
memcpy(&d, &in6p_faddr(inp).s6_addr32[3], sizeof(d));
if (tp->t_mtudisc || in_localaddr(d))
size = ifp->if_mtu - hdrlen;
} else {
/*
* for IPv6, path MTU discovery is always turned on,
* or the node must use packet size <= 1280.
*/
size = tp->t_mtudisc ? ifp->if_mtu : IPV6_MMTU;
size -= hdrlen;
}
}
#endif
inpcb_rtentry_unref(rt, inp);
out:
/*
* Now we must make room for whatever extra TCP/IP options are in
* the packet.
*/
optlen = tcp_optlen(tp);
/*
* XXX tp->t_ourmss should have the right size, but without this code
* fragmentation will occur... need more investigation
*/
if (inp->inp_af == AF_INET) {
#if defined(IPSEC)
if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND))
optlen += ipsec4_hdrsiz_tcp(tp);
#endif
optlen += ip_optlen(inp);
}
#ifdef INET6
if (inp->inp_af == AF_INET6 && tp->t_family == AF_INET) {
#if defined(IPSEC)
if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND))
optlen += ipsec4_hdrsiz_tcp(tp);
#endif
/* XXX size -= ip_optlen(in6p); */
} else if (inp->inp_af == AF_INET6) {
#if defined(IPSEC)
if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND))
optlen += ipsec6_hdrsiz_tcp(tp);
#endif
optlen += ip6_optlen(inp);
}
#endif
size -= optlen;
/*
* There may not be any room for data if the MTU is too small. This
* includes the zero-sized case.
*/
if (size <= 0) {
return EMSGSIZE;
}
/*
* *rxsegsizep holds *estimated* inbound segment size (estimation
* assumes that path MTU is the same for both ways). This is only
* for silly window avoidance; do not use the value for other purposes.
*
* ipseclen is subtracted from both sides; this may not be right.
* I'm not quite sure about this (could someone comment?).
*/
*txsegsizep = uimin(tp->t_peermss - optlen, size);
*rxsegsizep = uimin(tp->t_ourmss - optlen, size);
/*
* Never send more than half a buffer full. This ensures that we can
* always keep 2 packets on the wire, no matter what SO_SNDBUF is, and
* therefore acks will never be delayed unless we run out of data to
* transmit.
*/
if (so) {
*txsegsizep = uimin(so->so_snd.sb_hiwat >> 1, *txsegsizep);
}
/*
* A segment must at least store header + options
*/
if (*txsegsizep < hdrlen + optlen) {
return EMSGSIZE;
}
if (*txsegsizep != tp->t_segsz) {
/*
* If the new segment size is larger, we don't want to
* mess up the congestion window, but if it is smaller
* we'll have to reduce the congestion window to ensure
* that we don't get into trouble with initial windows
* and the rest. In any case, if the segment size
* has changed, chances are the path has, too, and
* our congestion window will be different.
*/
if (*txsegsizep < tp->t_segsz) {
tp->snd_cwnd = uimax((tp->snd_cwnd / tp->t_segsz)
* *txsegsizep, *txsegsizep);
tp->snd_ssthresh = uimax((tp->snd_ssthresh / tp->t_segsz)
* *txsegsizep, *txsegsizep);
}
tp->t_segsz = *txsegsizep;
}
return 0;
}
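/*
 * In rough terms (a summary sketch of the computation above, ignoring
 * the per-family and IPsec corner cases):
 *
 *	size      = (path MTU, if_mtu or tcp_mssdflt) - IP/TCP headers - optlen
 *	txsegsize = min(t_peermss - optlen, size, so_snd.sb_hiwat / 2)
 *	rxsegsize = min(t_ourmss  - optlen, size)
 *
 * and the congestion window is scaled down if the segment size shrank.
 */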
static int
tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off,
long len, int hdrlen, struct mbuf **mp)
{
struct mbuf *m, *m0;
uint64_t *tcps;
tcps = TCP_STAT_GETREF();
if (tp->t_force && len == 1)
tcps[TCP_STAT_SNDPROBE]++;
else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
tp->t_sndrexmitpack++;
tcps[TCP_STAT_SNDREXMITPACK]++;
tcps[TCP_STAT_SNDREXMITBYTE] += len;
} else {
tcps[TCP_STAT_SNDPACK]++;
tcps[TCP_STAT_SNDBYTE] += len;
}
TCP_STAT_PUTREF();
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (__predict_false(m == NULL))
return ENOBUFS;
MCLAIM(m, &tcp_tx_mowner);
/*
* XXX Because other code assumes headers will fit in
* XXX one header mbuf.
*
* (This code should almost *never* be run.)
*/
if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) {
TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader);
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
return ENOBUFS;
}
}
m->m_data += max_linkhdr;
m->m_len = hdrlen;
/*
* To avoid traversing the whole sb_mb chain for correct
* data to send, remember last sent mbuf, its offset and
* the sent size. When called the next time, see if the
* data to send is directly following the previous transfer.
* This is important for large TCP windows.
*/
if (off == 0 || tp->t_lastm == NULL ||
(tp->t_lastoff + tp->t_lastlen) != off) {
TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss);
/*
* Either a new packet or a retransmit.
* Start from the beginning.
*/
tp->t_lastm = so->so_snd.sb_mb;
tp->t_inoff = off;
} else {
TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit);
tp->t_inoff += tp->t_lastlen;
}
/* Traverse forward to next packet */
while (tp->t_inoff > 0) {
if (tp->t_lastm == NULL)
panic("tp->t_lastm == NULL"); if (tp->t_inoff < tp->t_lastm->m_len)
break;
tp->t_inoff -= tp->t_lastm->m_len;
tp->t_lastm = tp->t_lastm->m_next;
}
tp->t_lastoff = off;
tp->t_lastlen = len;
m0 = tp->t_lastm;
off = tp->t_inoff;
if (len <= M_TRAILINGSPACE(m)) {
m_copydata(m0, off, (int)len, mtod(m, char *) + hdrlen);
m->m_len += len;
TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall);
} else {
m->m_next = m_copym(m0, off, (int)len, M_DONTWAIT);
if (m->m_next == NULL) {
m_freem(m);
return ENOBUFS;
}
#ifdef TCP_OUTPUT_COUNTERS
if (m->m_next->m_flags & M_EXT)
TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig);
else
TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig);
#endif
}
*mp = m;
return 0;
}
/*
* Tcp output routine: figure out what should be sent and send it.
*/
int
tcp_output(struct tcpcb *tp)
{
struct rtentry *rt = NULL;
struct socket *so;
struct route *ro;
long len, win;
int off, flags, error;
struct mbuf *m;
struct ip *ip;
#ifdef INET6
struct ip6_hdr *ip6;
#endif
struct tcphdr *th;
u_char opt[MAX_TCPOPTLEN], *optp;
#define OPT_FITS(more) ((optlen + (more)) <= sizeof(opt))
unsigned optlen, hdrlen, packetlen;
unsigned int sack_numblks;
int idle, sendalot, txsegsize, rxsegsize;
int txsegsize_nosack;
int maxburst = TCP_MAXBURST;
int af; /* address family on the wire */
int iphdrlen;
int has_tso4, has_tso6;
int has_tso, use_tso;
bool alwaysfrag;
int sack_rxmit;
int sack_bytes_rxmt;
int ecn_tos;
struct sackhole *p;
#ifdef TCP_SIGNATURE
int sigoff = 0;
#endif
uint64_t *tcps;
so = tp->t_inpcb->inp_socket;
ro = &tp->t_inpcb->inp_route;
switch (af = tp->t_family) {
case AF_INET:
case AF_INET6:
if (tp->t_inpcb)
break;
return EINVAL;
default:
return EAFNOSUPPORT;
}
if (tcp_segsize(tp, &txsegsize, &rxsegsize, &alwaysfrag))
return EMSGSIZE;
idle = (tp->snd_max == tp->snd_una);
/*
* Determine if we can use TCP segmentation offload:
* - If we're using IPv4
* - If there is not an IPsec policy that prevents it
* - If the interface can do it
*/
has_tso4 = has_tso6 = false;
has_tso4 = tp->t_inpcb->inp_af == AF_INET &&
#if defined(IPSEC)
(!ipsec_used || ipsec_pcb_skip_ipsec(tp->t_inpcb->inp_sp, IPSEC_DIR_OUTBOUND)) &&
#endif
(rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL &&
(rt->rt_ifp->if_capenable & IFCAP_TSOv4) != 0;
if (rt != NULL) {
rtcache_unref(rt, &tp->t_inpcb->inp_route);
rt = NULL;
}
#if defined(INET6)
has_tso6 = tp->t_inpcb->inp_af == AF_INET6 &&
#if defined(IPSEC)
(!ipsec_used || ipsec_pcb_skip_ipsec(tp->t_inpcb->inp_sp, IPSEC_DIR_OUTBOUND)) &&
#endif
(rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL &&
(rt->rt_ifp->if_capenable & IFCAP_TSOv6) != 0;
if (rt != NULL)
rtcache_unref(rt, &tp->t_inpcb->inp_route);
#endif /* defined(INET6) */
has_tso = (has_tso4 || has_tso6) && !alwaysfrag;
/*
* Restart Window computation. From draft-floyd-incr-init-win-03:
*
* Optionally, a TCP MAY set the restart window to the
* minimum of the value used for the initial window and
* the current value of cwnd (in other words, using a
* larger value for the restart window should never increase
* the size of cwnd).
*/
if (tcp_cwm) {
/*
* Hughes/Touch/Heidemann Congestion Window Monitoring.
* Count the number of packets currently pending
* acknowledgement, and limit our congestion window
* to a pre-determined allowed burst size plus that count.
* This prevents bursting once all pending packets have
* been acknowledged (i.e. transmission is idle).
*
* XXX Link this to Initial Window?
*/
tp->snd_cwnd = uimin(tp->snd_cwnd,
(tcp_cwm_burstsize * txsegsize) +
(tp->snd_nxt - tp->snd_una));
} else {
if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) {
/*
* We have been idle for "a while" and no acks are
* expected to clock out any data we send --
* slow start to get ack "clock" running again.
*/
int ss = tcp_init_win;
if (tp->t_inpcb->inp_af == AF_INET &&
in_localaddr(in4p_faddr(tp->t_inpcb)))
ss = tcp_init_win_local;
#ifdef INET6
else if (tp->t_inpcb->inp_af == AF_INET6 &&
in6_localaddr(&in6p_faddr(tp->t_inpcb)))
ss = tcp_init_win_local;
#endif
tp->snd_cwnd = uimin(tp->snd_cwnd,
TCP_INITIAL_WINDOW(ss, txsegsize));
}
}
txsegsize_nosack = txsegsize;
again:
ecn_tos = 0;
use_tso = has_tso;
if ((tp->t_flags & (TF_ECN_SND_CWR|TF_ECN_SND_ECE)) != 0) {
/* don't duplicate CWR/ECE. */
use_tso = 0;
}
TCP_REASS_LOCK(tp);
sack_numblks = tcp_sack_numblks(tp);
if (sack_numblks) {
int sackoptlen;
sackoptlen = TCP_SACK_OPTLEN(sack_numblks);
if (sackoptlen > txsegsize_nosack) {
sack_numblks = 0; /* give up SACK */
txsegsize = txsegsize_nosack;
} else {
if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
/* don't duplicate D-SACK. */
use_tso = 0;
}
txsegsize = txsegsize_nosack - sackoptlen;
}
} else {
txsegsize = txsegsize_nosack;
}
/*
* Determine length of data that should be transmitted, and
* flags that should be used. If there is some data or critical
* controls (SYN, RST) to send, then transmit; otherwise,
* investigate further.
*
* Readjust SACK information to avoid resending duplicate data.
*/
if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
tcp_sack_adjust(tp);
sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
win = uimin(tp->snd_wnd, tp->snd_cwnd);
flags = tcp_outflags[tp->t_state];
/*
* Send any SACK-generated retransmissions. If we're explicitly trying
* to send out new data (when sendalot is 1), bypass this function.
* If we retransmit in fast recovery mode, decrement snd_cwnd, since
* we're replacing a (future) new transmission with a retransmission
* now, and we previously incremented snd_cwnd in tcp_input().
*/
/*
* Still in sack recovery, reset rxmit flag to zero.
*/
sack_rxmit = 0;
sack_bytes_rxmt = 0;
len = 0;
p = NULL;
do {
long cwin;
if (!TCP_SACK_ENABLED(tp))
break;
if (tp->t_partialacks < 0)
break;
p = tcp_sack_output(tp, &sack_bytes_rxmt);
if (p == NULL)
break;
cwin = uimin(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
/* Do not retransmit SACK segments beyond snd_recover */
if (SEQ_GT(p->end, tp->snd_recover)) {
/*
* (At least) part of sack hole extends beyond
* snd_recover. Check to see if we can rexmit data
* for this hole.
*/
if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
/*
* Can't rexmit any more data for this hole.
* That data will be rexmitted in the next
* sack recovery episode, when snd_recover
* moves past p->rxmit.
*/
p = NULL;
break;
}
/* Can rexmit part of the current hole */
len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit));
} else
len = ((long)ulmin(cwin, p->end - p->rxmit));
off = p->rxmit - tp->snd_una;
if (off + len > so->so_snd.sb_cc) {
/* 1 for TH_FIN */
KASSERT(off + len == so->so_snd.sb_cc + 1);
KASSERT(p->rxmit + len == tp->snd_max);
len = so->so_snd.sb_cc - off;
}
if (len > 0) {
sack_rxmit = 1;
sendalot = 1;
}
} while (/*CONSTCOND*/0);
/*
* If in persist timeout with window of 0, send 1 byte.
* Otherwise, if window is small but nonzero
* and timer expired, we will send what we can
* and go to transmit state.
*/
if (tp->t_force) {
if (win == 0) {
/*
* If we still have some data to send, then
* clear the FIN bit. Usually this would
* happen below when it realizes that we
* aren't sending all the data. However,
* if we have exactly 1 byte of unsent data,
* then it won't clear the FIN bit below,
* and if we are in persist state, we wind
* up sending the packet without recording
* that we sent the FIN bit.
*
* We can't just blindly clear the FIN bit,
* because if we don't have any more data
* to send then the probe will be the FIN
* itself.
*/
if (off < so->so_snd.sb_cc)
flags &= ~TH_FIN;
win = 1;
} else {
TCP_TIMER_DISARM(tp, TCPT_PERSIST);
tp->t_rxtshift = 0;
}
}
if (sack_rxmit == 0) {
if (TCP_SACK_ENABLED(tp) && tp->t_partialacks >= 0) {
long cwin;
/*
* We are inside of a SACK recovery episode and are
* sending new data, having retransmitted all the
* data possible in the scoreboard.
*/
if (tp->snd_wnd < so->so_snd.sb_cc) {
len = tp->snd_wnd - off;
flags &= ~TH_FIN;
} else {
len = so->so_snd.sb_cc - off;
}
/*
* From FreeBSD:
* Don't remove this (len > 0) check !
* We explicitly check for len > 0 here (although it
* isn't really necessary), to work around a gcc
* optimization issue - to force gcc to compute
* len above. Without this check, the computation
* of len is bungled by the optimizer.
*/
if (len > 0) {
cwin = tp->snd_cwnd -
(tp->snd_nxt - tp->sack_newdata) -
sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
if (cwin < len) {
len = cwin;
flags &= ~TH_FIN;
}
}
} else if (win < so->so_snd.sb_cc) {
len = win - off;
flags &= ~TH_FIN;
} else {
len = so->so_snd.sb_cc - off;
}
}
if (len < 0) {
/*
* If FIN has been sent but not acked,
* but we haven't been called to retransmit,
* len will be -1. Otherwise, window shrank
* after we sent into it. If window shrank to 0,
* cancel pending retransmit, pull snd_nxt back
* to (closed) window, and set the persist timer
* if it isn't already going. If the window didn't
* close completely, just wait for an ACK.
*
* If we have a pending FIN, either it has already been
* transmitted or it is outside the window, so drop it.
* If the FIN has been transmitted, but this is not a
* retransmission, then len must be -1. Therefore we also
* prevent here the sending of `gratuitous FINs'. This
* eliminates the need to check for that case below (e.g.
* to back up snd_nxt before the FIN so that the sequence
* number is correct).
*/
len = 0;
flags &= ~TH_FIN;
if (win == 0) {
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rxtshift = 0;
tp->snd_nxt = tp->snd_una;
if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
tcp_setpersist(tp);
}
}
/*
* Automatic sizing enables the performance of large buffers
* and most of the efficiency of small ones by only allocating
* space when it is needed.
*
* The criteria to step up the send buffer one notch are:
* 1. receive window of remote host is larger than send buffer
* (with a fudge factor of 5/4th);
* 2. send buffer is filled to 7/8th with data (so we actually
* have data to make use of it);
* 3. send buffer fill has not hit maximal automatic size;
* 4. our send window (slow start and congestion controlled) is
* larger than sent but unacknowledged data in send buffer.
*
* The remote host receive window scaling factor may limit the
* growing of the send buffer before it reaches its allowed
* maximum.
*
* It scales directly with slow start or congestion window
* and does at most one step per received ACK. This fast
* scaling has the drawback of growing the send buffer beyond
* what is strictly necessary to make full use of a given
* delay*bandwidth product. However testing has shown this not
* to be much of a problem. At worst we are trading wasting
* of available bandwidth (the non-use of it) for wasting some
* socket buffer memory.
*
* TODO: Shrink send buffer during idle periods together
* with congestion window. Requires another timer.
*/
if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
so->so_snd.sb_cc < tcp_autosndbuf_max &&
win >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
if (!sbreserve(&so->so_snd,
uimin(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
tcp_autosndbuf_max), so))
so->so_snd.sb_flags &= ~SB_AUTOSIZE;
}
}
if (len > txsegsize) {
if (use_tso) {
/*
* Truncate TSO transfers to IP_MAXPACKET, and make
* sure that we send equal size transfers down the
* stack (rather than big-small-big-small-...).
*/
#ifdef INET6
CTASSERT(IPV6_MAXPACKET == IP_MAXPACKET);
#endif
len = (uimin(len, IP_MAXPACKET) / txsegsize) * txsegsize;
if (len <= txsegsize) {
use_tso = 0;
}
} else
len = txsegsize;
flags &= ~TH_FIN;
sendalot = 1;
} else
use_tso = 0;
if (sack_rxmit) {
if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
flags &= ~TH_FIN;
}
win = sbspace(&so->so_rcv);
/*
* Sender silly window avoidance. If the connection is idle
* and we can send all the data, a full maximum segment, or
* at least a maximum default-size segment, then send it;
* likewise if we are forced. Otherwise don't bother.
* If peer's buffer is tiny, then send
* when window is at least half open.
* If retransmitting (possibly after persist timer forced us
* to send into a small window), then must resend.
*/
if (len) {
if (len >= txsegsize)
goto send;
if ((so->so_state & SS_MORETOCOME) == 0 &&
((idle || tp->t_flags & TF_NODELAY) &&
len + off >= so->so_snd.sb_cc))
goto send;
if (tp->t_force)
goto send;
if (len >= tp->max_sndwnd / 2)
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max))
goto send;
if (sack_rxmit)
goto send;
}
/*
* Compare available window to amount of window known to peer
* (as advertised window less next expected input). If the
* difference is at least twice the size of the largest segment
* we expect to receive (i.e. two segments) or at least 50% of
* the maximum possible window, then we want to send a window update
* to peer.
*/
if (win > 0) {
/*
* "adv" is the amount we can increase the window,
* taking into account that we are limited by
* TCP_MAXWIN << tp->rcv_scale.
*/
long recwin = uimin(win, (long)TCP_MAXWIN << tp->rcv_scale);
long oldwin, adv;
/*
* rcv_nxt may overtake rcv_adv when we accept a
* zero-window probe.
*/
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
oldwin = tp->rcv_adv - tp->rcv_nxt;
else
oldwin = 0;
/*
* If the new window size ends up being the same as or
* less than the old size when it is scaled, then
* don't force a window update.
*/
if (recwin >> tp->rcv_scale <= oldwin >> tp->rcv_scale)
goto dontupdate;
adv = recwin - oldwin;
if (adv >= (long) (2 * rxsegsize))
goto send;
if (2 * adv >= (long) so->so_rcv.sb_hiwat)
goto send;
}
dontupdate:
/*
* Send if we owe peer an ACK.
*/
if (tp->t_flags & TF_ACKNOW)
goto send;
if (flags & (TH_SYN|TH_FIN|TH_RST))
goto send;
if (SEQ_GT(tp->snd_up, tp->snd_una))
goto send;
/*
* In SACK, it is possible for tcp_output to fail to send a segment
* after the retransmission timer has been turned off. Make sure
* that the retransmission timer is set.
*/
if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) &&
!TCP_TIMER_ISARMED(tp, TCPT_REXMT) &&
!TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
goto just_return;
}
/*
* TCP window updates are not reliable, rather a polling protocol
* using ``persist'' packets is used to ensure receipt of window
* updates. The three ``states'' for the output side are:
* idle not doing retransmits or persists
* persisting to move a small or zero window
* (re)transmitting and thereby not persisting
*
* tp->t_timer[TCPT_PERSIST]
* is set when we are in persist state.
* tp->t_force
* is set when we are called to send a persist packet.
* tp->t_timer[TCPT_REXMT]
* is set when we are retransmitting
* The output side is idle when both timers are zero.
*
* If send window is too small, there is data to transmit, and no
* retransmit or persist is pending, then go to persist state.
* If nothing happens soon, send when timer expires:
* if window is nonzero, transmit what we can,
* otherwise force out a byte.
*/
if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
/*
* No reason to send a segment, just return.
*/
just_return:
TCP_REASS_UNLOCK(tp);
return 0;
send:
/*
* Before ESTABLISHED, force sending of initial options unless TCP is
* set not to do any options.
*
* Note: we assume that the IP/TCP header plus TCP options always fit
* in a single mbuf, leaving room for a maximum link header, i.e.:
* max_linkhdr + IP_header + TCP_header + optlen <= MCLBYTES
*/
optlen = 0;
optp = opt;
switch (af) {
case AF_INET:
iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
break;
#ifdef INET6
case AF_INET6:
iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
break;
#endif
default: /*pacify gcc*/
iphdrlen = 0;
break;
}
hdrlen = iphdrlen;
if (flags & TH_SYN) {
struct rtentry *synrt;
synrt = inpcb_rtentry(tp->t_inpcb);
tp->snd_nxt = tp->iss;
tp->t_ourmss = tcp_mss_to_advertise(synrt != NULL ? synrt->rt_ifp : NULL, af);
inpcb_rtentry_unref(synrt, tp->t_inpcb);
if ((tp->t_flags & TF_NOOPT) == 0 && OPT_FITS(TCPOLEN_MAXSEG)) {
*optp++ = TCPOPT_MAXSEG;
*optp++ = TCPOLEN_MAXSEG;
*optp++ = (tp->t_ourmss >> 8) & 0xff;
*optp++ = tp->t_ourmss & 0xff;
optlen += TCPOLEN_MAXSEG;
if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 ||
(tp->t_flags & TF_RCVD_SCALE)) &&
OPT_FITS(TCPOLEN_WINDOW + TCPOLEN_NOP)) {
*((uint32_t *)optp) = htonl(
TCPOPT_NOP << 24 |
TCPOPT_WINDOW << 16 |
TCPOLEN_WINDOW << 8 |
tp->request_r_scale);
optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
}
if (tcp_do_sack && OPT_FITS(TCPOLEN_SACK_PERMITTED)) {
*optp++ = TCPOPT_SACK_PERMITTED;
*optp++ = TCPOLEN_SACK_PERMITTED;
optlen += TCPOLEN_SACK_PERMITTED;
}
}
}
/*
* Send a timestamp and echo-reply if this is a SYN and our side
* wants to use timestamps (TF_REQ_TSTMP is set) or both our side
* and our peer have sent timestamps in our SYN's.
*/
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
(flags & TH_RST) == 0 &&
((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
(tp->t_flags & TF_RCVD_TSTMP))) {
int alen = 0;
while (optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
alen++;
}
if (OPT_FITS(TCPOLEN_TIMESTAMP)) {
*optp++ = TCPOPT_TIMESTAMP;
*optp++ = TCPOLEN_TIMESTAMP;
uint32_t *lp = (uint32_t *)optp;
/* Form timestamp option (appendix A of RFC 1323) */
*lp++ = htonl(TCP_TIMESTAMP(tp));
*lp = htonl(tp->ts_recent);
optp += TCPOLEN_TIMESTAMP - 2;
optlen += TCPOLEN_TIMESTAMP;
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
(so->so_rcv.sb_flags & SB_AUTOSIZE))
tp->rfbuf_ts = TCP_TIMESTAMP(tp);
} else {
optp -= alen;
optlen -= alen;
}
}
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE) {
/*
* Initialize TCP-MD5 option (RFC2385)
*/
if (!OPT_FITS(TCPOLEN_SIGNATURE))
goto reset;
*optp++ = TCPOPT_SIGNATURE;
*optp++ = TCPOLEN_SIGNATURE;
sigoff = optlen + 2;
memset(optp, 0, TCP_SIGLEN);
optlen += TCPOLEN_SIGNATURE;
optp += TCP_SIGLEN;
}
#endif
/*
* Tack on the SACK block if it is necessary.
*/
if (sack_numblks) {
int alen = 0;
int sack_len = sack_numblks * 8;
while (optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
alen++;
}
if (OPT_FITS(sack_len + 2)) {
struct ipqent *tiqe;
*optp++ = TCPOPT_SACK;
*optp++ = sack_len + 2;
uint32_t *lp = (uint32_t *)optp;
if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
sack_numblks--;
*lp++ = htonl(tp->rcv_dsack_block.left);
*lp++ = htonl(tp->rcv_dsack_block.right);
tp->rcv_sack_flags &= ~TCPSACK_HAVED;
}
for (tiqe = TAILQ_FIRST(&tp->timeq);
sack_numblks > 0;
tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) {
KASSERT(tiqe != NULL);
sack_numblks--;
*lp++ = htonl(tiqe->ipqe_seq);
*lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len +
((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0));
}
optlen += sack_len + 2;
optp += sack_len;
} else {
optp -= alen;
optlen -= alen;
}
}
/* Terminate and pad TCP options to a 4 byte boundary. */
if (optlen % 4) {
if (!OPT_FITS(TCPOLEN_EOL)) {
reset:
TCP_REASS_UNLOCK(tp);
error = ECONNABORTED;
goto out;
}
optlen += TCPOLEN_EOL;
*optp++ = TCPOPT_EOL;
}
/*
* According to RFC 793 (STD0007):
* "The content of the header beyond the End-of-Option option
* must be header padding (i.e., zero)."
* and later: "The padding is composed of zeros."
*/
while (optlen % 4) {
if (!OPT_FITS(TCPOLEN_PAD))
goto reset;
optlen += TCPOLEN_PAD;
*optp++ = TCPOPT_PAD;
}
TCP_REASS_UNLOCK(tp);
hdrlen += optlen;
#ifdef DIAGNOSTIC
if (!use_tso && len > txsegsize)
panic("tcp data to be sent is larger than segment");
else if (use_tso && len > IP_MAXPACKET)
panic("tcp data to be sent is larger than max TSO size");
if (max_linkhdr + hdrlen > MCLBYTES)
panic("tcphdr too big");
#endif
/*
* Grab a header mbuf, attaching a copy of data to
* be transmitted, and initialize the header from
* the template for sends on this connection.
*/
if (len) {
error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m);
if (error)
goto out;
/*
* If we're sending everything we've got, set PUSH.
* (This will keep happy those implementations which only
* give data to the user when a buffer fills or
* a PUSH comes in.)
*/
if (off + len == so->so_snd.sb_cc)
flags |= TH_PUSH;
} else {
tcps = TCP_STAT_GETREF();
if (tp->t_flags & TF_ACKNOW)
tcps[TCP_STAT_SNDACKS]++;
else if (flags & (TH_SYN|TH_FIN|TH_RST))
tcps[TCP_STAT_SNDCTRL]++;
else if (SEQ_GT(tp->snd_up, tp->snd_una))
tcps[TCP_STAT_SNDURG]++;
else
tcps[TCP_STAT_SNDWINUP]++;
TCP_STAT_PUTREF();
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
m = NULL;
}
}
if (m == NULL) {
error = ENOBUFS;
goto out;
}
MCLAIM(m, &tcp_tx_mowner);
m->m_data += max_linkhdr;
m->m_len = hdrlen;
}
m_reset_rcvif(m);
switch (af) {
case AF_INET:
ip = mtod(m, struct ip *);
#ifdef INET6
ip6 = NULL;
#endif
th = (struct tcphdr *)(ip + 1);
break;
#ifdef INET6
case AF_INET6:
ip = NULL;
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
break;
#endif
default: /*pacify gcc*/
ip = NULL;
#ifdef INET6
ip6 = NULL;
#endif
th = NULL;
break;
}
if (tp->t_template == NULL)
panic("%s: no template", __func__);
if (tp->t_template->m_len < iphdrlen)
panic("%s: %d < %d", __func__, tp->t_template->m_len, iphdrlen);
bcopy(mtod(tp->t_template, void *), mtod(m, void *), iphdrlen);
/*
* If we are starting a connection, send ECN setup
* SYN packet. If we are on a retransmit, we may
* resend those bits a number of times as per
* RFC 3168.
*/
if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) {
if (tp->t_flags & TF_SYN_REXMT) {
if (tp->t_ecn_retries--)
flags |= TH_ECE|TH_CWR;
} else {
flags |= TH_ECE|TH_CWR;
tp->t_ecn_retries = tcp_ecn_maxretries;
}
}
if (TCP_ECN_ALLOWED(tp)) {
/*
* If the peer has ECN, mark data packets
* ECN capable. Ignore pure ack packets, retransmissions
* and window probes.
*/
if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
!(tp->t_force && len == 1)) {
ecn_tos = IPTOS_ECN_ECT0;
TCP_STATINC(TCP_STAT_ECN_ECT);
}
/*
* Reply with proper ECN notifications.
*/
if (tp->t_flags & TF_ECN_SND_CWR) {
flags |= TH_CWR;
tp->t_flags &= ~TF_ECN_SND_CWR;
}
if (tp->t_flags & TF_ECN_SND_ECE) {
flags |= TH_ECE;
}
}
/*
* If we are doing retransmissions, then snd_nxt will
* not reflect the first unsent octet. For ACK only
* packets, we do not want the sequence number of the
* retransmitted packet, we want the sequence number
* of the next unsent octet. So, if there is no data
* (and no SYN or FIN), use snd_max instead of snd_nxt
* when filling in ti_seq. But if we are in persist
* state, snd_max might reflect one byte beyond the
* right edge of the window, so use snd_nxt in that
* case, since we know we aren't doing a retransmission.
* (retransmit and persist are mutually exclusive...)
*/
if (TCP_SACK_ENABLED(tp) && sack_rxmit) {
th->th_seq = htonl(p->rxmit);
p->rxmit += len;
} else {
if (len || (flags & (TH_SYN|TH_FIN)) ||
TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
}
th->th_ack = htonl(tp->rcv_nxt);
if (optlen) {
memcpy(th + 1, opt, optlen);
th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
}
th->th_flags = flags;
/*
* Calculate receive window. Don't shrink window,
* but avoid silly window syndrome.
*/
if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize)
win = 0;
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
if (th->th_win == 0) {
tp->t_sndzerowin++;
}
if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
u_int32_t urp = tp->snd_up - tp->snd_nxt;
if (urp > IP_MAXPACKET)
urp = IP_MAXPACKET;
th->th_urp = htons((u_int16_t)urp);
th->th_flags |= TH_URG;
} else
/*
* If no urgent pointer to send, then we pull
* the urgent pointer to the left edge of the send window
* so that it doesn't drift into the send window on sequence
* number wraparound.
*/
tp->snd_up = tp->snd_una; /* drag it along */
#ifdef TCP_SIGNATURE
if (sigoff && (tp->t_flags & TF_SIGNATURE)) {
struct secasvar *sav;
u_int8_t *sigp;
sav = tcp_signature_getsav(m);
if (sav == NULL) {
if (m)
m_freem(m);
return EPERM;
}
m->m_pkthdr.len = hdrlen + len;
sigp = (char *)th + sizeof(*th) + sigoff;
tcp_signature(m, th, (char *)th - mtod(m, char *), sav, sigp);
key_sa_recordxfer(sav, m);
KEY_SA_UNREF(&sav);
}
#endif
/*
* Set ourselves up to be checksummed just before the packet
* hits the wire.
*/
switch (af) {
case AF_INET:
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
if (use_tso) {
m->m_pkthdr.segsz = txsegsize;
m->m_pkthdr.csum_flags = M_CSUM_TSOv4;
} else {
m->m_pkthdr.csum_flags = M_CSUM_TCPv4;
if (len + optlen) {
/* Fixup the pseudo-header checksum. */
/* XXXJRT Not IP Jumbogram safe. */
th->th_sum = in_cksum_addword(th->th_sum,
htons((u_int16_t) (len + optlen)));
}
}
break;
#ifdef INET6
case AF_INET6:
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
if (use_tso) {
m->m_pkthdr.segsz = txsegsize;
m->m_pkthdr.csum_flags = M_CSUM_TSOv6;
} else {
m->m_pkthdr.csum_flags = M_CSUM_TCPv6;
if (len + optlen) {
/* Fixup the pseudo-header checksum. */
/* XXXJRT: Not IPv6 Jumbogram safe. */
th->th_sum = in_cksum_addword(th->th_sum,
htons((u_int16_t) (len + optlen)));
}
}
break;
#endif
}
/*
* In transmit state, time the transmission and arrange for
* the retransmit. In persist state, just set snd_max.
*/
if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
tcp_seq startseq = tp->snd_nxt;
/*
* Advance snd_nxt over sequence space of this segment.
* There are no states in which we send both a SYN and a FIN,
* so we collapse the tests for these flags.
*/
if (flags & (TH_SYN|TH_FIN))
tp->snd_nxt++;
if (sack_rxmit)
goto timer;
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
/*
* Time this transmission if not a retransmission and
* not currently timing anything.
*/
if (tp->t_rtttime == 0) {
tp->t_rtttime = tcp_now;
tp->t_rtseq = startseq;
TCP_STATINC(TCP_STAT_SEGSTIMED);
}
}
/*
* Set retransmit timer if not currently set,
* and not doing an ack or a keep-alive probe.
* Initial value for retransmit timer is smoothed
* round-trip time + 2 * round-trip time variance.
* Initialize shift counter which is used for backoff
* of retransmit time.
*/
timer:
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) {
if ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
tp->snd_nxt != tp->snd_una) {
if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
TCP_TIMER_DISARM(tp, TCPT_PERSIST);
tp->t_rxtshift = 0;
}
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
} else if (len == 0 && so->so_snd.sb_cc > 0 &&
TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
/*
* If we are sending a window probe and there's
* unacked data in the socket, make sure at
* least the persist timer is running.
*/
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
}
} else
if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
tp->snd_max = tp->snd_nxt + len;
#ifdef TCP_DEBUG
/*
* Trace.
*/
if (so->so_options & SO_DEBUG)
tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0);
#endif
/*
* Fill in IP length and desired time to live and
* send to IP level. There should be a better way
* to handle ttl and tos; we could keep them in
* the template, but need a way to checksum without them.
*/
m->m_pkthdr.len = hdrlen + len;
switch (af) {
case AF_INET:
ip->ip_len = htons(m->m_pkthdr.len);
packetlen = m->m_pkthdr.len;
if (tp->t_inpcb->inp_af == AF_INET) {
ip->ip_ttl = in4p_ip(tp->t_inpcb).ip_ttl;
ip->ip_tos = in4p_ip(tp->t_inpcb).ip_tos | ecn_tos;
}
#ifdef INET6
else if (tp->t_inpcb->inp_af == AF_INET6) {
ip->ip_ttl = in6pcb_selecthlim(tp->t_inpcb, NULL); /*XXX*/
ip->ip_tos = ecn_tos; /*XXX*/
}
#endif
break;
#ifdef INET6
case AF_INET6:
packetlen = m->m_pkthdr.len;
ip6->ip6_nxt = IPPROTO_TCP;
if (tp->t_family == AF_INET6) {
/*
* we separately set hoplimit for every segment, since
* the user might want to change the value via
* setsockopt. Also, desired default hop limit might
* be changed via Neighbor Discovery.
*/
ip6->ip6_hlim = in6pcb_selecthlim_rt(tp->t_inpcb);
}
ip6->ip6_flow |= htonl(ecn_tos << 20);
/* ip6->ip6_flow = ??? (from template) */
/* ip6_plen will be filled in ip6_output(). */
break;
#endif
default: /*pacify gcc*/
packetlen = 0;
break;
}
switch (af) {
case AF_INET:
{
struct mbuf *opts;
if (tp->t_inpcb->inp_af == AF_INET)
opts = tp->t_inpcb->inp_options;
else
opts = NULL;
error = ip_output(m, opts, ro,
(tp->t_mtudisc ? IP_MTUDISC : 0) |
(so->so_options & SO_DONTROUTE), NULL, tp->t_inpcb);
break;
}
#ifdef INET6
case AF_INET6:
{
struct ip6_pktopts *opts;
if (tp->t_inpcb->inp_af == AF_INET6)
opts = in6p_outputopts(tp->t_inpcb);
else
opts = NULL;
error = ip6_output(m, opts, ro, so->so_options & SO_DONTROUTE,
NULL, tp->t_inpcb, NULL);
break;
}
#endif
default:
error = EAFNOSUPPORT;
break;
}
if (error) {
out:
if (error == ENOBUFS) {
TCP_STATINC(TCP_STAT_SELFQUENCH);
tcp_quench(tp->t_inpcb);
error = 0;
} else if ((error == EHOSTUNREACH || error == ENETDOWN ||
error == EHOSTDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_softerror = error;
error = 0;
}
/* Back out the sequence number advance. */
if (sack_rxmit)
p->rxmit -= len;
/* Restart the delayed ACK timer, if necessary. */
if (tp->t_flags & TF_DELACK)
TCP_RESTART_DELACK(tp);
return error;
}
if (packetlen > tp->t_pmtud_mtu_sent)
tp->t_pmtud_mtu_sent = packetlen;
tcps = TCP_STAT_GETREF();
tcps[TCP_STAT_SNDTOTAL]++;
if (tp->t_flags & TF_DELACK)
tcps[TCP_STAT_DELACK]++;
TCP_STAT_PUTREF();
/*
* Data sent (as far as we can tell).
* If this advertises a larger window than any other segment,
* then remember the size of the advertised window.
* Any pending ACK has now been sent.
*/
if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + win;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_flags &= ~TF_ACKNOW;
TCP_CLEAR_DELACK(tp);
#ifdef DIAGNOSTIC
if (maxburst < 0)
printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
#endif
if (sendalot && (tp->t_congctl == &tcp_reno_ctl || --maxburst))
goto again;
return 0;
}
void
tcp_setpersist(struct tcpcb *tp)
{
int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2);
int nticks;
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
panic("tcp_output REXMT");
/*
* Start/restart the persistence timer.
*/
if (t < tp->t_rttmin)
t = tp->t_rttmin;
TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
TCPTV_PERSMIN, TCPTV_PERSMAX);
TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
}
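/*
 * Worked example (illustrative numbers only): with t_srtt == 24 and
 * t_rttvar == 16 in their fixed-point units, t = ((24 >> 2) + 16) >> 3
 * == 2, clipped to at least t_rttmin; the persist timer is then armed
 * for t * tcp_backoff[t_rxtshift] ticks, clamped to the
 * [TCPTV_PERSMIN, TCPTV_PERSMAX] range.
 */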
/* $NetBSD: kern_module_hook.c,v 1.4 2019/12/13 08:02:53 skrll Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel module support.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_module_hook.c,v 1.4 2019/12/13 08:02:53 skrll Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/module_hook.h>
#include <sys/mutex.h>
#include <sys/pserialize.h>
#include <uvm/uvm_extern.h>
/* Locking/synchronization stuff for module hooks */
static struct {
kmutex_t mtx;
kcondvar_t cv;
pserialize_t psz;
} module_hook __cacheline_aligned;
/*
* We use pserialize_perform() to issue a memory barrier on the current
* CPU and on all other CPUs so that all prior memory operations on the
* current CPU globally happen before all subsequent memory operations
* on the current CPU, as perceived by any other CPU.
*
* pserialize_perform() might be rather heavy-weight here, but it only
* happens during module loading, and it allows MODULE_HOOK_CALL() to
* work without any other memory barriers.
*/
void
module_hook_set(bool *hooked, struct localcount *lc)
{
KASSERT(kernconfig_is_held());
KASSERT(!*hooked);
localcount_init(lc);
/* Wait until setup has been witnessed by all CPUs. */
pserialize_perform(module_hook.psz);
/* Let others use it */
atomic_store_relaxed(hooked, true);
}
void
module_hook_unset(bool *hooked, struct localcount *lc)
{
KASSERT(kernconfig_is_held());
KASSERT(*hooked);
/* Get exclusive with pserialize and localcount. */
mutex_enter(&module_hook.mtx);
/* Prevent new calls to module_hook_tryenter(). */
atomic_store_relaxed(hooked, false);
/* Wait for existing calls to module_hook_tryenter(). */
pserialize_perform(module_hook.psz);
/* Wait for module_hook_exit. */
localcount_drain(lc, &module_hook.cv, &module_hook.mtx);
/* All done! */
mutex_exit(&module_hook.mtx);
localcount_fini(lc);
}
bool
module_hook_tryenter(bool *hooked, struct localcount *lc)
{
bool call_hook;
int s;
s = pserialize_read_enter();
call_hook = atomic_load_relaxed(hooked);
if (call_hook)
localcount_acquire(lc);
pserialize_read_exit(s);
return call_hook;
}
void
module_hook_exit(struct localcount *lc)
{
localcount_release(lc, &module_hook.cv, &module_hook.mtx);
}
void
module_hook_init(void)
{
mutex_init(&module_hook.mtx, MUTEX_DEFAULT, IPL_NONE);
cv_init(&module_hook.cv, "mod_hook");
module_hook.psz = pserialize_create();
}
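/*
 * Typical caller-side pattern (a sketch only; real call sites go through
 * the MODULE_HOOK_CALL() machinery in sys/module_hook.h, and the names
 * foo_hook_set, foo_hook_lc and foo_hook_fn below are hypothetical):
 *
 *	if (module_hook_tryenter(&foo_hook_set, &foo_hook_lc)) {
 *		error = (*foo_hook_fn)(args);
 *		module_hook_exit(&foo_hook_lc);
 *	} else
 *		error = ENOSYS;		(hook module not loaded)
 *
 * module_hook_set()/module_hook_unset() are called by the hook-providing
 * module at load and unload time, under the kernel config lock.
 */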
/* $NetBSD: sched_4bsd.c,v 1.46 2022/10/26 23:24:09 riastradh Exp $ */
/*
* Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and
* Daniel Sieger.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.46 2022/10/26 23:24:09 riastradh Exp $");
#include "opt_ddb.h"
#include "opt_lockdebug.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/lockdebug.h>
#include <sys/intr.h>
#include <sys/atomic.h>
static void updatepri(struct lwp *);
static void resetpriority(struct lwp *);
/* Number of hardclock ticks per sched_tick() */
u_int sched_rrticks __read_mostly;
/*
* Force switch among equal priority processes every 100ms.
* Called from hardclock every hz/10 == sched_rrticks hardclock ticks.
*/
/* ARGSUSED */
void
sched_tick(struct cpu_info *ci)
{
struct schedstate_percpu *spc = &ci->ci_schedstate;
pri_t pri = PRI_NONE;
lwp_t *l;
spc->spc_ticks = sched_rrticks;
if (CURCPU_IDLE_P()) {
spc_lock(ci);
sched_resched_cpu(ci, MAXPRI_KTHREAD, true);
/* spc now unlocked */
return;
}
l = ci->ci_onproc;
if (l == NULL) {
return;
}
/*
* Can only be spc_lwplock or a turnstile lock at this point
* (if we interrupted priority inheritance trylock dance).
*/
KASSERT(l->l_mutex != spc->spc_mutex);
switch (l->l_class) {
case SCHED_FIFO:
/* No timeslicing for FIFO jobs. */
break;
case SCHED_RR:
/* Force it into mi_switch() to look for other jobs to run. */
pri = MAXPRI_KERNEL_RT;
break;
default:
if (spc->spc_flags & SPCF_SHOULDYIELD) {
/*
* Process is stuck in kernel somewhere, probably
* due to buggy or inefficient code. Force a
* kernel preemption.
*/
pri = MAXPRI_KERNEL_RT;
} else if (spc->spc_flags & SPCF_SEENRR) {
/*
* The process has already been through a roundrobin
* without switching and may be hogging the CPU.
* Indicate that the process should yield.
*/
pri = MAXPRI_KTHREAD;
spc->spc_flags |= SPCF_SHOULDYIELD;
} else if ((spc->spc_flags & SPCF_1STCLASS) == 0) {
/*
* For SMT or asymmetric systems push a little
* harder: if this is not a 1st class CPU, try to
* find a better one to run this LWP.
*/
pri = MAXPRI_KTHREAD;
spc->spc_flags |= SPCF_SHOULDYIELD;
} else {
spc->spc_flags |= SPCF_SEENRR;
}
break;
}
if (pri != PRI_NONE) {
spc_lock(ci);
sched_resched_cpu(ci, pri, true);
/* spc now unlocked */
}
}
/*
* Why PRIO_MAX - 2? From setpriority(2):
*
* prio is a value in the range -20 to 20. The default priority is
* 0; lower priorities cause more favorable scheduling. A value of
* 19 or 20 will schedule a process only when nothing at priority <=
* 0 is runnable.
*
* This gives estcpu influence over 18 priority levels, and leaves nice
* with 40 levels. One way to think about it is that nice has 20 levels
* either side of estcpu's 18.
*/
#define ESTCPU_SHIFT 11
#define ESTCPU_MAX ((PRIO_MAX - 2) << ESTCPU_SHIFT)
#define ESTCPU_ACCUM (1 << (ESTCPU_SHIFT - 1))
#define ESTCPULIM(e) uimin((e), ESTCPU_MAX)
/*
* The main parameter used by this algorithm is 'l_estcpu'. It is an estimate
* of the recent CPU utilization of the thread.
*
* l_estcpu is:
* - increased each time the hardclock ticks and the thread is found to
* be executing, in sched_schedclock() called from hardclock()
* - decreased (filtered) on each sched tick, in sched_pstats_hook()
* If the lwp is sleeping for more than a second, we don't touch l_estcpu: it
* will be updated in sched_setrunnable() when the lwp wakes up, in burst mode
* (ie, we decrease it n times).
*
* Note that hardclock updates l_estcpu and l_cpticks independently.
*
* -----------------------------------------------------------------------------
*
* Here we describe how l_estcpu is decreased.
*
* Constants for digital decay (filter):
* 90% of l_estcpu usage in (5 * loadavg) seconds
*
* We wish to decay away 90% of l_estcpu in (5 * loadavg) seconds. That is, we
* want to compute a value of decay such that the following loop:
* for (i = 0; i < (5 * loadavg); i++)
* l_estcpu *= decay;
* will result in
* l_estcpu *= 0.1;
* for all values of loadavg.
*
* Mathematically this loop can be expressed by saying:
* decay ** (5 * loadavg) ~= .1
*
* And finally, the corresponding value of decay we're using is:
* decay = (2 * loadavg) / (2 * loadavg + 1)
*
* -----------------------------------------------------------------------------
*
* Now, let's prove that the value of decay stated above will always fulfill
* the equation:
* decay ** (5 * loadavg) ~= .1
*
* If we compute b as:
* b = 2 * loadavg
* then
* decay = b / (b + 1)
*
* We now need to prove two things:
* 1) Given [factor ** (5 * loadavg) =~ .1], prove [factor == b/(b+1)].
* 2) Given [b/(b+1) ** power =~ .1], prove [power == (5 * loadavg)].
*
* Facts:
* * For x real: exp(x) = x**0/0! + x**1/1! + x**2/2! + ...
* Therefore, for x close to zero, exp(x) =~ 1 + x.
* In turn, for b large enough, exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
*
* * For b large enough, (b-1)/b =~ b/(b+1).
*
* * For x belonging to [-1;1[, ln(1-x) = - x - x**2/2 - x**3/3 - ...
* Therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
*
* * ln(0.1) =~ -2.30
*
* Proof of (1):
* factor ** (5 * loadavg) =~ 0.1
* => ln(factor) =~ -2.30 / (5 * loadavg)
* => factor =~ exp(-1 / ((5 / 2.30) * loadavg))
* =~ exp(-1 / (2 * loadavg))
* =~ exp(-1 / b)
* =~ (b - 1) / b
* =~ b / (b + 1)
* =~ (2 * loadavg) / ((2 * loadavg) + 1)
*
* Proof of (2):
* (b / (b + 1)) ** power =~ .1
* => power * ln(b / (b + 1)) =~ -2.30
* => power * (-1 / (b + 1)) =~ -2.30
* => power =~ 2.30 * (b + 1)
* => power =~ 4.60 * loadavg + 2.30
* => power =~ 5 * loadavg
*
* Conclusion: decay = (2 * loadavg) / (2 * loadavg + 1)
*/
/* See calculations above */
#define loadfactor(loadavg) (2 * (loadavg))
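/*
 * Illustrative sketch (not compiled into the kernel): a standalone userland
 * program that numerically checks the claim above, i.e. that with
 * decay = (2 * loadavg) / (2 * loadavg + 1), applying the filter
 * 5 * loadavg times leaves roughly 10% of the original value.  The loadavg
 * samples below are arbitrary.
 */
#if 0
#include <stdio.h>
#include <math.h>

int
main(void)
{
	const double loadavg[] = { 1.0, 2.0, 5.0, 10.0 };

	for (size_t i = 0; i < sizeof(loadavg) / sizeof(loadavg[0]); i++) {
		double b = 2.0 * loadavg[i];
		double decay = b / (b + 1.0);
		double left = pow(decay, 5.0 * loadavg[i]);

		printf("loadavg %5.1f: decay^(5*loadavg) = %.4f (expect ~0.1)\n",
		    loadavg[i], left);
	}
	return 0;
}
#endif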
static fixpt_t
decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
{
if (estcpu == 0) {
return 0;
}
#if !defined(_LP64)
/* avoid 64bit arithmetics. */
#define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
return estcpu * loadfac / (loadfac + FSCALE);
}
#endif
return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
}
static fixpt_t
decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
{
/*
* For all load averages >= 1 and max l_estcpu of (255 << ESTCPU_SHIFT),
* if we slept for at least seven times the loadfactor, we will decay
* l_estcpu to less than (1 << ESTCPU_SHIFT), and therefore we can
* return zero directly.
*
* Note that our ESTCPU_MAX is actually much smaller than
* (255 << ESTCPU_SHIFT).
*/
if ((n << FSHIFT) >= 7 * loadfac) {
return 0;
}
while (estcpu != 0 && n > 1) {
estcpu = decay_cpu(loadfac, estcpu);
n--;
}
return estcpu;
}
/*
* sched_pstats_hook:
*
* Periodically called from sched_pstats(); used to recalculate priorities.
*/
void
sched_pstats_hook(struct lwp *l, int batch)
{
fixpt_t loadfac;
/*
* If the LWP has slept an entire second, stop recalculating
* its priority until it wakes up.
*/
KASSERT(lwp_locked(l, NULL));
if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
l->l_stat == LSSUSPENDED) {
if (l->l_slptime > 1) {
return;
}
}
loadfac = loadfactor(averunnable.ldavg[0]);
l->l_estcpu = decay_cpu(loadfac, l->l_estcpu);
resetpriority(l);
}
/*
* Recalculate the priority of an LWP after it has slept for a while.
*/
static void
updatepri(struct lwp *l)
{
fixpt_t loadfac;
KASSERT(lwp_locked(l, NULL));
KASSERT(l->l_slptime > 1);
loadfac = loadfactor(averunnable.ldavg[0]);
l->l_slptime--; /* the first time was done in sched_pstats */
l->l_estcpu = decay_cpu_batch(loadfac, l->l_estcpu, l->l_slptime);
resetpriority(l);
}
void
sched_rqinit(void)
{
}
void
sched_setrunnable(struct lwp *l)
{
if (l->l_slptime > 1)
updatepri(l);
}
void
sched_nice(struct proc *p, int n)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
p->p_nice = n;
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
lwp_lock(l);
resetpriority(l);
lwp_unlock(l);
}
}
/*
* Recompute the priority of an LWP. Arrange to reschedule if
* the resulting priority is better than that of the current LWP.
*/
static void
resetpriority(struct lwp *l)
{
pri_t pri;
struct proc *p = l->l_proc;
KASSERT(lwp_locked(l, NULL));
if (l->l_class != SCHED_OTHER)
return;
/* See comments above ESTCPU_SHIFT definition. */
pri = (PRI_KERNEL - 1) - (l->l_estcpu >> ESTCPU_SHIFT) - p->p_nice;
pri = imax(pri, 0);
if (pri != l->l_priority)
lwp_changepri(l, pri);
}
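/*
 * Illustrative sketch (not compiled into the kernel): the user-priority
 * formula from resetpriority() above as a pure function.  PRI_KERNEL's
 * numeric value is version/configuration dependent, so it is taken as a
 * parameter here rather than assumed; estcpu is in the same ESTCPU_SHIFT
 * fixed-point scale used above.
 */
#if 0
static int
example_user_priority(int pri_kernel, u_int estcpu, int nice)
{
	int pri;

	/* More accumulated CPU time or a higher nice value lowers priority. */
	pri = (pri_kernel - 1) - (int)(estcpu >> ESTCPU_SHIFT) - nice;
	return pri > 0 ? pri : 0;
}
#endif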
/*
* We adjust the priority of the current LWP. The priority of a LWP
* gets worse as it accumulates CPU time. The CPU usage estimator (l_estcpu)
* is increased here. The formula for computing priorities will compute a
* different value each time l_estcpu increases. This can cause a switch,
* but unless the priority crosses a PPQ boundary the actual queue will not
* change. The CPU usage estimator ramps up quite quickly when the process
* is running (linearly), and decays away exponentially, at a rate which is
* proportionally slower when the system is busy. The basic principle is
* that the system will 90% forget that the process used a lot of CPU time
* in (5 * loadavg) seconds. This causes the system to favor processes which
* haven't run much recently, and to round-robin among other processes.
*/
void
sched_schedclock(struct lwp *l)
{
if (l->l_class != SCHED_OTHER)
return;
KASSERT(!CURCPU_IDLE_P());
l->l_estcpu = ESTCPULIM(l->l_estcpu + ESTCPU_ACCUM);
lwp_lock(l);
resetpriority(l);
lwp_unlock(l);
}
/*
* sched_proc_fork:
*
* Inherit the parent's scheduler history.
*/
void
sched_proc_fork(struct proc *parent, struct proc *child)
{
lwp_t *pl;
KASSERT(mutex_owned(parent->p_lock));
pl = LIST_FIRST(&parent->p_lwps);
child->p_estcpu_inherited = pl->l_estcpu;
child->p_forktime = sched_pstats_ticks;
}
/*
* sched_proc_exit:
*
* Chargeback parents for the sins of their children.
*/
void
sched_proc_exit(struct proc *parent, struct proc *child)
{
fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
fixpt_t estcpu;
lwp_t *pl, *cl;
/* XXX Only if parent != init?? */
mutex_enter(parent->p_lock);
pl = LIST_FIRST(&parent->p_lwps);
cl = LIST_FIRST(&child->p_lwps);
estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
sched_pstats_ticks - child->p_forktime);
if (cl->l_estcpu > estcpu) {
lwp_lock(pl);
pl->l_estcpu = ESTCPULIM(pl->l_estcpu + cl->l_estcpu - estcpu);
lwp_unlock(pl);
}
mutex_exit(parent->p_lock);
}
void
sched_wakeup(struct lwp *l)
{
}
void
sched_slept(struct lwp *l)
{
}
void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{
l2->l_estcpu = l1->l_estcpu;
}
void
sched_lwp_collect(struct lwp *t)
{
lwp_t *l;
/* Absorb estcpu value of collected LWP. */
l = curlwp;
lwp_lock(l);
l->l_estcpu += t->l_estcpu;
lwp_unlock(l);
}
void
sched_oncpu(lwp_t *l)
{
}
void
sched_newts(lwp_t *l)
{
}
/*
* Sysctl nodes and initialization.
*/
static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
struct sysctlnode node;
int rttsms = hztoms(sched_rrticks);
node = *rnode;
node.sysctl_data = &rttsms;
return sysctl_lookup(SYSCTLFN_CALL(&node));
}
SYSCTL_SETUP(sysctl_sched_4bsd_setup, "sysctl sched setup")
{
const struct sysctlnode *node = NULL;
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sched",
SYSCTL_DESCR("Scheduler options"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
if (node == NULL)
return;
sched_rrticks = hz / 10;
sysctl_createv(NULL, 0, &node, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "name", NULL,
NULL, 0, __UNCONST("4.4BSD"), 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(NULL, 0, &node, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "rtts",
SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
sysctl_sched_rtts, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
}
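/*
 * Illustrative sketch (userland, not part of the kernel): reading the nodes
 * created above with sysctlbyname(3).  The MIB names "kern.sched.name" and
 * "kern.sched.rtts" follow from the sysctl_createv() calls; error handling
 * is kept minimal.
 */
#if 0
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	char name[32];
	int rtts;
	size_t len;

	len = sizeof(name);
	if (sysctlbyname("kern.sched.name", name, &len, NULL, 0) == 0)
		printf("scheduler: %s\n", name);

	len = sizeof(rtts);
	if (sysctlbyname("kern.sched.rtts", &rtts, &len, NULL, 0) == 0)
		printf("round-robin time quantum: %d ms\n", rtts);

	return 0;
}
#endif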
/* $NetBSD: uvm_aobj.c,v 1.157 2023/02/24 11:03:13 riastradh Exp $ */
/*
* Copyright (c) 1998 Chuck Silvers, Charles D. Cranor and
* Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_aobj.c,v 1.1.2.5 1998/02/06 05:14:38 chs Exp
*/
/*
* uvm_aobj.c: anonymous memory uvm_object pager
*
* author: Chuck Silvers <chuq@chuq.com>
* started: Jan-1998
*
* - design mostly from Chuck Cranor
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_aobj.c,v 1.157 2023/02/24 11:03:13 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_uvmhist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_page_array.h>
/*
* An anonymous UVM object (aobj) manages anonymous-memory. In addition to
* keeping the list of resident pages, it may also keep a list of allocated
* swap blocks. Depending on the size of the object, this list is either
* stored in an array (small objects) or in a hash table (large objects).
*
* Lock order
*
* uao_list_lock ->
* uvm_object::vmobjlock
*/
/*
* Note: for hash tables, we break the address space of the aobj into blocks
* of UAO_SWHASH_CLUSTER_SIZE pages, which shall be a power of two.
*/
#define UAO_SWHASH_CLUSTER_SHIFT 4
#define UAO_SWHASH_CLUSTER_SIZE (1 << UAO_SWHASH_CLUSTER_SHIFT)
/* Get the "tag" for this page index. */
#define UAO_SWHASH_ELT_TAG(idx) ((idx) >> UAO_SWHASH_CLUSTER_SHIFT)
#define UAO_SWHASH_ELT_PAGESLOT_IDX(idx) \
((idx) & (UAO_SWHASH_CLUSTER_SIZE - 1))
/* Given an ELT and a page index, find the swap slot. */
#define UAO_SWHASH_ELT_PAGESLOT(elt, idx) \
((elt)->slots[UAO_SWHASH_ELT_PAGESLOT_IDX(idx)])
/* Given an ELT, return its pageidx base. */
#define UAO_SWHASH_ELT_PAGEIDX_BASE(ELT) \
((elt)->tag << UAO_SWHASH_CLUSTER_SHIFT)
/* The hash function. */
#define UAO_SWHASH_HASH(aobj, idx) \
(&(aobj)->u_swhash[(((idx) >> UAO_SWHASH_CLUSTER_SHIFT) \
& (aobj)->u_swhashmask)])
/*
* The threshold which determines whether we will use an array or a
* hash table to store the list of allocated swap blocks.
*/
#define UAO_SWHASH_THRESHOLD (UAO_SWHASH_CLUSTER_SIZE * 4)
#define UAO_USES_SWHASH(aobj) \
((aobj)->u_pages > UAO_SWHASH_THRESHOLD)
/* The number of buckets in a hash, with an upper bound. */
#define UAO_SWHASH_MAXBUCKETS 256
#define UAO_SWHASH_BUCKETS(aobj) \
(MIN((aobj)->u_pages >> UAO_SWHASH_CLUSTER_SHIFT, UAO_SWHASH_MAXBUCKETS))
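/*
 * Illustrative sketch (not compiled into the kernel): how a page index
 * decomposes under the macros above.  With UAO_SWHASH_CLUSTER_SHIFT == 4
 * each hash element covers 16 pages; the sample page index is arbitrary.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	int pageidx = 0x123;
	int tag = pageidx >> 4;		/* cf. UAO_SWHASH_ELT_TAG */
	int slotidx = pageidx & 15;	/* cf. UAO_SWHASH_ELT_PAGESLOT_IDX */

	/* Prints: pageidx 0x123 -> tag 0x12, slot index 3 */
	printf("pageidx %#x -> tag %#x, slot index %d\n",
	    pageidx, tag, slotidx);
	return 0;
}
#endif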
/*
* uao_swhash_elt: when a hash table is being used, this structure defines
* the format of an entry in the bucket list.
*/
struct uao_swhash_elt {
LIST_ENTRY(uao_swhash_elt) list; /* the hash list */
voff_t tag; /* our 'tag' */
int count; /* our number of active slots */
int slots[UAO_SWHASH_CLUSTER_SIZE]; /* the slots */
};
/*
* uao_swhash: the swap hash table structure
*/
LIST_HEAD(uao_swhash, uao_swhash_elt);
/*
* uao_swhash_elt_pool: pool of uao_swhash_elt structures.
* Note: pages for this pool must not come from a pageable kernel map.
*/
static struct pool uao_swhash_elt_pool __cacheline_aligned;
/*
* uvm_aobj: the actual anon-backed uvm_object
*
* => the uvm_object is at the top of the structure, this allows
* (struct uvm_aobj *) == (struct uvm_object *)
* => only one of u_swslots and u_swhash is used in any given aobj
*/
struct uvm_aobj {
struct uvm_object u_obj; /* has: lock, pgops, #pages, #refs */
pgoff_t u_pages; /* number of pages in entire object */
int u_flags; /* the flags (see uvm_aobj.h) */
int *u_swslots; /* array of offset->swapslot mappings */
/*
* hashtable of offset->swapslot mappings
* (u_swhash is an array of bucket heads)
*/
struct uao_swhash *u_swhash;
u_long u_swhashmask; /* mask for hashtable */
LIST_ENTRY(uvm_aobj) u_list; /* global list of aobjs */
int u_freelist; /* freelist to allocate pages from */
};
static void uao_free(struct uvm_aobj *);
static int uao_get(struct uvm_object *, voff_t, struct vm_page **,
int *, int, vm_prot_t, int, int);
static int uao_put(struct uvm_object *, voff_t, voff_t, int);
#if defined(VMSWAP)
static struct uao_swhash_elt *uao_find_swhash_elt
(struct uvm_aobj *, int, bool);
static bool uao_pagein(struct uvm_aobj *, int, int);
static bool uao_pagein_page(struct uvm_aobj *, int);
#endif /* defined(VMSWAP) */
static struct vm_page *uao_pagealloc(struct uvm_object *, voff_t, int);
/*
* aobj_pager
*
* note that some functions (e.g. put) are handled elsewhere
*/
const struct uvm_pagerops aobj_pager = {
.pgo_reference = uao_reference,
.pgo_detach = uao_detach,
.pgo_get = uao_get,
.pgo_put = uao_put,
};
/*
* uao_list: global list of active aobjs, locked by uao_list_lock
*/
static LIST_HEAD(aobjlist, uvm_aobj) uao_list __cacheline_aligned;
static kmutex_t uao_list_lock __cacheline_aligned;
/*
* hash table/array related functions
*/
#if defined(VMSWAP)
/*
* uao_find_swhash_elt: find (or create) a hash table entry for a page
* offset.
*
* => the object should be locked by the caller
*/
static struct uao_swhash_elt *
uao_find_swhash_elt(struct uvm_aobj *aobj, int pageidx, bool create)
{
struct uao_swhash *swhash;
struct uao_swhash_elt *elt;
voff_t page_tag;
swhash = UAO_SWHASH_HASH(aobj, pageidx);
page_tag = UAO_SWHASH_ELT_TAG(pageidx);
/*
* now search the bucket for the requested tag
*/
LIST_FOREACH(elt, swhash, list) {
if (elt->tag == page_tag) {
return elt;
}
}
if (!create) {
return NULL;
}
/*
* allocate a new entry for the bucket and init/insert it in
*/
elt = pool_get(&uao_swhash_elt_pool, PR_NOWAIT);
if (elt == NULL) {
return NULL;
}
LIST_INSERT_HEAD(swhash, elt, list);
elt->tag = page_tag;
elt->count = 0;
memset(elt->slots, 0, sizeof(elt->slots));
return elt;
}
/*
* uao_find_swslot: find the swap slot number for an aobj/pageidx
*
* => object must be locked by caller
*/
int
uao_find_swslot(struct uvm_object *uobj, int pageidx)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uao_swhash_elt *elt;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
/*
* if noswap flag is set, then we never return a slot
*/
if (aobj->u_flags & UAO_FLAG_NOSWAP)
return 0;
/*
* if hashing, look in hash table.
*/
if (UAO_USES_SWHASH(aobj)) {
elt = uao_find_swhash_elt(aobj, pageidx, false);
return elt ? UAO_SWHASH_ELT_PAGESLOT(elt, pageidx) : 0;
}
/*
* otherwise, look in the array
*/
return aobj->u_swslots[pageidx];
}
/*
* uao_set_swslot: set the swap slot for a page in an aobj.
*
* => setting a slot to zero frees the slot
* => object must be locked by caller
* => we return the old slot number, or -1 if we failed to allocate
* memory to record the new slot number
*/
int
uao_set_swslot(struct uvm_object *uobj, int pageidx, int slot)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uao_swhash_elt *elt;
int oldslot;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, "aobj %#jx pageidx %jd slot %jd",
(uintptr_t)aobj, pageidx, slot, 0);
KASSERT(rw_write_held(uobj->vmobjlock) || uobj->uo_refs == 0);
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
/*
* if noswap flag is set, then we can't set a non-zero slot.
*/
if (aobj->u_flags & UAO_FLAG_NOSWAP) {
KASSERTMSG(slot == 0, "uao_set_swslot: no swap object");
return 0;
}
/*
* are we using a hash table? if so, add it in the hash.
*/
if (UAO_USES_SWHASH(aobj)) {
/*
* Avoid allocating an entry just to free it again if
* the page had no swap slot in the first place, and
* we are freeing.
*/
elt = uao_find_swhash_elt(aobj, pageidx, slot != 0);
if (elt == NULL) {
return slot ? -1 : 0;
}
oldslot = UAO_SWHASH_ELT_PAGESLOT(elt, pageidx);
UAO_SWHASH_ELT_PAGESLOT(elt, pageidx) = slot;
/*
* now adjust the elt's reference counter and free it if we've
* dropped it to zero.
*/
if (slot) {
if (oldslot == 0)
elt->count++;
} else {
if (oldslot)
elt->count--;
if (elt->count == 0) {
LIST_REMOVE(elt, list);
pool_put(&uao_swhash_elt_pool, elt);
}
}
} else {
/* we are using an array */
oldslot = aobj->u_swslots[pageidx];
aobj->u_swslots[pageidx] = slot;
}
return oldslot;
}
#endif /* defined(VMSWAP) */
/*
* end of hash/array functions
*/
/*
* uao_free: free all resources held by an aobj, and then free the aobj
*
* => the aobj should be dead
*/
static void
uao_free(struct uvm_aobj *aobj)
{
struct uvm_object *uobj = &aobj->u_obj;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
uao_dropswap_range(uobj, 0, 0);
rw_exit(uobj->vmobjlock);
#if defined(VMSWAP)
if (UAO_USES_SWHASH(aobj)) {
/*
* free the hash table itself.
*/
hashdone(aobj->u_swhash, HASH_LIST, aobj->u_swhashmask);
} else {
/*
* free the array itself.
*/
kmem_free(aobj->u_swslots, aobj->u_pages * sizeof(int));
}
#endif /* defined(VMSWAP) */
/*
* finally free the aobj itself
*/
uvm_obj_destroy(uobj, true);
kmem_free(aobj, sizeof(struct uvm_aobj));
}
/*
* pager functions
*/
/*
* uao_create: create an aobj of the given size and return its uvm_object.
*
* => for normal use, flags are always zero
* => for the kernel object, the flags are:
* UAO_FLAG_KERNOBJ - allocate the kernel object (can only happen once)
* UAO_FLAG_KERNSWAP - enable swapping of kernel object (" ")
*/
struct uvm_object *
uao_create(voff_t size, int flags)
{
static struct uvm_aobj kernel_object_store;
static krwlock_t bootstrap_kernel_object_lock;
static int kobj_alloced __diagused = 0;
pgoff_t pages = round_page((uint64_t)size) >> PAGE_SHIFT;
struct uvm_aobj *aobj;
int refs;
/*
* Allocate a new aobj, unless kernel object is requested.
*/
if (flags & UAO_FLAG_KERNOBJ) {
KASSERT(!kobj_alloced);
aobj = &kernel_object_store;
aobj->u_pages = pages;
aobj->u_flags = UAO_FLAG_NOSWAP;
refs = UVM_OBJ_KERN;
kobj_alloced = UAO_FLAG_KERNOBJ;
} else if (flags & UAO_FLAG_KERNSWAP) {
KASSERT(kobj_alloced == UAO_FLAG_KERNOBJ);
aobj = &kernel_object_store;
kobj_alloced = UAO_FLAG_KERNSWAP;
refs = 0xdeadbeaf; /* XXX: gcc */
} else {
aobj = kmem_alloc(sizeof(struct uvm_aobj), KM_SLEEP);
aobj->u_pages = pages;
aobj->u_flags = 0;
refs = 1;
}
/*
* no freelist by default
*/
aobj->u_freelist = VM_NFREELIST;
/*
* allocate hash/array if necessary
*
* note: in the KERNSWAP case there is no need to worry about locking;
* since we are still booting, we should be the only thread around.
*/
const int kernswap = (flags & UAO_FLAG_KERNSWAP) != 0;
if (flags == 0 || kernswap) {
#if defined(VMSWAP)
/* allocate hash table or array depending on object size */
if (UAO_USES_SWHASH(aobj)) {
aobj->u_swhash = hashinit(UAO_SWHASH_BUCKETS(aobj),
HASH_LIST, true, &aobj->u_swhashmask);
} else {
aobj->u_swslots = kmem_zalloc(pages * sizeof(int),
KM_SLEEP);
}
#endif /* defined(VMSWAP) */
/*
* Replace kernel_object's temporary static lock with
* a regular rw_obj. We cannot use uvm_obj_setlock()
* because that would try to free the old lock.
*/
if (kernswap) {
aobj->u_obj.vmobjlock = rw_obj_alloc();
rw_destroy(&bootstrap_kernel_object_lock);
}
if (flags) {
aobj->u_flags &= ~UAO_FLAG_NOSWAP; /* clear noswap */
return &aobj->u_obj;
}
}
/*
* Initialise UVM object.
*/
const bool kernobj = (flags & UAO_FLAG_KERNOBJ) != 0;
uvm_obj_init(&aobj->u_obj, &aobj_pager, !kernobj, refs);
if (__predict_false(kernobj)) {
/* Use a temporary static lock for kernel_object. */
rw_init(&bootstrap_kernel_object_lock);
uvm_obj_setlock(&aobj->u_obj, &bootstrap_kernel_object_lock);
}
/*
* now that aobj is ready, add it to the global list
*/
mutex_enter(&uao_list_lock);
LIST_INSERT_HEAD(&uao_list, aobj, u_list);
mutex_exit(&uao_list_lock);
return(&aobj->u_obj);
}
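/*
 * Illustrative sketch (not compiled): typical in-kernel use of uao_create()
 * above for a regular (non-kernel) aobj.  The function name is made up for
 * illustration; the caller owns the initial reference and releases it with
 * uao_detach(), which frees the object and its pages on the last reference.
 */
#if 0
static void
example_aobj_use(void)
{
	struct uvm_object *uobj;

	/* One megabyte of anonymous, swap-backed memory; flags are zero. */
	uobj = uao_create(1024 * 1024, 0);

	/* ... map it or fault pages in through the object as needed ... */

	/* Drop the reference; with no other holders, the aobj is destroyed. */
	uao_detach(uobj);
}
#endif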
/*
* uao_set_pgfl: allocate pages only from the specified freelist.
*
* => must be called before any pages are allocated for the object.
* => reset by setting it to VM_NFREELIST, meaning any freelist.
*/
void
uao_set_pgfl(struct uvm_object *uobj, int freelist)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
KASSERTMSG((0 <= freelist), "invalid freelist %d", freelist);
KASSERTMSG((freelist <= VM_NFREELIST), "invalid freelist %d",
freelist);
aobj->u_freelist = freelist;
}
/*
* uao_pagealloc: allocate a page for aobj.
*/
static inline struct vm_page *
uao_pagealloc(struct uvm_object *uobj, voff_t offset, int flags)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
if (__predict_true(aobj->u_freelist == VM_NFREELIST))
return uvm_pagealloc(uobj, offset, NULL, flags);
else
return uvm_pagealloc_strat(uobj, offset, NULL, flags,
UVM_PGA_STRAT_ONLY, aobj->u_freelist);
}
/*
* uao_init: set up aobj pager subsystem
*
* => called at boot time from uvm_pager_init()
*/
void
uao_init(void)
{
static int uao_initialized;
if (uao_initialized)
return;
uao_initialized = true;
LIST_INIT(&uao_list);
mutex_init(&uao_list_lock, MUTEX_DEFAULT, IPL_NONE);
pool_init(&uao_swhash_elt_pool, sizeof(struct uao_swhash_elt),
0, 0, 0, "uaoeltpl", NULL, IPL_VM);
}
/*
* uao_reference: hold a reference to an anonymous UVM object.
*/
void
uao_reference(struct uvm_object *uobj)
{
/* Kernel object is persistent. */
if (UVM_OBJ_IS_KERN_OBJECT(uobj)) {
return;
}
atomic_inc_uint(&uobj->uo_refs);
}
/*
* uao_detach: drop a reference to an anonymous UVM object.
*/
void
uao_detach(struct uvm_object *uobj)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uvm_page_array a;
struct vm_page *pg;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* Detaching from kernel object is a NOP.
*/
if (UVM_OBJ_IS_KERN_OBJECT(uobj))
return;
/*
* Drop the reference. If it was the last one, destroy the object.
*/
KASSERT(uobj->uo_refs > 0);
UVMHIST_LOG(maphist," (uobj=%#jx) ref=%jd",
(uintptr_t)uobj, uobj->uo_refs, 0, 0);
membar_release();
if (atomic_dec_uint_nv(&uobj->uo_refs) > 0) {
UVMHIST_LOG(maphist, "<- done (rc>0)", 0,0,0,0);
return;
}
membar_acquire();
/*
* Remove the aobj from the global list.
*/
mutex_enter(&uao_list_lock);
LIST_REMOVE(aobj, u_list);
mutex_exit(&uao_list_lock);
/*
* Free all the pages left in the aobj. For each page, when the
* page is no longer busy (and thus after any disk I/O that it is
* involved in is complete), release any swap resources and free
* the page itself.
*/
uvm_page_array_init(&a, uobj, 0);
rw_enter(uobj->vmobjlock, RW_WRITER);
while ((pg = uvm_page_array_fill_and_peek(&a, 0, 0)) != NULL) {
uvm_page_array_advance(&a);
pmap_page_protect(pg, VM_PROT_NONE);
if (pg->flags & PG_BUSY) {
uvm_pagewait(pg, uobj->vmobjlock, "uao_det");
uvm_page_array_clear(&a);
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT);
uvm_pagefree(pg);
}
uvm_page_array_fini(&a);
/*
* Finally, free the anonymous UVM object itself.
*/
uao_free(aobj);
}
/*
* uao_put: flush pages out of a uvm object
*
* => object should be locked by caller. we may _unlock_ the object
* if (and only if) we need to clean a page (PGO_CLEANIT).
* XXXJRT Currently, however, we don't. In the case of cleaning
* XXXJRT a page, we simply just deactivate it. Should probably
* XXXJRT handle this better, in the future (although "flushing"
* XXXJRT anonymous memory isn't terribly important).
* => if PGO_CLEANIT is not set, then we will neither unlock the object
* nor block.
* => if PGO_ALLPAGE is set, then all pages in the object are valid targets
* for flushing.
* => we return 0 unless we encountered some sort of I/O error
* XXXJRT currently never happens, as we never directly initiate
* XXXJRT I/O
*/
static int
uao_put(struct uvm_object *uobj, voff_t start, voff_t stop, int flags)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uvm_page_array a;
struct vm_page *pg;
voff_t curoff;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
if (flags & PGO_ALLPAGES) {
start = 0;
stop = aobj->u_pages << PAGE_SHIFT;
} else {
start = trunc_page(start);
if (stop == 0) {
stop = aobj->u_pages << PAGE_SHIFT;
} else {
stop = round_page(stop);
}
if (stop > (uint64_t)(aobj->u_pages << PAGE_SHIFT)) {
printf("uao_put: strange, got an out of range "
"flush %#jx > %#jx (fixed)\n",
(uintmax_t)stop,
(uintmax_t)(aobj->u_pages << PAGE_SHIFT));
stop = aobj->u_pages << PAGE_SHIFT;
}
}
UVMHIST_LOG(maphist,
" flush start=%#jx, stop=%#jx, flags=%#jx",
start, stop, flags, 0);
/*
* Don't need to do any work here if we're not freeing
* or deactivating pages.
*/
if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) {
rw_exit(uobj->vmobjlock);
return 0;
}
/* locked: uobj */
uvm_page_array_init(&a, uobj, 0);
curoff = start;
while ((pg = uvm_page_array_fill_and_peek(&a, curoff, 0)) != NULL) {
if (pg->offset >= stop) {
break;
}
/*
* wait and try again if the page is busy.
*/
if (pg->flags & PG_BUSY) {
uvm_pagewait(pg, uobj->vmobjlock, "uao_put");
uvm_page_array_clear(&a);
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
uvm_page_array_advance(&a);
curoff = pg->offset + PAGE_SIZE;
switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
/*
* XXX In these first 3 cases, we always just
* XXX deactivate the page. We may want to
* XXX handle the different cases more specifically
* XXX in the future.
*/
case PGO_CLEANIT|PGO_FREE:
case PGO_CLEANIT|PGO_DEACTIVATE:
case PGO_DEACTIVATE:
deactivate_it:
uvm_pagelock(pg);
uvm_pagedeactivate(pg);
uvm_pageunlock(pg);
break;
case PGO_FREE:
/*
* If there are multiple references to
* the object, just deactivate the page.
*/
if (uobj->uo_refs > 1)
goto deactivate_it;
/*
* free the swap slot and the page.
*/
pmap_page_protect(pg, VM_PROT_NONE);
/*
* freeing swapslot here is not strictly necessary.
* however, leaving it here doesn't save much
* because we need to update swap accounting anyway.
*/
uao_dropswap(uobj, pg->offset >> PAGE_SHIFT);
uvm_pagefree(pg);
break;
default:
panic("%s: impossible", __func__);
}
}
rw_exit(uobj->vmobjlock);
uvm_page_array_fini(&a);
return 0;
}
/*
* uao_get: fetch me a page
*
* we have three cases:
* 1: page is resident -> just return the page.
* 2: page is zero-fill -> allocate a new page and zero it.
* 3: page is swapped out -> fetch the page from swap.
*
* case 1 can be handled with PGO_LOCKED, cases 2 and 3 cannot.
* so, if the "center" page hits case 2/3 then we will need to return EBUSY.
*
* => prefer map unlocked (not required)
* => object must be locked! we will _unlock_ it before starting any I/O.
* => flags: PGO_LOCKED: fault data structures are locked
* => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx]
* => NOTE: caller must check for released pages!!
*/
static int
uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps,
int *npagesp, int centeridx, vm_prot_t access_type, int advice, int flags)
{
voff_t current_offset;
struct vm_page *ptmp;
int lcv, gotpages, maxpages, swslot, pageidx;
bool overwrite = ((flags & PGO_OVERWRITE) != 0);
struct uvm_page_array a;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, "aobj=%#jx offset=%jd, flags=%#jx",
(uintptr_t)uobj, offset, flags,0);
/*
* the object must be locked. it can only be a read lock when
* processing a read fault with PGO_LOCKED.
*/
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_lock_held(uobj->vmobjlock));
KASSERT(rw_write_held(uobj->vmobjlock) ||
((flags & PGO_LOCKED) != 0 && (access_type & VM_PROT_WRITE) == 0));
/*
* get number of pages
*/
maxpages = *npagesp;
/*
* step 1: handle the case where fault data structures are locked.
*/
if (flags & PGO_LOCKED) {
/*
* step 1a: get pages that are already resident. only do
* this if the data structures are locked (i.e. the first
* time through).
*/
uvm_page_array_init(&a, uobj, 0);
gotpages = 0; /* # of pages we got so far */
for (lcv = 0; lcv < maxpages; lcv++) {
ptmp = uvm_page_array_fill_and_peek(&a,
offset + (lcv << PAGE_SHIFT), maxpages);
if (ptmp == NULL) {
break;
}
KASSERT(ptmp->offset >= offset);
lcv = (ptmp->offset - offset) >> PAGE_SHIFT;
if (lcv >= maxpages) {
break;
}
uvm_page_array_advance(&a);
/*
* to be useful must get a non-busy page
*/
if ((ptmp->flags & PG_BUSY) != 0) {
continue;
}
/*
* useful page: plug it in our result array
*/
KASSERT(uvm_pagegetdirty(ptmp) !=
UVM_PAGE_STATUS_CLEAN);
pps[lcv] = ptmp;
gotpages++;
}
uvm_page_array_fini(&a);
/*
* step 1b: now we've either done everything needed or we need
* to unlock and do some waiting or I/O.
*/
UVMHIST_LOG(pdhist, "<- done (done=%jd)",
(pps[centeridx] != NULL), 0,0,0);
*npagesp = gotpages;
return pps[centeridx] != NULL ? 0 : EBUSY;
}
/*
* step 2: get non-resident or busy pages.
* object is locked. data structures are unlocked.
*/
if ((flags & PGO_SYNCIO) == 0) {
goto done;
}
uvm_page_array_init(&a, uobj, 0);
for (lcv = 0, current_offset = offset ; lcv < maxpages ;) {
/*
* we have yet to locate the current page (pps[lcv]). we
* first look for a page that is already at the current offset.
* if we find a page, we check to see if it is busy or
* released. if that is the case, then we sleep on the page
* until it is no longer busy or released and repeat the lookup.
* if the page we found is neither busy nor released, then we
* busy it (so we own it) and plug it into pps[lcv]. we are
* ready to move on to the next page.
*/
ptmp = uvm_page_array_fill_and_peek(&a, current_offset,
maxpages - lcv);
if (ptmp != NULL && ptmp->offset == current_offset) {
/* page is there, see if we need to wait on it */
if ((ptmp->flags & PG_BUSY) != 0) {
UVMHIST_LOG(pdhist,
"sleeping, ptmp->flags %#jx\n",
ptmp->flags,0,0,0);
uvm_pagewait(ptmp, uobj->vmobjlock, "uao_get");
rw_enter(uobj->vmobjlock, RW_WRITER);
uvm_page_array_clear(&a);
continue;
}
/*
* if we get here then the page is resident and
* unbusy. we busy it now (so we own it). if
* overwriting, mark the page dirty up front as
* it will be zapped via an unmanaged mapping.
*/
KASSERT(uvm_pagegetdirty(ptmp) !=
UVM_PAGE_STATUS_CLEAN);
if (overwrite) {
uvm_pagemarkdirty(ptmp, UVM_PAGE_STATUS_DIRTY);
}
/* we own it, caller must un-busy */
ptmp->flags |= PG_BUSY;
UVM_PAGE_OWN(ptmp, "uao_get2");
pps[lcv++] = ptmp;
current_offset += PAGE_SIZE;
uvm_page_array_advance(&a);
continue;
} else {
KASSERT(ptmp == NULL || ptmp->offset > current_offset);
}
/*
* not resident. allocate a new busy/fake/clean page in the
* object. if it's in swap we need to do I/O to fill in the
* data, otherwise the page needs to be cleared: if it's not
* destined to be overwritten, then zero it here and now.
*/
pageidx = current_offset >> PAGE_SHIFT;
swslot = uao_find_swslot(uobj, pageidx);
ptmp = uao_pagealloc(uobj, current_offset,
swslot != 0 || overwrite ? 0 : UVM_PGA_ZERO);
/* out of RAM? */
if (ptmp == NULL) {
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(pdhist, "sleeping, ptmp == NULL",0,0,0,0);
uvm_wait("uao_getpage");
rw_enter(uobj->vmobjlock, RW_WRITER);
uvm_page_array_clear(&a);
continue;
}
/*
* if swslot == 0, page hasn't existed before and is zeroed.
* otherwise we have a "fake/busy/clean" page that we just
* allocated. do the needed "i/o", reading from swap.
*/
if (swslot != 0) {
#if defined(VMSWAP)
int error;
UVMHIST_LOG(pdhist, "pagein from swslot %jd",
swslot, 0,0,0);
/*
* page in the swapped-out page.
* unlock object for i/o, relock when done.
*/
uvm_page_array_clear(&a);
rw_exit(uobj->vmobjlock);
error = uvm_swap_get(ptmp, swslot, PGO_SYNCIO);
rw_enter(uobj->vmobjlock, RW_WRITER);
/*
* I/O done. check for errors.
*/
if (error != 0) {
UVMHIST_LOG(pdhist, "<- done (error=%jd)",
error,0,0,0);
/*
* remove the swap slot from the aobj
* and mark the aobj as having no real slot.
* don't free the swap slot, thus preventing
* it from being used again.
*/
swslot = uao_set_swslot(uobj, pageidx,
SWSLOT_BAD);
if (swslot > 0) {
uvm_swap_markbad(swslot, 1);
}
uvm_pagefree(ptmp);
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(pdhist, "<- done (error)",
error,lcv,0,0);
if (lcv != 0) {
uvm_page_unbusy(pps, lcv);
}
memset(pps, 0, maxpages * sizeof(pps[0]));
uvm_page_array_fini(&a);
return error;
}
#else /* defined(VMSWAP) */
panic("%s: pagein", __func__);
#endif /* defined(VMSWAP) */
}
/*
* note that we will allow the page being writably-mapped
* (!PG_RDONLY) regardless of access_type. if overwrite,
* the page can be modified through an unmanaged mapping
* so mark it dirty up front.
*/
if (overwrite) {
uvm_pagemarkdirty(ptmp, UVM_PAGE_STATUS_DIRTY);
} else {
uvm_pagemarkdirty(ptmp, UVM_PAGE_STATUS_UNKNOWN);
}
/*
* we got the page! clear the fake flag (indicates valid
* data now in page) and plug into our result array. note
* that page is still busy.
*
* it is the callers job to:
* => check if the page is released
* => unbusy the page
* => activate the page
*/
KASSERT(uvm_pagegetdirty(ptmp) != UVM_PAGE_STATUS_CLEAN);
KASSERT((ptmp->flags & PG_FAKE) != 0);
KASSERT(ptmp->offset == current_offset);
ptmp->flags &= ~PG_FAKE;
pps[lcv++] = ptmp;
current_offset += PAGE_SIZE;
}
uvm_page_array_fini(&a);
/*
* finally, unlock object and return.
*/
done:
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(pdhist, "<- done (OK)",0,0,0,0);
return 0;
}
#if defined(VMSWAP)
/*
* uao_dropswap: release any swap resources from this aobj page.
*
* => aobj must be locked or have a reference count of 0.
*/
void
uao_dropswap(struct uvm_object *uobj, int pageidx)
{
int slot;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
slot = uao_set_swslot(uobj, pageidx, 0);
if (slot) {
uvm_swap_free(slot, 1);
}
}
/*
* page in every page in every aobj that is paged-out to a range of swslots.
*
* => nothing should be locked.
* => returns true if pagein was aborted due to lack of memory.
*/
bool
uao_swap_off(int startslot, int endslot)
{
struct uvm_aobj *aobj;
/*
* Walk the list of all anonymous UVM objects. Grab the first.
*/
mutex_enter(&uao_list_lock);
if ((aobj = LIST_FIRST(&uao_list)) == NULL) {
mutex_exit(&uao_list_lock);
return false;
}
uao_reference(&aobj->u_obj);
do {
struct uvm_aobj *nextaobj;
bool rv;
/*
* Prefetch the next object and immediately hold a reference
* on it, so neither the current nor the next entry could
* disappear while we are iterating.
*/
if ((nextaobj = LIST_NEXT(aobj, u_list)) != NULL) {
uao_reference(&nextaobj->u_obj);
}
mutex_exit(&uao_list_lock);
/*
* Page in all pages in the swap slot range.
*/
rw_enter(aobj->u_obj.vmobjlock, RW_WRITER);
rv = uao_pagein(aobj, startslot, endslot);
rw_exit(aobj->u_obj.vmobjlock);
/* Drop the reference of the current object. */
uao_detach(&aobj->u_obj);
if (rv) {
if (nextaobj) {
uao_detach(&nextaobj->u_obj);
}
return rv;
}
aobj = nextaobj;
mutex_enter(&uao_list_lock);
} while (aobj);
mutex_exit(&uao_list_lock);
return false;
}
/*
* page in any pages from aobj in the given range.
*
* => aobj must be locked and is returned locked.
* => returns true if pagein was aborted due to lack of memory.
*/
static bool
uao_pagein(struct uvm_aobj *aobj, int startslot, int endslot)
{
bool rv;
if (UAO_USES_SWHASH(aobj)) {
struct uao_swhash_elt *elt;
int buck;
restart:
for (buck = aobj->u_swhashmask; buck >= 0; buck--) {
for (elt = LIST_FIRST(&aobj->u_swhash[buck]);
elt != NULL;
elt = LIST_NEXT(elt, list)) {
int i;
for (i = 0; i < UAO_SWHASH_CLUSTER_SIZE; i++) {
int slot = elt->slots[i];
/*
* if the slot isn't in range, skip it.
*/
if (slot < startslot ||
slot >= endslot) {
continue;
}
/*
* process the page,
* then start over on this object
* since the swhash elt
* may have been freed.
*/
rv = uao_pagein_page(aobj,
UAO_SWHASH_ELT_PAGEIDX_BASE(elt) + i);
if (rv) {
return rv;
}
goto restart;
}
}
}
} else {
int i;
for (i = 0; i < aobj->u_pages; i++) {
int slot = aobj->u_swslots[i];
/*
* if the slot isn't in range, skip it
*/
if (slot < startslot || slot >= endslot) {
continue;
}
/*
* process the page.
*/
rv = uao_pagein_page(aobj, i);
if (rv) {
return rv;
}
}
}
return false;
}
/*
* uao_pagein_page: page in a single page from an anonymous UVM object.
*
* => Returns true if pagein was aborted due to lack of memory.
* => Object must be locked and is returned locked.
*/
static bool
uao_pagein_page(struct uvm_aobj *aobj, int pageidx)
{
struct uvm_object *uobj = &aobj->u_obj;
struct vm_page *pg;
int rv, npages;
pg = NULL;
npages = 1;
KASSERT(rw_write_held(uobj->vmobjlock));
rv = uao_get(uobj, (voff_t)pageidx << PAGE_SHIFT, &pg, &npages,
0, VM_PROT_READ | VM_PROT_WRITE, 0, PGO_SYNCIO);
/*
* relock and finish up.
*/
rw_enter(uobj->vmobjlock, RW_WRITER);
switch (rv) {
case 0:
break;
case EIO:
case ERESTART:
/*
* nothing more to do on errors.
* ERESTART can only mean that the anon was freed,
* so again there's nothing to do.
*/
return false;
default:
return true;
}
/*
* ok, we've got the page now.
* mark it as dirty, clear its swslot and un-busy it.
*/
uao_dropswap(&aobj->u_obj, pageidx);
/*
* make sure it's on a page queue.
*/
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~(PG_BUSY|PG_FAKE);
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
UVM_PAGE_OWN(pg, NULL);
return false;
}
/*
* uao_dropswap_range: drop swapslots in the range.
*
* => aobj must be locked and is returned locked.
* => start is inclusive. end is exclusive.
*/
void
uao_dropswap_range(struct uvm_object *uobj, voff_t start, voff_t end)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
int swpgonlydelta = 0;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
if (end == 0) {
end = INT64_MAX;
}
if (UAO_USES_SWHASH(aobj)) {
int i, hashbuckets = aobj->u_swhashmask + 1;
voff_t taghi;
voff_t taglo;
taglo = UAO_SWHASH_ELT_TAG(start);
taghi = UAO_SWHASH_ELT_TAG(end);
for (i = 0; i < hashbuckets; i++) {
struct uao_swhash_elt *elt, *next;
for (elt = LIST_FIRST(&aobj->u_swhash[i]);
elt != NULL;
elt = next) {
int startidx, endidx;
int j;
next = LIST_NEXT(elt, list);
if (elt->tag < taglo || taghi < elt->tag) {
continue;
}
if (elt->tag == taglo) {
startidx =
UAO_SWHASH_ELT_PAGESLOT_IDX(start);
} else {
startidx = 0;
}
if (elt->tag == taghi) {
endidx =
UAO_SWHASH_ELT_PAGESLOT_IDX(end);
} else {
endidx = UAO_SWHASH_CLUSTER_SIZE;
}
for (j = startidx; j < endidx; j++) {
int slot = elt->slots[j];
KASSERT(uvm_pagelookup(&aobj->u_obj,
(UAO_SWHASH_ELT_PAGEIDX_BASE(elt)
+ j) << PAGE_SHIFT) == NULL);
if (slot > 0) {
uvm_swap_free(slot, 1);
swpgonlydelta++;
KASSERT(elt->count > 0);
elt->slots[j] = 0;
elt->count--;
}
}
if (elt->count == 0) {
LIST_REMOVE(elt, list);
pool_put(&uao_swhash_elt_pool, elt);
}
}
}
} else {
int i;
if (aobj->u_pages < end) {
end = aobj->u_pages;
}
for (i = start; i < end; i++) {
int slot = aobj->u_swslots[i];
if (slot > 0) {
uvm_swap_free(slot, 1);
swpgonlydelta++;
}
}
}
/*
* adjust the counter of pages only in swap for all
* the swap slots we've freed.
*/
if (swpgonlydelta > 0) {
KASSERT(uvmexp.swpgonly >= swpgonlydelta);
atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
}
}
#endif /* defined(VMSWAP) */
/* $NetBSD: netbsd32_exec_aout.c,v 1.31 2021/01/19 03:20:13 simonb Exp $ */
/* from: NetBSD: exec_aout.c,v 1.15 1996/09/26 23:34:46 cgd Exp */
/*
* Copyright (c) 1998, 2001 Matthew R. Green.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1993, 1994 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: netbsd32_exec_aout.c,v 1.31 2021/01/19 03:20:13 simonb Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/exec.h>
#include <sys/exec_aout.h>
#include <sys/resourcevar.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <compat/netbsd32/netbsd32.h>
#ifndef EXEC_AOUT
#define EXEC_AOUT
#endif
#include <compat/netbsd32/netbsd32_exec.h>
#include <machine/frame.h>
#include <machine/netbsd32_machdep.h>
#ifdef COMPAT_NOMID
static int netbsd32_exec_aout_nomid(struct lwp *, struct exec_package *);
#endif
/*
* exec_netbsd32_makecmds(): Check if it's a netbsd32 a.out format
* executable.
*
* Given a lwp pointer and an exec package pointer, see if the referent
* of the epp is in netbsd32 a.out format. Check 'standard' magic
* numbers for this architecture.
*
* This function, in the former case, or the hook, in the latter, is
* responsible for creating a set of vmcmds which can be used to build
* the process's vm space and inserting them into the exec package.
*/
int
exec_netbsd32_makecmds(struct lwp *l, struct exec_package *epp)
{
netbsd32_u_long midmag, magic;
u_short mid;
int error;
struct netbsd32_exec *execp = epp->ep_hdr;
if (epp->ep_hdrvalid < sizeof(struct netbsd32_exec))
return ENOEXEC;
midmag = (netbsd32_u_long)ntohl(execp->a_midmag);
mid = (midmag >> 16) & 0x3ff;
magic = midmag & 0xffff;
midmag = mid << 16 | magic;
/* this is already needed by setup_stack() */
epp->ep_flags |= EXEC_32;
switch (midmag) {
case (NETBSD32_MID_MACHINE << 16) | ZMAGIC:
error = netbsd32_exec_aout_prep_zmagic(l, epp);
break;
case (NETBSD32_MID_MACHINE << 16) | NMAGIC:
error = netbsd32_exec_aout_prep_nmagic(l, epp);
break;
case (NETBSD32_MID_MACHINE << 16) | OMAGIC:
error = netbsd32_exec_aout_prep_omagic(l, epp);
break;
default:
#ifdef COMPAT_NOMID
error = netbsd32_exec_aout_nomid(l, epp);
#else
error = ENOEXEC;
#endif
break;
}
if (error) {
kill_vmcmds(&epp->ep_vmcmds);
epp->ep_flags &= ~EXEC_32;
} else
epp->ep_flags &= ~EXEC_TOPDOWN_VM;
return error;
}
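/*
 * Illustrative sketch (not compiled): the a_midmag decoding done in
 * exec_netbsd32_makecmds() above, as a standalone userland program.  The
 * 0x3ff machine-id mask and 16-bit magic field mirror the code above;
 * the sample machine id is made up, and 0413 (octal) is ZMAGIC.
 */
#if 0
#include <stdio.h>
#include <arpa/inet.h>	/* ntohl()/htonl() in userland */

int
main(void)
{
	/* Header word as stored on disk (big-endian), with a made-up mid. */
	unsigned int a_midmag = htonl((0x8bU << 16) | 0413);
	unsigned int midmag = ntohl(a_midmag);
	unsigned short mid = (midmag >> 16) & 0x3ff;
	unsigned int magic = midmag & 0xffff;

	printf("mid %#x, magic %#o\n", mid, magic);
	return 0;
}
#endif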
/*
* netbsd32_exec_aout_prep_zmagic(): Prepare a 'native' ZMAGIC binary's
* exec package
*
* First, set up the various offsets/lengths in the exec package.
*
* Then, mark the text image busy (so it can be demand paged) or error
* out if this is not possible. Finally, set up vmcmds for the
* text, data, bss, and stack segments.
*/
int
netbsd32_exec_aout_prep_zmagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
int error;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
error = vn_marktext(epp->ep_vp);
if (error)
return error;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, 0, VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_data,
epp->ep_daddr, epp->ep_vp, execp->a_text,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
if (execp->a_bss > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, execp->a_bss,
epp->ep_daddr + execp->a_data, NULLVP, 0,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
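/*
 * Illustrative sketch (not compiled): the ZMAGIC address layout computed in
 * netbsd32_exec_aout_prep_zmagic() above.  AOUT_LDPGSZ is machine-dependent,
 * so a placeholder value is used, and the header field sizes are made up.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned long ldpgsz = 4096;	/* stand-in for AOUT_LDPGSZ */
	unsigned long a_text = 0x4000, a_data = 0x2000, a_bss = 0x1000;

	unsigned long taddr = ldpgsz;		/* ep_taddr: text base */
	unsigned long daddr = taddr + a_text;	/* ep_daddr: data follows text */
	unsigned long dsize = a_data + a_bss;	/* ep_dsize: data + bss */

	printf("text %#lx(+%#lx)  data %#lx(+%#lx)\n",
	    taddr, a_text, daddr, dsize);
	return 0;
}
#endif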
/*
* netbsd32_exec_aout_prep_nmagic(): Prepare a 'native' NMAGIC binary's
* exec package
*/
int
netbsd32_exec_aout_prep_nmagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
long bsize, baddr;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = roundup(epp->ep_taddr + execp->a_text, AOUT_LDPGSZ);
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, sizeof(struct netbsd32_exec),
VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_data,
epp->ep_daddr, epp->ep_vp, execp->a_text + sizeof(struct netbsd32_exec),
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* netbsd32_exec_aout_prep_omagic(): Prepare a 'native' OMAGIC binary's
* exec package
*/
int
netbsd32_exec_aout_prep_omagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
long dsize, bsize, baddr;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
/* set up command for text and data segments */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn,
execp->a_text + execp->a_data, epp->ep_taddr, epp->ep_vp,
sizeof(struct netbsd32_exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/*
* Make sure (# of pages) mapped above equals (vm_tsize + vm_dsize);
* obreak(2) relies on this fact. Both `vm_tsize' and `vm_dsize' are
* computed (in execve(2)) by rounding *up* `ep_tsize' and `ep_dsize'
* respectively to page boundaries.
* Compensate `ep_dsize' for the amount of data covered by the last
* text page.
*/
dsize = epp->ep_dsize + execp->a_text - roundup(execp->a_text,
PAGE_SIZE);
epp->ep_dsize = (dsize > 0) ? dsize : 0;
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
#ifdef COMPAT_NOMID
/*
* netbsd32_exec_aout_prep_oldzmagic():
* Prepare the vmcmds to build a vmspace for an old ZMAGIC
* binary. [386BSD/BSDI/4.4BSD/NetBSD0.8]
*
* Cloned from exec_aout_prep_zmagic() in kern/exec_aout.c; a more verbose
* description of operation is there.
* There were copies of this in the mac68k, hp300, and i386 ports.
*/
static int
netbsd32_exec_aout_prep_oldzmagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
int error;
epp->ep_taddr = 0;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
error = vn_marktext(epp->ep_vp);
if (error)
return error;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, PAGE_SIZE, /* XXX CLBYTES? */
VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_data,
epp->ep_daddr, epp->ep_vp,
execp->a_text + PAGE_SIZE, /* XXX CLBYTES? */
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
if (execp->a_bss)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, execp->a_bss,
epp->ep_daddr + execp->a_data, NULLVP, 0,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* netbsd32_exec_aout_prep_oldnmagic():
* Prepare the vmcmds to build a vmspace for an old NMAGIC
* binary. [BSDI]
*
* Cloned from exec_aout_prep_nmagic() in kern/exec_aout.c; with text starting
* at 0.
* XXX: There must be a better way to share this code.
*/
static int
netbsd32_exec_aout_prep_oldnmagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
long bsize, baddr;
epp->ep_taddr = 0;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = roundup(epp->ep_taddr + execp->a_text, AOUT_LDPGSZ);
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, sizeof(struct netbsd32_exec),
VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_data,
epp->ep_daddr, epp->ep_vp, execp->a_text + sizeof(struct netbsd32_exec),
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* netbsd32_exec_aout_prep_oldomagic():
* Prepare the vmcmds to build a vmspace for an old OMAGIC
* binary. [BSDI]
*
* Cloned from exec_aout_prep_omagic() in kern/exec_aout.c; with text starting
* at 0.
* XXX: There must be a better way to share this code.
*/
static int
netbsd32_exec_aout_prep_oldomagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
long dsize, bsize, baddr;
epp->ep_taddr = 0;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
/* set up command for text and data segments */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn,
execp->a_text + execp->a_data, epp->ep_taddr, epp->ep_vp,
sizeof(struct netbsd32_exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/*
* Make sure (# of pages) mapped above equals (vm_tsize + vm_dsize);
* obreak(2) relies on this fact. Both `vm_tsize' and `vm_dsize' are
* computed (in execve(2)) by rounding *up* `ep_tsize' and `ep_dsize'
* respectively to page boundaries.
* Compensate `ep_dsize' for the amount of data covered by the last
* text page.
*/
dsize = epp->ep_dsize + execp->a_text - roundup(execp->a_text,
PAGE_SIZE);
epp->ep_dsize = (dsize > 0) ? dsize : 0;
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
static int
netbsd32_exec_aout_nomid(struct lwp *l, struct exec_package *epp)
{
int error;
u_long midmag, magic;
u_short mid;
struct exec *execp = epp->ep_hdr;
/* check on validity of epp->ep_hdr performed by exec_aout_makecmds */
midmag = ntohl(execp->a_midmag);
mid = (midmag >> 16) & 0xffff;
magic = midmag & 0xffff;
if (magic == 0) {
magic = (execp->a_midmag & 0xffff);
mid = MID_ZERO;
}
midmag = mid << 16 | magic;
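/*
 * Worked example (assuming a little-endian host): an old 386BSD ZMAGIC
 * header stores only the bare magic 0413 (0x010b) in host byte order
 * with no machine ID, so ntohl() above yields 0x0b010000 and the low
 * 16 bits come out as zero. The fallback then re-reads the magic in
 * host order and assigns MID_ZERO, giving (MID_ZERO << 16) | ZMAGIC
 * for the switch below.
 */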
switch (midmag) {
case (MID_ZERO << 16) | ZMAGIC:
/*
* 386BSD's ZMAGIC format:
*/
return netbsd32_exec_aout_prep_oldzmagic(l, epp);
break;
case (MID_ZERO << 16) | QMAGIC:
/*
* BSDI's QMAGIC format:
* same as new ZMAGIC format, but with different magic number
*/
return netbsd32_exec_aout_prep_zmagic(l, epp);
break;
case (MID_ZERO << 16) | NMAGIC:
/*
* BSDI's NMAGIC format:
* same as NMAGIC format, but with different magic number
* and with text starting at 0.
*/
return netbsd32_exec_aout_prep_oldnmagic(l, epp);
case (MID_ZERO << 16) | OMAGIC:
/*
* BSDI's OMAGIC format:
* same as OMAGIC format, but with different magic number
* and with text starting at 0.
*/
return netbsd32_exec_aout_prep_oldomagic(l, epp);
default:
return ENOEXEC;
}
return error;
}
#endif
/* $NetBSD: kern_softint.c,v 1.76 2024/03/01 04:32:38 mrg Exp $ */
/*-
* Copyright (c) 2007, 2008, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Generic software interrupt framework.
*
* Overview
*
* The soft interrupt framework provides a mechanism to schedule a
* low priority callback that runs with thread context. It allows
* for dynamic registration of software interrupts, and for fair
* queueing and prioritization of those interrupts. The callbacks
* can be scheduled to run from nearly any point in the kernel: by
* code running with thread context, by code running from a
* hardware interrupt handler, and at any interrupt priority
* level.
*
* Priority levels
*
* Since soft interrupt dispatch can be tied to the underlying
* architecture's interrupt dispatch code, it can be limited
* both by the capabilities of the hardware and the capabilities
* of the interrupt dispatch code itself. The number of priority
* levels is restricted to four. In order of priority (lowest to
* highest) the levels are: clock, bio, net, serial.
*
* The names are symbolic and in isolation do not have any direct
* connection with a particular kind of device activity: they are
* only meant as a guide.
*
* The four priority levels map directly to scheduler priority
* levels, and where the architecture implements 'fast' software
* interrupts, they also map onto interrupt priorities. The
* interrupt priorities are intended to be hidden from machine
* independent code, which should use thread-safe mechanisms to
* synchronize with software interrupts (for example: mutexes).
*
* Capabilities
*
* Software interrupts run with limited machine context. In
* particular, they do not possess any address space context. They
* should not try to operate on user space addresses, or to use
* virtual memory facilities other than those noted as interrupt
* safe.
*
* Unlike hardware interrupts, software interrupts do have thread
* context. They may block on synchronization objects, sleep, and
* resume execution at a later time.
*
* Since software interrupts are a limited resource and run with
* higher priority than most other LWPs in the system, all
* block-and-resume activity by a software interrupt must be kept
* short to allow further processing at that level to continue. By
* extension, code running with process context must take care to
* ensure that any lock that may be taken from a software interrupt
* can not be held for more than a short period of time.
*
* The kernel does not allow software interrupts to use facilities
* or perform actions that may block for a significant amount of
* time. This means that it's not valid for a software interrupt
* to sleep on condition variables or wait for resources to become
* available (for example, memory).
*
* Per-CPU operation
*
* If a soft interrupt is triggered on a CPU, it can only be
* dispatched on the same CPU. Each LWP dedicated to handling a
* soft interrupt is bound to its home CPU, so if the LWP blocks
* and needs to run again, it can only run there. Nearly all data
* structures used to manage software interrupts are per-CPU.
*
* The per-CPU requirement is intended to reduce "ping-pong" of
* cache lines between CPUs: lines occupied by data structures
* used to manage the soft interrupts, and lines occupied by data
* items being passed down to the soft interrupt. As a positive
* side effect, this also means that the soft interrupt dispatch
* code does not need to use spinlocks to synchronize.
*
* Generic implementation
*
* A generic, low performance implementation is provided that
* works across all architectures, with no machine-dependent
* modifications needed. This implementation uses the scheduler,
* and so has a number of restrictions:
*
* 1) The software interrupts are not currently preemptive, so
* must wait for the currently executing LWP to yield the CPU.
* This can introduce latency.
*
* 2) An expensive context switch is required for a software
* interrupt to be handled.
*
* 'Fast' software interrupts
*
* If an architecture defines __HAVE_FAST_SOFTINTS, it implements
* the fast mechanism. Threads running either in the kernel or in
* userspace will be interrupted, but will not be preempted. When
* the soft interrupt completes execution, the interrupted LWP
* is resumed. Interrupt dispatch code must provide the minimum
* level of context necessary for the soft interrupt to block and
* be resumed at a later time. The machine-dependent dispatch
* path looks something like the following:
*
* softintr()
* {
* go to IPL_HIGH if necessary for switch;
* save any necessary registers in a format that can be
* restored by cpu_switchto if the softint blocks;
* arrange for cpu_switchto() to restore into the
* trampoline function;
* identify LWP to handle this interrupt;
* switch to the LWP's stack;
* switch register stacks, if necessary;
* assign new value of curlwp;
* call MI softint_dispatch, passing old curlwp and IPL
* to execute interrupt at;
* switch back to old stack;
* switch back to old register stack, if necessary;
* restore curlwp;
* return to interrupted LWP;
* }
*
* If the soft interrupt blocks, a trampoline function is returned
* to in the context of the interrupted LWP, as arranged for by
* softint():
*
* softint_ret()
* {
* unlock soft interrupt LWP;
* resume interrupt processing, likely returning to
* interrupted LWP or dispatching another, different
* interrupt;
* }
*
* Once the soft interrupt has fired (and even if it has blocked),
* no further soft interrupts at that level will be triggered by
* MI code until the soft interrupt handler has ceased execution.
* If a soft interrupt handler blocks and is resumed, it resumes
* execution as a normal LWP (kthread) and gains VM context. Only
* when it has completed and is ready to fire again will it
* interrupt other threads.
*/
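/*
 * Usage sketch
 *
 *	The following is a minimal, illustrative sketch of how a client
 *	might use this interface; the handler name, softc and handle
 *	are hypothetical and not part of this file.
 *
 *	static void *example_sih;
 *
 *	static void
 *	example_softintr(void *arg)
 *	{
 *		... deferred work: runs with thread context at the
 *		    soft interrupt priority, may block only briefly ...
 *	}
 *
 *	example_sih = softint_establish(SOFTINT_NET | SOFTINT_MPSAFE,
 *	    example_softintr, example_softc);
 *
 *	Work is then deferred, typically from a hardware interrupt
 *	handler (or with preemption otherwise disabled), with:
 *
 *	softint_schedule(example_sih);
 *
 *	and, once the caller commits to issuing no further triggers,
 *	torn down with:
 *
 *	softint_disestablish(example_sih);
 */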
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_softint.c,v 1.76 2024/03/01 04:32:38 mrg Exp $");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/intr.h>
#include <sys/ipi.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/evcnt.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <sys/psref.h>
#include <sys/sdt.h>
#include <uvm/uvm_extern.h>
/* This could overlap with signal info in struct lwp. */
typedef struct softint {
SIMPLEQ_HEAD(, softhand) si_q;
struct lwp *si_lwp;
struct cpu_info *si_cpu;
uintptr_t si_machdep;
struct evcnt si_evcnt;
struct evcnt si_evcnt_block;
volatile int si_active;
int si_ipl;
char si_name[8];
char si_name_block[8+6];
} softint_t;
typedef struct softhand {
SIMPLEQ_ENTRY(softhand) sh_q;
void (*sh_func)(void *);
void *sh_arg;
softint_t *sh_isr;
u_int sh_flags;
u_int sh_ipi_id;
} softhand_t;
typedef struct softcpu {
struct cpu_info *sc_cpu;
softint_t sc_int[SOFTINT_COUNT];
softhand_t sc_hand[1];
} softcpu_t;
static void softint_thread(void *);
u_int softint_bytes = 32768;
u_int softint_timing;
static u_int softint_max;
static kmutex_t softint_lock;
SDT_PROBE_DEFINE4(sdt, kernel, softint, establish,
"void *"/*sih*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE1(sdt, kernel, softint, disestablish,
"void *"/*sih*/);
SDT_PROBE_DEFINE2(sdt, kernel, softint, schedule,
"void *"/*sih*/,
"struct cpu_info *"/*ci*/);
SDT_PROBE_DEFINE4(sdt, kernel, softint, entry,
"void *"/*sih*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE4(sdt, kernel, softint, return,
"void *"/*sih*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
/*
* softint_init_isr:
*
* Initialize a single interrupt level for a single CPU.
*/
static void
softint_init_isr(softcpu_t *sc, const char *desc, pri_t pri, u_int level,
int ipl)
{
struct cpu_info *ci;
softint_t *si;
int error;
si = &sc->sc_int[level];
ci = sc->sc_cpu;
si->si_cpu = ci;
SIMPLEQ_INIT(&si->si_q);
error = kthread_create(pri, KTHREAD_MPSAFE | KTHREAD_INTR |
KTHREAD_IDLE, ci, softint_thread, si, &si->si_lwp,
"soft%s/%u", desc, ci->ci_index);
if (error != 0)
panic("softint_init_isr: error %d", error);
snprintf(si->si_name, sizeof(si->si_name), "%s/%u", desc,
ci->ci_index);
evcnt_attach_dynamic(&si->si_evcnt, EVCNT_TYPE_MISC, NULL,
"softint", si->si_name);
snprintf(si->si_name_block, sizeof(si->si_name_block), "%s block/%u",
desc, ci->ci_index);
evcnt_attach_dynamic(&si->si_evcnt_block, EVCNT_TYPE_MISC, NULL,
"softint", si->si_name_block);
si->si_ipl = ipl;
si->si_lwp->l_private = si;
softint_init_md(si->si_lwp, level, &si->si_machdep);
}
/*
* softint_init:
*
* Initialize per-CPU data structures. Called from mi_cpu_attach().
*/
void
softint_init(struct cpu_info *ci)
{
static struct cpu_info *first;
softcpu_t *sc, *scfirst;
softhand_t *sh, *shmax;
if (first == NULL) {
/* Boot CPU. */
first = ci;
mutex_init(&softint_lock, MUTEX_DEFAULT, IPL_NONE);
softint_bytes = round_page(softint_bytes);
softint_max = (softint_bytes - sizeof(softcpu_t)) /
sizeof(softhand_t);
}
/* Use uvm_km(9) for persistent, page-aligned allocation. */
sc = (softcpu_t *)uvm_km_alloc(kernel_map, softint_bytes, 0,
UVM_KMF_WIRED | UVM_KMF_ZERO);
if (sc == NULL)
panic("softint_init_cpu: cannot allocate memory");
ci->ci_data.cpu_softcpu = sc;
ci->ci_data.cpu_softints = 0;
sc->sc_cpu = ci;
softint_init_isr(sc, "net", PRI_SOFTNET, SOFTINT_NET,
IPL_SOFTNET);
softint_init_isr(sc, "bio", PRI_SOFTBIO, SOFTINT_BIO,
IPL_SOFTBIO);
softint_init_isr(sc, "clk", PRI_SOFTCLOCK, SOFTINT_CLOCK,
IPL_SOFTCLOCK);
softint_init_isr(sc, "ser", PRI_SOFTSERIAL, SOFTINT_SERIAL,
IPL_SOFTSERIAL);
if (first != ci) {
mutex_enter(&softint_lock);
scfirst = first->ci_data.cpu_softcpu;
sh = sc->sc_hand;
memcpy(sh, scfirst->sc_hand, sizeof(*sh) * softint_max);
/* Update pointers for this CPU. */
for (shmax = sh + softint_max; sh < shmax; sh++) {
if (sh->sh_func == NULL)
continue;
sh->sh_isr =
&sc->sc_int[sh->sh_flags & SOFTINT_LVLMASK];
}
mutex_exit(&softint_lock);
}
}
/*
* softint_establish:
*
* Register a software interrupt handler.
*/
void *
softint_establish(u_int flags, void (*func)(void *), void *arg)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
softcpu_t *sc;
softhand_t *sh;
u_int level, index;
u_int ipi_id = 0;
void *sih;
level = (flags & SOFTINT_LVLMASK);
KASSERT(level < SOFTINT_COUNT);
KASSERT((flags & SOFTINT_IMPMASK) == 0);
mutex_enter(&softint_lock);
/* Find a free slot. */
sc = curcpu()->ci_data.cpu_softcpu;
for (index = 1; index < softint_max; index++) {
if (sc->sc_hand[index].sh_func == NULL)
break;
}
if (index == softint_max) {
mutex_exit(&softint_lock);
printf("WARNING: softint_establish: table full, "
"increase softint_bytes\n");
return NULL;
}
sih = (void *)((uint8_t *)&sc->sc_hand[index] - (uint8_t *)sc);
if (flags & SOFTINT_RCPU) {
if ((ipi_id = ipi_register(softint_schedule, sih)) == 0) {
mutex_exit(&softint_lock);
return NULL;
}
}
/* Set up the handler on each CPU. */
if (ncpu < 2) {
/* XXX hack for machines with no CPU_INFO_FOREACH() early on */
sc = curcpu()->ci_data.cpu_softcpu;
sh = &sc->sc_hand[index];
sh->sh_isr = &sc->sc_int[level];
sh->sh_func = func;
sh->sh_arg = arg;
sh->sh_flags = flags;
sh->sh_ipi_id = ipi_id;
} else for (CPU_INFO_FOREACH(cii, ci)) {
sc = ci->ci_data.cpu_softcpu;
sh = &sc->sc_hand[index];
sh->sh_isr = &sc->sc_int[level];
sh->sh_func = func;
sh->sh_arg = arg;
sh->sh_flags = flags;
sh->sh_ipi_id = ipi_id;
}
mutex_exit(&softint_lock);
SDT_PROBE4(sdt, kernel, softint, establish, sih, func, arg, flags);
return sih;
}
/*
* softint_disestablish:
*
* Unregister a software interrupt handler. The soft interrupt could
* still be active at this point, but the caller commits not to try
* and trigger it again once this call is made. The caller must not
* hold any locks that could be taken from soft interrupt context,
* because we will wait for the softint to complete if it's still
* running.
*/
void
softint_disestablish(void *arg)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
softcpu_t *sc;
softhand_t *sh;
uintptr_t offset;
offset = (uintptr_t)arg;
KASSERT(offset != 0);
KASSERTMSG(offset < softint_bytes, "%"PRIuPTR" %u",
offset, softint_bytes);
/*
* Unregister IPI handler if there is any. Note: there is no need
* to disable preemption here - ID is stable.
*/
sc = curcpu()->ci_data.cpu_softcpu;
sh = (softhand_t *)((uint8_t *)sc + offset);
if (sh->sh_ipi_id) {
ipi_unregister(sh->sh_ipi_id);
}
/*
* Run a dummy softint at the same level on all CPUs and wait for
* completion, to make sure this softint is no longer running
* anywhere.
*/
xc_barrier(XC_HIGHPRI_IPL(sh->sh_isr->si_ipl));
/*
* Notify dtrace probe when the old softint can't be running
* any more, but before it can be recycled for a new softint.
*/
SDT_PROBE1(sdt, kernel, softint, disestablish, arg);
/* Clear the handler on each CPU. */
mutex_enter(&softint_lock);
for (CPU_INFO_FOREACH(cii, ci)) {
sc = ci->ci_data.cpu_softcpu;
sh = (softhand_t *)((uint8_t *)sc + offset);
KASSERT(sh->sh_func != NULL);
sh->sh_func = NULL;
}
mutex_exit(&softint_lock);
}
/*
* softint_schedule:
*
* Trigger a software interrupt. Must be called from a hardware
* interrupt handler, or with preemption disabled (since we are
* using the value of curcpu()).
*/
void
softint_schedule(void *arg)
{
softhand_t *sh;
softint_t *si;
uintptr_t offset;
int s;
SDT_PROBE2(sdt, kernel, softint, schedule, arg, /*ci*/NULL);
/*
* If this assert fires, rather than disabling preemption explicitly
* to make it stop, consider that you are probably using a softint
* when you don't need to.
*/
KASSERT(kpreempt_disabled());
/* Find the handler record for this CPU. */
offset = (uintptr_t)arg;
KASSERT(offset != 0);
KASSERTMSG(offset < softint_bytes, "%"PRIuPTR" %u",
offset, softint_bytes);
sh = (softhand_t *)((uint8_t *)curcpu()->ci_data.cpu_softcpu + offset);
/* If it's already pending there's nothing to do. */
if ((sh->sh_flags & SOFTINT_PENDING) != 0) {
return;
}
/*
* Enqueue the handler into the LWP's pending list.
* If the LWP is completely idle, then make it run.
*/
s = splhigh();
if ((sh->sh_flags & SOFTINT_PENDING) == 0) {
si = sh->sh_isr;
sh->sh_flags |= SOFTINT_PENDING;
SIMPLEQ_INSERT_TAIL(&si->si_q, sh, sh_q);
if (si->si_active == 0) {
si->si_active = 1;
softint_trigger(si->si_machdep);
}
}
splx(s);
}
/*
* softint_schedule_cpu:
*
* Trigger a software interrupt on a target CPU. This invokes
* softint_schedule() for the local CPU or sends an IPI to invoke
* this routine on the remote CPU. Preemption must be disabled.
*/
void
softint_schedule_cpu(void *arg, struct cpu_info *ci)
{
KASSERT(kpreempt_disabled());
if (curcpu() != ci) {
const softcpu_t *sc = ci->ci_data.cpu_softcpu;
const uintptr_t offset = (uintptr_t)arg;
const softhand_t *sh;
SDT_PROBE2(sdt, kernel, softint, schedule, arg, ci);
sh = (const softhand_t *)((const uint8_t *)sc + offset);
KASSERT((sh->sh_flags & SOFTINT_RCPU) != 0);
ipi_trigger(sh->sh_ipi_id, ci);
return;
}
/* Just a local CPU. */
softint_schedule(arg);
}
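/*
 * Illustrative sketch (the handle "example_sih" is hypothetical and
 * must have been established with SOFTINT_RCPU for the remote case):
 *
 *	kpreempt_disable();
 *	softint_schedule_cpu(example_sih, ci);
 *	kpreempt_enable();
 */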
/*
* softint_execute:
*
* Invoke handlers for the specified soft interrupt.
* Must be entered at splhigh. Will drop the priority
* to the level specified, but returns back at splhigh.
*/
static inline void
softint_execute(lwp_t *l, int s)
{
softint_t *si = l->l_private;
softhand_t *sh;
KASSERT(si->si_lwp == curlwp);
KASSERT(si->si_cpu == curcpu());
KASSERT(si->si_lwp->l_wchan == NULL);
KASSERT(si->si_active);
KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d",
l, l->l_nopreempt);
/*
* Note: due to priority inheritance we may have interrupted a
* higher priority LWP. Since the soft interrupt must be quick
* and is non-preemptable, we don't bother yielding.
*/
while (!SIMPLEQ_EMPTY(&si->si_q)) {
/*
* Pick the longest waiting handler to run. We block
* interrupts but do not lock in order to do this, as
* we are protecting against the local CPU only.
*/
sh = SIMPLEQ_FIRST(&si->si_q);
SIMPLEQ_REMOVE_HEAD(&si->si_q, sh_q);
KASSERT((sh->sh_flags & SOFTINT_PENDING) != 0);
sh->sh_flags ^= SOFTINT_PENDING;
splx(s);
/* Run the handler. */
SDT_PROBE4(sdt, kernel, softint, entry,
((const char *)sh -
(const char *)curcpu()->ci_data.cpu_softcpu),
sh->sh_func, sh->sh_arg, sh->sh_flags);
if (__predict_true((sh->sh_flags & SOFTINT_MPSAFE) != 0)) {
(*sh->sh_func)(sh->sh_arg);
} else {
KERNEL_LOCK(1, l);
(*sh->sh_func)(sh->sh_arg);
KERNEL_UNLOCK_ONE(l);
}
SDT_PROBE4(sdt, kernel, softint, return,
((const char *)sh -
(const char *)curcpu()->ci_data.cpu_softcpu),
sh->sh_func, sh->sh_arg, sh->sh_flags);
/* Diagnostic: check that spin-locks have not leaked. */
KASSERTMSG(curcpu()->ci_mtx_count == 0,
"%s: ci_mtx_count (%d) != 0, sh_func %p\n",
__func__, curcpu()->ci_mtx_count, sh->sh_func);
/* Diagnostic: check that psrefs have not leaked. */
KASSERTMSG(l->l_psrefs == 0, "%s: l_psrefs=%d, sh_func=%p\n",
__func__, l->l_psrefs, sh->sh_func);
/* Diagnostic: check that biglocks have not leaked. */
KASSERTMSG(l->l_blcnt == 0,
"%s: sh_func=%p leaked %d biglocks",
__func__, sh->sh_func, curlwp->l_blcnt);
/* Diagnostic: check that LWP nopreempt remains zero. */
KASSERTMSG(l->l_nopreempt == 0,
"%s: lwp %p nopreempt %d func %p",
__func__, l, l->l_nopreempt, sh->sh_func);
(void)splhigh();
}
PSREF_DEBUG_BARRIER();
CPU_COUNT(CPU_COUNT_NSOFT, 1);
KASSERT(si->si_cpu == curcpu());
KASSERT(si->si_lwp->l_wchan == NULL);
KASSERT(si->si_active);
si->si_evcnt.ev_count++;
si->si_active = 0;
}
/*
* softint_block:
*
* Update statistics when the soft interrupt blocks.
*/
void
softint_block(lwp_t *l)
{
softint_t *si = l->l_private;
KASSERT((l->l_pflag & LP_INTR) != 0);
si->si_evcnt_block.ev_count++;
}
#ifndef __HAVE_FAST_SOFTINTS
#ifdef __HAVE_PREEMPTION
#error __HAVE_PREEMPTION requires __HAVE_FAST_SOFTINTS
#endif
/*
* softint_init_md:
*
* Slow path: perform machine-dependent initialization.
*/
void
softint_init_md(lwp_t *l, u_int level, uintptr_t *machdep)
{
struct proc *p;
softint_t *si;
*machdep = (1 << level);
si = l->l_private;
p = l->l_proc;
mutex_enter(p->p_lock);
lwp_lock(l);
/* Cheat and make the KASSERT in softint_thread() happy. */
si->si_active = 1;
setrunnable(l);
/* LWP now unlocked */
mutex_exit(p->p_lock);
}
/*
* softint_trigger:
*
* Slow path: cause a soft interrupt handler to begin executing.
* Called at IPL_HIGH.
*/
void
softint_trigger(uintptr_t machdep)
{
struct cpu_info *ci;
lwp_t *l;
ci = curcpu();
ci->ci_data.cpu_softints |= machdep;
l = ci->ci_onproc;
/*
* Arrange for mi_switch() to be called. If called from interrupt
* mode, we don't know if curlwp is executing in kernel or user, so
* post an AST and have it take a trip through userret(). If not in
* interrupt mode, curlwp is running in kernel and will notice the
* resched soon enough; avoid the AST.
*/
if (l == ci->ci_data.cpu_idlelwp) {
atomic_or_uint(&ci->ci_want_resched,
RESCHED_IDLE | RESCHED_UPREEMPT);
} else {
atomic_or_uint(&ci->ci_want_resched, RESCHED_UPREEMPT);
if (cpu_intr_p()) {
cpu_signotify(l);
}
}
}
/*
* softint_thread:
*
* Slow path: MI software interrupt dispatch.
*/
void
softint_thread(void *cookie)
{
softint_t *si;
lwp_t *l;
int s;
l = curlwp;
si = l->l_private;
for (;;) {
/* Clear pending status and run it. */
s = splhigh();
l->l_cpu->ci_data.cpu_softints &= ~si->si_machdep;
softint_execute(l, s);
splx(s);
/* Interrupts allowed to run again before switching. */
lwp_lock(l);
l->l_stat = LSIDL;
spc_lock(l->l_cpu);
mi_switch(l);
}
}
/*
* softint_picklwp:
*
* Slow path: called from mi_switch() to pick the highest priority
* soft interrupt LWP that needs to run.
*/
lwp_t *
softint_picklwp(void)
{
struct cpu_info *ci;
u_int mask;
softint_t *si;
lwp_t *l;
ci = curcpu();
si = ((softcpu_t *)ci->ci_data.cpu_softcpu)->sc_int;
mask = ci->ci_data.cpu_softints;
if ((mask & (1 << SOFTINT_SERIAL)) != 0) {
l = si[SOFTINT_SERIAL].si_lwp;
} else if ((mask & (1 << SOFTINT_NET)) != 0) {
l = si[SOFTINT_NET].si_lwp;
} else if ((mask & (1 << SOFTINT_BIO)) != 0) {
l = si[SOFTINT_BIO].si_lwp;
} else if ((mask & (1 << SOFTINT_CLOCK)) != 0) {
l = si[SOFTINT_CLOCK].si_lwp;
} else {
panic("softint_picklwp");
}
return l;
}
#else /* !__HAVE_FAST_SOFTINTS */
/*
* softint_thread:
*
* Fast path: the LWP is switched to without restoring any state,
* so we should not arrive here - there is a direct handoff between
* the interrupt stub and softint_dispatch().
*/
void
softint_thread(void *cookie)
{
panic("softint_thread");
}
/*
* softint_dispatch:
*
* Fast path: entry point from machine-dependent code.
*/
void
softint_dispatch(lwp_t *pinned, int s)
{
struct bintime now;
u_int timing;
lwp_t *l;
#ifdef DIAGNOSTIC
if ((pinned->l_pflag & LP_RUNNING) == 0 || curlwp->l_stat != LSIDL) {
struct lwp *onproc = curcpu()->ci_onproc;
int s2 = splhigh();
printf("curcpu=%d, spl=%d curspl=%d\n"
"onproc=%p => l_stat=%d l_flag=%08x l_cpu=%d\n"
"curlwp=%p => l_stat=%d l_flag=%08x l_cpu=%d\n"
"pinned=%p => l_stat=%d l_flag=%08x l_cpu=%d\n",
cpu_index(curcpu()), s, s2, onproc, onproc->l_stat,
onproc->l_flag, cpu_index(onproc->l_cpu), curlwp,
curlwp->l_stat, curlwp->l_flag,
cpu_index(curlwp->l_cpu), pinned, pinned->l_stat,
pinned->l_flag, cpu_index(pinned->l_cpu));
splx(s2);
panic("softint screwup");
}
#endif
/*
* Note the interrupted LWP, and mark the current LWP as running
* before proceeding. Although this must as a rule be done with
* the LWP locked, at this point no external agents will want to
* modify the interrupt LWP's state.
*/
timing = softint_timing;
l = curlwp;
l->l_switchto = pinned;
l->l_stat = LSONPROC;
/*
* Dispatch the interrupt. If softints are being timed, charge
* for it.
*/
if (timing) {
binuptime(&l->l_stime);
membar_producer(); /* for calcru */
l->l_pflag |= LP_TIMEINTR;
}
l->l_pflag |= LP_RUNNING;
softint_execute(l, s);
if (timing) {
binuptime(&now);
updatertime(l, &now);
l->l_pflag &= ~LP_TIMEINTR;
}
/*
* If we blocked while handling the interrupt, the pinned LWP is
* gone and we are now running as a kthread, so find another LWP to
* run. softint_dispatch() won't be reentered until the priority is
* finally dropped to IPL_NONE on entry to the next LWP on this CPU.
*/
l->l_stat = LSIDL;
if (l->l_switchto == NULL) {
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
/* NOTREACHED */
}
l->l_switchto = NULL;
l->l_pflag &= ~LP_RUNNING;
}
#endif /* !__HAVE_FAST_SOFTINTS */
/* $NetBSD: prop_number.c,v 1.34 2022/08/03 21:13:46 riastradh Exp $ */
/*-
* Copyright (c) 2006, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "prop_object_impl.h"
#include <prop/prop_number.h>
#include <sys/rbtree.h>
#if defined(_KERNEL)
#include <sys/systm.h>
#elif defined(_STANDALONE)
#include <sys/param.h>
#include <lib/libkern/libkern.h>
#else
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#endif
struct _prop_number_value {
union {
int64_t pnu_signed;
uint64_t pnu_unsigned;
} pnv_un;
#define pnv_signed pnv_un.pnu_signed
#define pnv_unsigned pnv_un.pnu_unsigned
unsigned int pnv_is_unsigned :1,
:31;
};
struct _prop_number {
struct _prop_object pn_obj;
struct rb_node pn_link;
struct _prop_number_value pn_value;
};
_PROP_POOL_INIT(_prop_number_pool, sizeof(struct _prop_number), "propnmbr")
static _prop_object_free_rv_t
_prop_number_free(prop_stack_t, prop_object_t *);
static bool _prop_number_externalize(
struct _prop_object_externalize_context *,
void *);
static _prop_object_equals_rv_t
_prop_number_equals(prop_object_t, prop_object_t,
void **, void **,
prop_object_t *, prop_object_t *);
static void _prop_number_lock(void);
static void _prop_number_unlock(void);
static const struct _prop_object_type _prop_object_type_number = {
.pot_type = PROP_TYPE_NUMBER,
.pot_free = _prop_number_free,
.pot_extern = _prop_number_externalize,
.pot_equals = _prop_number_equals,
.pot_lock = _prop_number_lock,
.pot_unlock = _prop_number_unlock,
};
#define prop_object_is_number(x) \
((x) != NULL && (x)->pn_obj.po_type == &_prop_object_type_number)
/*
* Number objects are immutable, and we are likely to have many number
* objects that have the same value. So, to save memory, we unique'ify
* numbers so we only have one copy of each.
*/
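/*
 * For example (a sketch, not part of the library itself):
 *
 *	prop_number_t a = prop_number_create_signed(42);
 *	prop_number_t b = prop_number_create_signed(42);
 *
 * yields a == b: the second call finds the existing object in the tree
 * and merely retains it, so each handle must still be released with
 * prop_object_release().
 */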
static int
_prop_number_compare_values(const struct _prop_number_value *pnv1,
const struct _prop_number_value *pnv2)
{
/* Signed numbers are sorted before unsigned numbers. */
if (pnv1->pnv_is_unsigned) {
if (! pnv2->pnv_is_unsigned)
return (1);
if (pnv1->pnv_unsigned < pnv2->pnv_unsigned)
return (-1);
if (pnv1->pnv_unsigned > pnv2->pnv_unsigned)
return (1);
return (0);
}
if (pnv2->pnv_is_unsigned)
return (-1);
if (pnv1->pnv_signed < pnv2->pnv_signed)
return (-1);
if (pnv1->pnv_signed > pnv2->pnv_signed)
return (1);
return (0);
}
static int
/*ARGSUSED*/
_prop_number_rb_compare_nodes(void *ctx _PROP_ARG_UNUSED,
const void *n1, const void *n2)
{
const struct _prop_number *pn1 = n1;
const struct _prop_number *pn2 = n2;
return _prop_number_compare_values(&pn1->pn_value, &pn2->pn_value);
}
static int
/*ARGSUSED*/
_prop_number_rb_compare_key(void *ctx _PROP_ARG_UNUSED,
const void *n, const void *v)
{
const struct _prop_number *pn = n;
const struct _prop_number_value *pnv = v;
return _prop_number_compare_values(&pn->pn_value, pnv);
}
static const rb_tree_ops_t _prop_number_rb_tree_ops = {
.rbto_compare_nodes = _prop_number_rb_compare_nodes,
.rbto_compare_key = _prop_number_rb_compare_key,
.rbto_node_offset = offsetof(struct _prop_number, pn_link),
.rbto_context = NULL
};
static struct rb_tree _prop_number_tree;
_PROP_MUTEX_DECL_STATIC(_prop_number_tree_mutex)
/* ARGSUSED */
static _prop_object_free_rv_t
_prop_number_free(prop_stack_t stack, prop_object_t *obj)
{
prop_number_t pn = *obj;
rb_tree_remove_node(&_prop_number_tree, pn);
_PROP_POOL_PUT(_prop_number_pool, pn);
return (_PROP_OBJECT_FREE_DONE);
}
_PROP_ONCE_DECL(_prop_number_init_once)
static int
_prop_number_init(void)
{
_PROP_MUTEX_INIT(_prop_number_tree_mutex);
rb_tree_init(&_prop_number_tree, &_prop_number_rb_tree_ops);
return 0;
}
static void
_prop_number_lock(void)
{
/* XXX: init necessary? */
_PROP_ONCE_RUN(_prop_number_init_once, _prop_number_init);
_PROP_MUTEX_LOCK(_prop_number_tree_mutex);
}
static void
_prop_number_unlock(void)
{
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
}
static bool
_prop_number_externalize(struct _prop_object_externalize_context *ctx,
void *v)
{
prop_number_t pn = v;
char tmpstr[32];
/*
* For unsigned numbers, we output in hex. For signed numbers,
* we output in decimal.
*/
if (pn->pn_value.pnv_is_unsigned)
snprintf(tmpstr, sizeof(tmpstr), "0x%" PRIx64,
pn->pn_value.pnv_unsigned);
else
snprintf(tmpstr, sizeof(tmpstr), "%" PRIi64,
pn->pn_value.pnv_signed);
if (_prop_object_externalize_start_tag(ctx, "integer") == false ||
_prop_object_externalize_append_cstring(ctx, tmpstr) == false ||
_prop_object_externalize_end_tag(ctx, "integer") == false)
return (false);
return (true);
}
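/*
 * For example, an unsigned number holding 3735928559 is externalized
 * as <integer>0xdeadbeef</integer>, while a signed -42 becomes
 * <integer>-42</integer>.
 */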
/* ARGSUSED */
static _prop_object_equals_rv_t
_prop_number_equals(prop_object_t v1, prop_object_t v2,
void **stored_pointer1, void **stored_pointer2,
prop_object_t *next_obj1, prop_object_t *next_obj2)
{
prop_number_t num1 = v1;
prop_number_t num2 = v2;
/*
* There is only ever one copy of a number object at any given
* time, so we can reduce this to a simple pointer equality check
* in the common case.
*/
if (num1 == num2)
return (_PROP_OBJECT_EQUALS_TRUE);
/*
* If the numbers are the same signed-ness, then we know they
* cannot be equal because they would have had pointer equality.
*/
if (num1->pn_value.pnv_is_unsigned == num2->pn_value.pnv_is_unsigned)
return (_PROP_OBJECT_EQUALS_FALSE);
/*
* We now have one signed value and one unsigned value. We can
* compare them iff:
* - The unsigned value is not larger than the signed value
* can represent.
* - The signed value is not smaller than the unsigned value
* can represent.
*/
if (num1->pn_value.pnv_is_unsigned) {
/*
* num1 is unsigned and num2 is signed.
*/
if (num1->pn_value.pnv_unsigned > INTMAX_MAX)
return (_PROP_OBJECT_EQUALS_FALSE);
if (num2->pn_value.pnv_signed < 0)
return (_PROP_OBJECT_EQUALS_FALSE);
} else {
/*
* num1 is signed and num2 is unsigned.
*/
if (num1->pn_value.pnv_signed < 0)
return (_PROP_OBJECT_EQUALS_FALSE);
if (num2->pn_value.pnv_unsigned > INTMAX_MAX)
return (_PROP_OBJECT_EQUALS_FALSE);
}
if (num1->pn_value.pnv_signed == num2->pn_value.pnv_signed)
return _PROP_OBJECT_EQUALS_TRUE;
else
return _PROP_OBJECT_EQUALS_FALSE;
}
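/*
 * For example, an unsigned number holding 5 compares equal to a signed
 * number holding 5, while an unsigned number holding a value greater
 * than INTMAX_MAX can never compare equal to any signed number.
 */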
static prop_number_t
_prop_number_alloc(const struct _prop_number_value *pnv)
{
prop_number_t opn, pn, rpn;
_PROP_ONCE_RUN(_prop_number_init_once, _prop_number_init);
/*
* Check to see if this already exists in the tree. If it does,
* we just retain it and return it.
*/
_PROP_MUTEX_LOCK(_prop_number_tree_mutex);
opn = rb_tree_find_node(&_prop_number_tree, pnv);
if (opn != NULL) {
prop_object_retain(opn);
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
return (opn);
}
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
/*
* Not in the tree. Create it now.
*/
pn = _PROP_POOL_GET(_prop_number_pool);
if (pn == NULL)
return (NULL);
_prop_object_init(&pn->pn_obj, &_prop_object_type_number);
pn->pn_value = *pnv;
/*
* We dropped the mutex when we allocated the new object, so
* we have to check again if it is in the tree.
*/
_PROP_MUTEX_LOCK(_prop_number_tree_mutex);
opn = rb_tree_find_node(&_prop_number_tree, pnv);
if (opn != NULL) {
prop_object_retain(opn);
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
_PROP_POOL_PUT(_prop_number_pool, pn);
return (opn);
}
rpn = rb_tree_insert_node(&_prop_number_tree, pn);
_PROP_ASSERT(rpn == pn);
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
return (rpn);
}
/*
* prop_number_create_signed --
* Create a prop_number_t and initialize it with the
* provided signed value.
*/
prop_number_t
prop_number_create_signed(intmax_t val)
{
struct _prop_number_value pnv;
memset(&pnv, 0, sizeof(pnv));
pnv.pnv_signed = val;
pnv.pnv_is_unsigned = false;
return (_prop_number_alloc(&pnv));
}
_PROP_DEPRECATED(prop_number_create_integer,
"this program uses prop_number_create_integer(), "
"which is deprecated; use prop_number_create_signed() instead.")
prop_number_t
prop_number_create_integer(int64_t val)
{
return prop_number_create_signed(val);
}
/*
* prop_number_create_unsigned --
* Create a prop_number_t and initialize it with the
* provided unsigned value.
*/
prop_number_t
prop_number_create_unsigned(uintmax_t val)
{
struct _prop_number_value pnv;
memset(&pnv, 0, sizeof(pnv));
pnv.pnv_unsigned = val;
pnv.pnv_is_unsigned = true;
return (_prop_number_alloc(&pnv));
}
_PROP_DEPRECATED(prop_number_create_unsigned_integer,
"this program uses prop_number_create_unsigned_integer(), "
"which is deprecated; use prop_number_create_unsigned() instead.")
prop_number_t
prop_number_create_unsigned_integer(uint64_t val)
{
return prop_number_create_unsigned(val);
}
/*
* prop_number_copy --
* Copy a prop_number_t.
*/
prop_number_t
prop_number_copy(prop_number_t opn)
{
if (! prop_object_is_number(opn))
return (NULL);
/*
* Because we only ever allocate one object for any given
* value, this can be reduced to a simple retain operation.
*/
prop_object_retain(opn);
return (opn);
}
/*
* prop_number_unsigned --
* Returns true if the prop_number_t has an unsigned value.
*/
bool
prop_number_unsigned(prop_number_t pn)
{
return (pn->pn_value.pnv_is_unsigned);
}
/*
* prop_number_size --
* Return the size, in bits, required to hold the value of
* the specified number.
*/
int
prop_number_size(prop_number_t pn)
{
struct _prop_number_value *pnv;
if (! prop_object_is_number(pn))
return (0);
pnv = &pn->pn_value;
if (pnv->pnv_is_unsigned) {
if (pnv->pnv_unsigned > UINT32_MAX)
return (64);
if (pnv->pnv_unsigned > UINT16_MAX)
return (32);
if (pnv->pnv_unsigned > UINT8_MAX)
return (16);
return (8);
}
if (pnv->pnv_signed > INT32_MAX || pnv->pnv_signed < INT32_MIN)
return (64);
if (pnv->pnv_signed > INT16_MAX || pnv->pnv_signed < INT16_MIN)
return (32);
if (pnv->pnv_signed > INT8_MAX || pnv->pnv_signed < INT8_MIN)
return (16);
return (8);
}
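/*
 * For example, a signed number created from 300 needs 16 bits (it
 * exceeds INT8_MAX but fits in an int16_t), while an unsigned value
 * greater than UINT32_MAX needs 64.
 */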
/*
* prop_number_signed_value --
* Get the signed value of a prop_number_t.
*/
intmax_t
prop_number_signed_value(prop_number_t pn)
{
/*
* XXX Impossible to distinguish between "not a prop_number_t"
* XXX and "prop_number_t has a value of 0".
*/
if (! prop_object_is_number(pn))
return (0);
return (pn->pn_value.pnv_signed);
}
_PROP_DEPRECATED(prop_number_integer_value,
"this program uses prop_number_integer_value(), "
"which is deprecated; use prop_number_signed_value() instead.")
int64_t
prop_number_integer_value(prop_number_t pn)
{
return prop_number_signed_value(pn);
}
/*
* prop_number_unsigned_value --
* Get the unsigned value of a prop_number_t.
*/
uintmax_t
prop_number_unsigned_value(prop_number_t pn)
{
/*
* XXX Impossible to distinguish between "not a prop_number_t"
* XXX and "prop_number_t has a value of 0".
*/
if (! prop_object_is_number(pn))
return (0);
return (pn->pn_value.pnv_unsigned);
}
_PROP_DEPRECATED(prop_number_unsigned_integer_value,
"this program uses prop_number_unsigned_integer_value(), "
"which is deprecated; use prop_number_unsigned_value() instead.")
uint64_t
prop_number_unsigned_integer_value(prop_number_t pn)
{
return prop_number_unsigned_value(pn);
}
/*
* prop_number_[...]_value --
* Retrieve the bounds-checked value as the specified type.
* Returns true if successful.
*/
#define TEMPLATE(name, typ, minv, maxv) \
bool \
prop_number_ ## name ## _value(prop_number_t pn, typ * const valp) \
{ \
\
if (! prop_object_is_number(pn)) \
return (false); \
\
if (pn->pn_value.pnv_is_unsigned) { \
if (pn->pn_value.pnv_unsigned > (maxv)) \
return (false); \
*valp = (typ) pn->pn_value.pnv_unsigned; \
} else { \
if ((pn->pn_value.pnv_signed > 0 && \
(uintmax_t)pn->pn_value.pnv_signed > (maxv)) || \
pn->pn_value.pnv_signed < (minv)) \
return (false); \
*valp = (typ) pn->pn_value.pnv_signed; \
} \
\
return (true); \
}
TEMPLATE(schar, signed char, SCHAR_MIN, SCHAR_MAX)
TEMPLATE(short, short, SHRT_MIN, SHRT_MAX)
TEMPLATE(int, int, INT_MIN, INT_MAX)
TEMPLATE(long, long, LONG_MIN, LONG_MAX)
TEMPLATE(longlong, long long, LLONG_MIN, LLONG_MAX)
TEMPLATE(intptr, intptr_t, INTPTR_MIN, INTPTR_MAX)
TEMPLATE(int8, int8_t, INT8_MIN, INT8_MAX)
TEMPLATE(int16, int16_t, INT16_MIN, INT16_MAX)
TEMPLATE(int32, int32_t, INT32_MIN, INT32_MAX)
TEMPLATE(int64, int64_t, INT64_MIN, INT64_MAX)
TEMPLATE(uchar, unsigned char, 0, UCHAR_MAX)
TEMPLATE(ushort, unsigned short, 0, USHRT_MAX)
TEMPLATE(uint, unsigned int, 0, UINT_MAX)
TEMPLATE(ulong, unsigned long, 0, ULONG_MAX)
TEMPLATE(ulonglong, unsigned long long, 0, ULLONG_MAX)
TEMPLATE(uintptr, uintptr_t, 0, UINTPTR_MAX)
TEMPLATE(uint8, uint8_t, 0, UINT8_MAX)
TEMPLATE(uint16, uint16_t, 0, UINT16_MAX)
TEMPLATE(uint32, uint32_t, 0, UINT32_MAX)
TEMPLATE(uint64, uint64_t, 0, UINT64_MAX)
#undef TEMPLATE
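/*
 * Usage sketch for the generated accessors (values illustrative):
 *
 *	prop_number_t pn = prop_number_create_signed(300);
 *	uint8_t u8;
 *	int16_t s16;
 *
 *	prop_number_uint8_value(pn, &u8);	returns false: 300 > UINT8_MAX
 *	prop_number_int16_value(pn, &s16);	returns true, s16 == 300
 *
 *	prop_object_release(pn);
 */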
/*
* prop_number_equals --
* Return true if two numbers are equivalent.
*/
bool
prop_number_equals(prop_number_t num1, prop_number_t num2)
{
if (!prop_object_is_number(num1) || !prop_object_is_number(num2))
return (false);
return (prop_object_equals(num1, num2));
}
/*
* prop_number_equals_signed --
* Return true if the number is equivalent to the specified signed
* value.
*/
bool
prop_number_equals_signed(prop_number_t pn, intmax_t val)
{
if (! prop_object_is_number(pn))
return (false);
if (pn->pn_value.pnv_is_unsigned &&
(pn->pn_value.pnv_unsigned > INTMAX_MAX || val < 0))
return (false);
return (pn->pn_value.pnv_signed == val);
}
_PROP_DEPRECATED(prop_number_equals_integer,
"this program uses prop_number_equals_integer(), "
"which is deprecated; use prop_number_equals_signed() instead.")
bool
prop_number_equals_integer(prop_number_t pn, int64_t val)
{
return prop_number_equals_signed(pn, val);
}
/*
* prop_number_equals_unsigned --
* Return true if the number is equivalent to the specified
* unsigned value.
*/
bool
prop_number_equals_unsigned(prop_number_t pn, uintmax_t val)
{
if (! prop_object_is_number(pn))
return (false);
if (! pn->pn_value.pnv_is_unsigned &&
(pn->pn_value.pnv_signed < 0 || val > INT64_MAX))
return (false);
return (pn->pn_value.pnv_unsigned == val);
}
_PROP_DEPRECATED(prop_number_equals_unsigned_integer,
"this program uses prop_number_equals_unsigned_integer(), "
"which is deprecated; use prop_number_equals_unsigned() instead.")
bool
prop_number_equals_unsigned_integer(prop_number_t pn, uint64_t val)
{
return prop_number_equals_unsigned(pn, val);
}
static bool
_prop_number_internalize_unsigned(struct _prop_object_internalize_context *ctx,
struct _prop_number_value *pnv)
{
char *cp;
_PROP_ASSERT(/*CONSTCOND*/sizeof(unsigned long long) ==
sizeof(uint64_t));
#ifndef _KERNEL
errno = 0;
#endif
pnv->pnv_unsigned = (uint64_t) strtoull(ctx->poic_cp, &cp, 0);
#ifndef _KERNEL /* XXX can't check for ERANGE in the kernel */
if (pnv->pnv_unsigned == UINT64_MAX && errno == ERANGE)
return (false);
#endif
pnv->pnv_is_unsigned = true;
ctx->poic_cp = cp;
return (true);
}
static bool
_prop_number_internalize_signed(struct _prop_object_internalize_context *ctx,
struct _prop_number_value *pnv)
{
char *cp;
_PROP_ASSERT(/*CONSTCOND*/sizeof(long long) == sizeof(int64_t));
#ifndef _KERNEL
errno = 0;
#endif
pnv->pnv_signed = (int64_t) strtoll(ctx->poic_cp, &cp, 0);
#ifndef _KERNEL /* XXX can't check for ERANGE in the kernel */
if ((pnv->pnv_signed == INT64_MAX || pnv->pnv_signed == INT64_MIN) &&
errno == ERANGE)
return (false);
#endif
pnv->pnv_is_unsigned = false;
ctx->poic_cp = cp;
return (true);
}
/*
* _prop_number_internalize --
* Parse a <number>...</number> and return the object created from
* the external representation.
*/
/* ARGSUSED */
bool
_prop_number_internalize(prop_stack_t stack, prop_object_t *obj,
struct _prop_object_internalize_context *ctx)
{
struct _prop_number_value pnv;
memset(&pnv, 0, sizeof(pnv));
/* No attributes, no empty elements. */
if (ctx->poic_tagattr != NULL || ctx->poic_is_empty_element)
return (true);
/*
* If the first character is '-', then we treat as signed.
* If the first two characters are "0x" (i.e. the number is
* in hex), then we treat as unsigned. Otherwise, we try
* signed first, and if that fails (presumably due to ERANGE),
* then we switch to unsigned.
*/
if (ctx->poic_cp[0] == '-') {
if (_prop_number_internalize_signed(ctx, &pnv) == false)
return (true);
} else if (ctx->poic_cp[0] == '0' && ctx->poic_cp[1] == 'x') {
if (_prop_number_internalize_unsigned(ctx, &pnv) == false)
return (true);
} else {
if (_prop_number_internalize_signed(ctx, &pnv) == false &&
_prop_number_internalize_unsigned(ctx, &pnv) == false)
return (true);
}
if (_prop_object_internalize_find_tag(ctx, "integer",
_PROP_TAG_TYPE_END) == false)
return (true);
*obj = _prop_number_alloc(&pnv);
return (true);
}
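/*
 * For example, "<integer>-42</integer>" is internalized as a signed
 * value and "<integer>0xdeadbeef</integer>" as unsigned, while a
 * decimal constant too large for intmax_t, such as
 * "<integer>18446744073709551615</integer>", falls back to the
 * unsigned parser (outside the kernel, where ERANGE can be detected).
 */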
/* $NetBSD: kern_ktrace_vfs.c,v 1.3 2021/06/29 22:40:53 dholland Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_ktrace.c 8.5 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_ktrace_vfs.c,v 1.3 2021/06/29 22:40:53 dholland Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/ktrace.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
/*
* ktrace system call, the part of the ktrace framework that
* explicitly interacts with VFS
*/
/* ARGSUSED */
int
sys_ktrace(struct lwp *l, const struct sys_ktrace_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) fname;
syscallarg(int) ops;
syscallarg(int) facs;
syscallarg(int) pid;
} */
struct vnode *vp = NULL;
file_t *fp = NULL;
struct pathbuf *pb;
int error = 0;
int fd;
if (ktrenter(l))
return EAGAIN;
if (KTROP(SCARG(uap, ops)) != KTROP_CLEAR) {
/*
* an operation which requires a file argument.
*/
error = pathbuf_copyin(SCARG(uap, fname), &pb);
if (error) {
ktrexit(l);
return (error);
}
error = vn_open(NULL, pb, 0, FREAD|FWRITE, 0, &vp, NULL, NULL);
if (error != 0) {
pathbuf_destroy(pb);
ktrexit(l);
return (error);
}
pathbuf_destroy(pb);
VOP_UNLOCK(vp);
if (vp->v_type != VREG) {
vn_close(vp, FREAD|FWRITE, l->l_cred);
ktrexit(l);
return (EACCES);
}
/*
* This uses up a file descriptor slot in the
* tracing process for the duration of this syscall.
* This is not expected to be a problem.
*/
if ((error = fd_allocfile(&fp, &fd)) != 0) {
vn_close(vp, FWRITE, l->l_cred);
ktrexit(l);
return error;
}
fp->f_flag = FWRITE;
fp->f_type = DTYPE_VNODE;
fp->f_ops = &vnops;
fp->f_vnode = vp;
vp = NULL;
}
error = ktrace_common(l, SCARG(uap, ops), SCARG(uap, facs),
SCARG(uap, pid), &fp);
if (KTROP(SCARG(uap, ops)) != KTROP_CLEAR)
fd_abort(curproc, fp, fd);
return (error);
}
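/*
 * Userland usage sketch (illustrative only; see ktrace(2) for the
 * authoritative interface):
 *
 *	ktrace("ktrace.out", KTROP_SET, KTRFAC_SYSCALL | KTRFAC_SYSRET,
 *	    getpid());
 *	... traced activity ...
 *	ktrace(NULL, KTROP_CLEAR, 0, getpid());
 *
 * The KTROP_CLEAR case skips the file handling above entirely, which
 * is why fname is not used for it.
 */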
/* $NetBSD: uvm_page.c,v 1.256 2024/03/05 14:33:50 thorpej Exp $ */
/*-
* Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_page.c 8.3 (Berkeley) 3/21/94
* from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_page.c: page ops.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.256 2024/03/05 14:33:50 thorpej Exp $");
#include "opt_ddb.h"
#include "opt_uvm.h"
#include "opt_uvmhist.h"
#include "opt_readahead.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/radixtree.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <ddb/db_active.h>
#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pgflcache.h>
/*
* number of pages per-CPU to reserve for the kernel.
*/
#ifndef UVM_RESERVED_PAGES_PER_CPU
#define UVM_RESERVED_PAGES_PER_CPU 5
#endif
int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
/*
* physical memory size;
*/
psize_t physmem;
/*
* local variables
*/
/*
* these variables record the values returned by vm_page_bootstrap,
* for debugging purposes. The implementation of uvm_pageboot_alloc
* and pmap_startup here also uses them internally.
*/
static vaddr_t virtual_space_start;
static vaddr_t virtual_space_end;
/*
* we allocate an initial number of page colors in uvm_page_init(),
* and remember them. We may re-color pages as cache sizes are
* discovered during the autoconfiguration phase. But we can never
* free the initial set of buckets, since they are allocated using
* uvm_pageboot_alloc().
*/
static size_t recolored_pages_memsize /* = 0 */;
static char *recolored_pages_mem;
/*
* freelist locks - one per bucket.
*/
union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS]
__cacheline_aligned;
/*
* basic NUMA information.
*/
static struct uvm_page_numa_region {
struct uvm_page_numa_region *next;
paddr_t start;
paddr_t size;
u_int numa_id;
} *uvm_page_numa_region;
#ifdef DEBUG
kmutex_t uvm_zerochecklock __cacheline_aligned;
vaddr_t uvm_zerocheckkva;
#endif /* DEBUG */
/*
* These functions are reserved for uvm(9) internal use and are not
* exported in the header file uvm_physseg.h
*
* Thus they are redefined here.
*/
void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
/* returns a pgs array */
struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
/*
* inline functions
*/
/*
* uvm_pageinsert: insert a page in the object.
*
* => caller must lock object
* => call should have already set pg's object and offset pointers
* and bumped the version counter
*/
static inline void
uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
{
KASSERT(uobj == pg->uobject);
KASSERT(rw_write_held(uobj->vmobjlock));
KASSERT((pg->flags & PG_TABLED) == 0);
if ((pg->flags & PG_STAT) != 0) {
/* Cannot use uvm_pagegetdirty(): not yet in radix tree. */
const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
if ((pg->flags & PG_FILE) != 0) {
if (uobj->uo_npages == 0) {
struct vnode *vp = (struct vnode *)uobj;
mutex_enter(vp->v_interlock);
KASSERT((vp->v_iflag & VI_PAGES) == 0);
vp->v_iflag |= VI_PAGES;
vholdl(vp);
mutex_exit(vp->v_interlock);
}
if (UVM_OBJ_IS_VTEXT(uobj)) {
cpu_count(CPU_COUNT_EXECPAGES, 1);
}
cpu_count(CPU_COUNT_FILEUNKNOWN + status, 1);
} else {
cpu_count(CPU_COUNT_ANONUNKNOWN + status, 1);
}
}
pg->flags |= PG_TABLED;
uobj->uo_npages++;
}
static inline int
uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
{
const uint64_t idx = pg->offset >> PAGE_SHIFT;
int error;
KASSERT(rw_write_held(uobj->vmobjlock));
error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
if (error != 0) {
return error;
}
if ((pg->flags & PG_CLEAN) == 0) {
uvm_obj_page_set_dirty(pg);
}
KASSERT(((pg->flags & PG_CLEAN) == 0) ==
uvm_obj_page_dirty_p(pg));
return 0;
}
/*
* uvm_page_remove: remove page from object.
*
* => caller must lock object
*/
static inline void
uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
{
KASSERT(uobj == pg->uobject);
KASSERT(rw_write_held(uobj->vmobjlock));
KASSERT(pg->flags & PG_TABLED);
if ((pg->flags & PG_STAT) != 0) {
/* Cannot use uvm_pagegetdirty(): no longer in radix tree. */
const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
if ((pg->flags & PG_FILE) != 0) {
if (uobj->uo_npages == 1) {
struct vnode *vp = (struct vnode *)uobj;
mutex_enter(vp->v_interlock);
KASSERT((vp->v_iflag & VI_PAGES) != 0);
vp->v_iflag &= ~VI_PAGES;
holdrelel(vp);
mutex_exit(vp->v_interlock);
}
if (UVM_OBJ_IS_VTEXT(uobj)) {
cpu_count(CPU_COUNT_EXECPAGES, -1);
}
cpu_count(CPU_COUNT_FILEUNKNOWN + status, -1);
} else {
cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
}
}
uobj->uo_npages--;
pg->flags &= ~PG_TABLED;
pg->uobject = NULL;
}
static inline void
uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
{
struct vm_page *opg __unused;
KASSERT(rw_write_held(uobj->vmobjlock));
opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
KASSERT(pg == opg);
}
static void
uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
{
int i;
pgb->pgb_nfree = 0;
for (i = 0; i < uvmexp.ncolors; i++) {
LIST_INIT(&pgb->pgb_colors[i]);
}
pgfl->pgfl_buckets[num] = pgb;
}
/*
* uvm_page_init: init the page system. called from uvm_init().
*
* => we return the range of kernel virtual memory in kvm_startp/kvm_endp
*/
void
uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
{
static struct uvm_cpu uvm_boot_cpu __cacheline_aligned;
psize_t freepages, pagecount, bucketsize, n;
struct pgflbucket *pgb;
struct vm_page *pagearray;
char *bucketarray;
uvm_physseg_t bank;
int fl, b;
KASSERT(ncpu <= 1);
/*
* init the page queues and free page queue locks, except the
* free list; we allocate that later (with the initial vm_page
* structures).
*/
curcpu()->ci_data.cpu_uvm = &uvm_boot_cpu;
uvmpdpol_init();
for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
}
/*
* allocate vm_page structures.
*/
/*
* sanity check:
* before calling this function the MD code is expected to register
* some free RAM with the uvm_page_physload() function. our job
* now is to allocate vm_page structures for this memory.
*/
if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
panic("uvm_page_bootstrap: no memory pre-allocated");
/*
* first calculate the number of free pages...
*
* note that we use start/end rather than avail_start/avail_end.
* this allows us to allocate extra vm_page structures in case we
* want to return some memory to the pool after booting.
*/
freepages = 0;
for (bank = uvm_physseg_get_first();
uvm_physseg_valid_p(bank) ;
bank = uvm_physseg_get_next(bank)) {
freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
}
/*
* Let MD code initialize the number of colors, or default
* to 1 color if MD code doesn't care.
*/
if (uvmexp.ncolors == 0)
uvmexp.ncolors = 1;
uvmexp.colormask = uvmexp.ncolors - 1;
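/* ncolors must be a power of two so that colormask works as a bit mask;
* the KASSERT below checks exactly that. */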
KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
/* We always start with only 1 bucket. */
uvm.bucketcount = 1;
/*
* we now know we have (PAGE_SIZE * freepages) bytes of memory we can
* use. for each page of memory we use we need a vm_page structure.
* thus, the total number of pages we can use is the total size of
* the memory divided by the PAGE_SIZE plus the size of the vm_page
* structure. we add one to freepages as a fudge factor to avoid
* truncation errors (since we can only allocate in terms of whole
* pages).
*/
pagecount = ((freepages + 1) << PAGE_SHIFT) /
(PAGE_SIZE + sizeof(struct vm_page));
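/*
 * Worked example (figures illustrative only): with 4 KiB pages and a
 * vm_page of roughly 128 bytes, each managed page costs PAGE_SIZE bytes
 * for the page itself plus sizeof(struct vm_page) for its descriptor,
 * so pagecount comes out a few percent smaller than freepages.
 */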
bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
bucketsize = roundup2(bucketsize, coherency_unit);
bucketarray = (void *)uvm_pageboot_alloc(
bucketsize * VM_NFREELIST +
pagecount * sizeof(struct vm_page));
pagearray = (struct vm_page *)
(bucketarray + bucketsize * VM_NFREELIST);
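/*
 * The single boot-time allocation above is carved up: one bucket per
 * freelist first, followed by the vm_page array itself.
 */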
for (fl = 0; fl < VM_NFREELIST; fl++) {
pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
}
memset(pagearray, 0, pagecount * sizeof(struct vm_page));
/*
* init the freelist cache in the disabled state.
*/
uvm_pgflcache_init();
/*
* init the vm_page structures and put them in the correct place.
*/
/* First init the extent */
for (bank = uvm_physseg_get_first(),
uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
uvm_physseg_valid_p(bank);
bank = uvm_physseg_get_next(bank)) {
n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
uvm_physseg_seg_alloc_from_slab(bank, n);
uvm_physseg_init_seg(bank, pagearray);
/* set up page array pointers */
pagearray += n;
pagecount -= n;
}
/*
* pass up the values of virtual_space_start and
* virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
* layers of the VM.
*/
*kvm_startp = round_page(virtual_space_start);
*kvm_endp = trunc_page(virtual_space_end);
/*
* init various thresholds.
*/
uvmexp.reserve_pagedaemon = 1;
uvmexp.reserve_kernel = vm_page_reserve_kernel;
/*
* done!
*/
uvm.page_init_done = true;
}
/*
* uvm_pgfl_lock: lock all freelist buckets
*/
void
uvm_pgfl_lock(void)
{
int i;
for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
mutex_spin_enter(&uvm_freelist_locks[i].lock);
}
}
/*
* uvm_pgfl_unlock: unlock all freelist buckets
*/
void
uvm_pgfl_unlock(void)
{
int i;
for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
mutex_spin_exit(&uvm_freelist_locks[i].lock);
}
}
/*
* uvm_setpagesize: set the page size
*
* => sets uvmexp.pageshift and uvmexp.pagemask from uvmexp.pagesize.
*/
void
uvm_setpagesize(void)
{
/*
* If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
* to be a constant (indicated by being a non-zero value).
*/
if (uvmexp.pagesize == 0) {
if (PAGE_SIZE == 0)
panic("uvm_setpagesize: uvmexp.pagesize not set");
uvmexp.pagesize = PAGE_SIZE;
}
uvmexp.pagemask = uvmexp.pagesize - 1;
if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
panic("uvm_setpagesize: page size %u (%#x) not a power of two",
uvmexp.pagesize, uvmexp.pagesize);
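/* Derive pageshift as log2(pagesize) by linear search, e.g. 4096 -> 12. */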
for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
break;
}
/*
* uvm_pageboot_alloc: steal memory from physmem for bootstrapping
*/
vaddr_t
uvm_pageboot_alloc(vsize_t size)
{
static bool initialized = false;
vaddr_t addr;
#if !defined(PMAP_STEAL_MEMORY)
vaddr_t vaddr;
paddr_t paddr;
#endif
/*
* on first call to this function, initialize ourselves.
*/
if (initialized == false) {
pmap_virtual_space(&virtual_space_start, &virtual_space_end);
/* round it the way we like it */
virtual_space_start = round_page(virtual_space_start);
virtual_space_end = trunc_page(virtual_space_end);
initialized = true;
}
/* round to page size */
size = round_page(size);
uvmexp.bootpages += atop(size);
#if defined(PMAP_STEAL_MEMORY)
/*
* defer bootstrap allocation to MD code (it may want to allocate
* from a direct-mapped segment). pmap_steal_memory should adjust
* virtual_space_start/virtual_space_end if necessary.
*/
addr = pmap_steal_memory(size, &virtual_space_start,
&virtual_space_end);
return addr;
#else /* !PMAP_STEAL_MEMORY */
/*
* allocate virtual memory for this request
*/
if (virtual_space_start == virtual_space_end ||
(virtual_space_end - virtual_space_start) < size)
panic("uvm_pageboot_alloc: out of virtual space");
addr = virtual_space_start;
#ifdef PMAP_GROWKERNEL
/*
* If the kernel pmap can't map the requested space,
* then allocate more resources for it.
*/
if (uvm_maxkaddr < (addr + size)) {
uvm_maxkaddr = pmap_growkernel(addr + size);
if (uvm_maxkaddr < (addr + size))
panic("uvm_pageboot_alloc: pmap_growkernel() failed");
}
#endif
virtual_space_start += size;
/*
* allocate and mapin physical pages to back new virtual pages
*/
for (vaddr = round_page(addr) ; vaddr < addr + size ;
vaddr += PAGE_SIZE) {
if (!uvm_page_physget(&paddr))
panic("uvm_pageboot_alloc: out of memory");
/*
* Note this memory is no longer managed, so using
* pmap_kenter is safe.
*/
pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
}
pmap_update(pmap_kernel());
return addr;
#endif /* PMAP_STEAL_MEMORY */
}
#if !defined(PMAP_STEAL_MEMORY)
/*
* uvm_page_physget: "steal" one page from the vm_physmem structure.
*
* => attempt to allocate it off the end of a segment in which the "avail"
* values match the start/end values. if we can't do that, then we
* will advance both values (making them equal, and removing some
* vm_page structures from the non-avail area).
* => return false if out of memory.
*/
/* subroutine: try to allocate from memory chunks on the specified freelist */
static bool uvm_page_physget_freelist(paddr_t *, int);
static bool
uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
{
uvm_physseg_t lcv;
/* pass 1: try allocating from a matching end */
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
#else
for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
#endif
{
if (uvm.page_init_done == true)
panic("uvm_page_physget: called _after_ bootstrap");
/* Try to match at front or back on unused segment */
if (uvm_page_physunload(lcv, freelist, paddrp))
return true;
}
/* pass2: forget about matching ends, just allocate something */
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
#else
for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
#endif
{
/* Try the front regardless. */
if (uvm_page_physunload_force(lcv, freelist, paddrp))
return true;
}
return false;
}
bool
uvm_page_physget(paddr_t *paddrp)
{
int i;
/* try in the order of freelist preference */
for (i = 0; i < VM_NFREELIST; i++)
if (uvm_page_physget_freelist(paddrp, i) == true)
return (true);
return (false);
}
#endif /* PMAP_STEAL_MEMORY */
paddr_t
uvm_vm_page_to_phys(const struct vm_page *pg)
{
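/*
 * Only the page-aligned part of phys_addr is the physical address proper;
 * the low-order bits are presumably reserved for per-page bookkeeping
 * (cf. the uvm_page_get_freelist()/uvm_page_get_bucket() accessors used
 * elsewhere in this file), so they are masked off here.
 */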
return pg->phys_addr & ~(PAGE_SIZE - 1);
}
/*
* uvm_page_numa_load: load NUMA range description.
*/
void
uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
{
struct uvm_page_numa_region *d;
KASSERT(numa_id < PGFL_MAX_BUCKETS);
d = kmem_alloc(sizeof(*d), KM_SLEEP);
d->start = start;
d->size = size;
d->numa_id = numa_id;
d->next = uvm_page_numa_region;
uvm_page_numa_region = d;
}
/*
* uvm_page_numa_lookup: lookup NUMA node for the given page.
*/
static u_int
uvm_page_numa_lookup(struct vm_page *pg)
{
struct uvm_page_numa_region *d;
static bool warned;
paddr_t pa;
KASSERT(uvm_page_numa_region != NULL);
pa = VM_PAGE_TO_PHYS(pg);
for (d = uvm_page_numa_region; d != NULL; d = d->next) {
if (pa >= d->start && pa < d->start + d->size) {
return d->numa_id;
}
}
if (!warned) {
printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#"
PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg));
warned = true;
}
return 0;
}
/*
* uvm_page_redim: adjust freelist dimensions if they have changed.
*/
static void
uvm_page_redim(int newncolors, int newnbuckets)
{
struct pgfreelist npgfl;
struct pgflbucket *opgb, *npgb;
struct pgflist *ohead, *nhead;
struct vm_page *pg;
size_t bucketsize, bucketmemsize, oldbucketmemsize;
int fl, ob, oc, nb, nc, obuckets, ocolors;
char *bucketarray, *oldbucketmem, *bucketmem;
KASSERT(((newncolors - 1) & newncolors) == 0);
/* Anything to do? */
if (newncolors <= uvmexp.ncolors &&
newnbuckets == uvm.bucketcount) {
return;
}
if (uvm.page_init_done == false) {
uvmexp.ncolors = newncolors;
return;
}
bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
bucketsize = roundup2(bucketsize, coherency_unit);
bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
coherency_unit - 1;
bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
ocolors = uvmexp.ncolors;
obuckets = uvm.bucketcount;
/* Freelist cache mustn't be enabled. */
uvm_pgflcache_pause();
/* Make sure we should still do this. */
uvm_pgfl_lock();
if (newncolors <= uvmexp.ncolors &&
newnbuckets == uvm.bucketcount) {
uvm_pgfl_unlock();
uvm_pgflcache_resume();
kmem_free(bucketmem, bucketmemsize);
return;
}
uvmexp.ncolors = newncolors;
uvmexp.colormask = uvmexp.ncolors - 1;
uvm.bucketcount = newnbuckets;
for (fl = 0; fl < VM_NFREELIST; fl++) {
/* Init new buckets in new freelist. */
memset(&npgfl, 0, sizeof(npgfl));
for (nb = 0; nb < newnbuckets; nb++) {
npgb = (struct pgflbucket *)bucketarray;
uvm_page_init_bucket(&npgfl, npgb, nb);
bucketarray += bucketsize;
}
/* Now transfer pages from the old freelist. */
for (nb = ob = 0; ob < obuckets; ob++) {
opgb = uvm.page_free[fl].pgfl_buckets[ob];
for (oc = 0; oc < ocolors; oc++) {
ohead = &opgb->pgb_colors[oc];
while ((pg = LIST_FIRST(ohead)) != NULL) {
LIST_REMOVE(pg, pageq.list);
/*
* Here we decide on the NEW color &
* bucket for the page. For NUMA
* we'll use the info that the
* hardware gave us. For non-NUMA
* we take the physical page frame
* number and cache color into
* account. We do this to try and
* avoid defeating any memory
* interleaving in the hardware.
*/
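/*
 * Concretely, the non-NUMA arithmetic below sends every run of
 * 8 * uvmexp.ncolors physically contiguous pages to one bucket
 * before moving on to the next bucket (modulo newnbuckets).
 */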
KASSERT(
uvm_page_get_bucket(pg) == ob);
KASSERT(fl ==
uvm_page_get_freelist(pg));
if (uvm_page_numa_region != NULL) {
nb = uvm_page_numa_lookup(pg);
} else {
nb = atop(VM_PAGE_TO_PHYS(pg))
/ uvmexp.ncolors / 8
% newnbuckets;
}
uvm_page_set_bucket(pg, nb);
npgb = npgfl.pgfl_buckets[nb];
npgb->pgb_nfree++;
nc = VM_PGCOLOR(pg);
nhead = &npgb->pgb_colors[nc];
LIST_INSERT_HEAD(nhead, pg, pageq.list);
}
}
}
/* Install the new freelist. */
memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
}
/* Unlock and free the old memory. */
oldbucketmemsize = recolored_pages_memsize;
oldbucketmem = recolored_pages_mem;
recolored_pages_memsize = bucketmemsize;
recolored_pages_mem = bucketmem;
uvm_pgfl_unlock();
uvm_pgflcache_resume();
if (oldbucketmemsize) {
kmem_free(oldbucketmem, oldbucketmemsize);
}
/*
* this calls uvm_km_alloc() which may want to hold
* uvm_freelist_lock.
*/
uvm_pager_realloc_emerg();
}
/*
* uvm_page_recolor: Recolor the pages if the new color count is
* larger than the old one.
*/
void
uvm_page_recolor(int newncolors)
{
uvm_page_redim(newncolors, uvm.bucketcount);
}
/*
* uvm_page_rebucket: Determine a bucket structure and redim the free
* lists to match.
*/
void
uvm_page_rebucket(void)
{
u_int min_numa, max_numa, npackage, shift;
struct cpu_info *ci, *ci2, *ci3;
CPU_INFO_ITERATOR cii;
/*
* If we have more than one NUMA node, and the maximum NUMA node ID
* is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
* for free pages.
*/
min_numa = (u_int)-1;
max_numa = 0;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_numa_id < min_numa) {
min_numa = ci->ci_numa_id;
}
if (ci->ci_numa_id > max_numa) {
max_numa = ci->ci_numa_id;
}
}
if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
aprint_debug("UVM: using NUMA allocation scheme\n");
for (CPU_INFO_FOREACH(cii, ci)) {
ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
}
uvm_page_redim(uvmexp.ncolors, max_numa + 1);
return;
}
/*
* Otherwise we'll go with a scheme to maximise L2/L3 cache locality
* and minimise lock contention. Count the total number of CPU
* packages, and then try to distribute the buckets among CPU
* packages evenly.
*/
npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST];
/*
* Figure out how to arrange the packages & buckets, and the total
* number of buckets we need. XXX 2 may not be the best factor.
*/
for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
npackage >>= 1;
}
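/*
 * npackage has now been halved "shift" times, so each bucket will end up
 * shared by 1 << shift CPU packages (matching the debug message printed
 * at the end of this function).
 */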
uvm_page_redim(uvmexp.ncolors, npackage);
/*
* Now tell each CPU which bucket to use. In the outer loop, scroll
* through all CPU packages.
*/
npackage = 0;
ci = curcpu();
ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST];
do {
/*
* In the inner loop, scroll through all CPUs in the package
* and assign the same bucket ID.
*/
ci3 = ci2;
do {
ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
} while (ci3 != ci2);
npackage++;
ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST];
} while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]);
aprint_debug("UVM: using package allocation scheme, "
"%d package(s) per bucket\n", 1 << shift);
}
/*
* uvm_cpu_attach: initialize per-CPU data structures.
*/
void
uvm_cpu_attach(struct cpu_info *ci)
{
struct uvm_cpu *ucpu;
/* Already done in uvm_page_init(). */
if (!CPU_IS_PRIMARY(ci)) {
/* Add more reserve pages for this CPU. */
uvmexp.reserve_kernel += vm_page_reserve_kernel;
/* Allocate per-CPU data structures. */
ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
KM_SLEEP);
ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
coherency_unit);
ci->ci_data.cpu_uvm = ucpu;
} else {
ucpu = ci->ci_data.cpu_uvm;
}
uvmpdpol_init_cpu(ucpu);
}
/*
* uvm_availmem: fetch the total amount of free memory in pages. this can
* have a detrimental effect on performance due to false sharing; don't call
* unless needed.
*
* some users can request the amount of free memory so often that it begins
* to impact upon performance. if calling frequently and an inexact value
* is okay, call with cached = true.
*/
int
uvm_availmem(bool cached)
{
int64_t fp;
cpu_count_sync(cached);
if ((fp = cpu_count_get(CPU_COUNT_FREEPAGES)) < 0) {
/*
* XXXAD could briefly go negative because it's impossible
* to get a clean snapshot. address this for other counters
* used as running totals before NetBSD 10 although less
* important for those.
*/
fp = 0;
}
return (int)fp;
}
/*
* uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
* specific freelist and specific bucket only.
*
* => must be at IPL_VM or higher to protect per-CPU data structures.
*/
static struct vm_page *
uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
{
int c, trycolor, colormask;
struct pgflbucket *pgb;
struct vm_page *pg;
kmutex_t *lock;
bool fill;
/*
* Skip the bucket if empty, no lock needed. There could be many
* empty freelists/buckets.
*/
pgb = uvm.page_free[f].pgfl_buckets[b];
if (pgb->pgb_nfree == 0) {
return NULL;
}
/* Skip bucket if low on memory. */
lock = &uvm_freelist_locks[b].lock;
mutex_spin_enter(lock);
if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
if ((flags & UVM_PGA_USERESERVE) == 0 ||
(pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
curlwp != uvm.pagedaemon_lwp)) {
mutex_spin_exit(lock);
return NULL;
}
fill = false;
} else {
fill = true;
}
/* Try all page colors as needed. */
c = trycolor = *trycolorp;
colormask = uvmexp.colormask;
do {
pg = LIST_FIRST(&pgb->pgb_colors[c]);
if (__predict_true(pg != NULL)) {
/*
* Got a free page! PG_FREE must be cleared under
* lock because of uvm_pglistalloc().
*/
LIST_REMOVE(pg, pageq.list);
KASSERT(pg->flags == PG_FREE);
pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
pgb->pgb_nfree--;
CPU_COUNT(CPU_COUNT_FREEPAGES, -1);
/*
* While we have the bucket locked and our data
* structures fresh in L1 cache, we have an ideal
* opportunity to grab some pages for the freelist
* cache without causing extra contention. Only do
* so if we found pages in this CPU's preferred
* bucket.
*/
if (__predict_true(b == ucpu->pgflbucket && fill)) {
uvm_pgflcache_fill(ucpu, f, b, c);
}
mutex_spin_exit(lock);
KASSERT(uvm_page_get_bucket(pg) == b);
CPU_COUNT(c == trycolor ?
CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
CPU_COUNT(CPU_COUNT_CPUMISS, 1);
*trycolorp = c;
return pg;
}
c = (c + 1) & colormask;
} while (c != trycolor);
mutex_spin_exit(lock);
return NULL;
}
/*
* uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
* any color from any bucket, in a specific freelist.
*
* => must be at IPL_VM or higher to protect per-CPU data structures.
*/
static struct vm_page *
uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
{
int b, trybucket, bucketcount;
struct vm_page *pg;
/* Try for the exact thing in the per-CPU cache. */
if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
CPU_COUNT(CPU_COUNT_CPUHIT, 1);
CPU_COUNT(CPU_COUNT_COLORHIT, 1);
return pg;
}
/* Walk through all buckets, trying our preferred bucket first. */
trybucket = ucpu->pgflbucket;
b = trybucket;
bucketcount = uvm.bucketcount;
do {
pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
if (pg != NULL) {
return pg;
}
b = (b + 1 == bucketcount ? 0 : b + 1);
} while (b != trybucket);
return NULL;
}
/*
* uvm_pagealloc_strat: allocate vm_page from a particular free list.
*
* => return null if no pages free
* => wake up pagedaemon if number of free pages drops below low water mark
* => if obj != NULL, obj must be locked (to put in obj's tree)
* => if anon != NULL, anon must be locked (to put in anon)
* => only one of obj or anon can be non-null
* => caller must activate/deactivate page if it is not wired.
* => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
* => policy decision: it is more important to pull a page off of the
* appropriate priority free list than it is to get a page from the
* correct bucket or color bin. This is because we live with the
* consequences of a bad free list decision for the entire
* lifetime of the page, e.g. if the page comes from memory that
* is slower to access.
*/
struct vm_page *
uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
int flags, int strat, int free_list)
{
int color, lcv, error, s;
struct uvm_cpu *ucpu;
struct vm_page *pg;
lwp_t *l;
KASSERT(obj == NULL || anon == NULL);
KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
KASSERT(off == trunc_page(off));
KASSERT(obj == NULL || rw_write_held(obj->vmobjlock));
KASSERT(anon == NULL || anon->an_lock == NULL ||
rw_write_held(anon->an_lock));
/*
* This implements a global round-robin page coloring
* algorithm.
*/
s = splvm();
ucpu = curcpu()->ci_data.cpu_uvm;
if (flags & UVM_FLAG_COLORMATCH) {
color = atop(off) & uvmexp.colormask;
} else {
color = ucpu->pgflcolor;
}
/*
* fail if any of these conditions is true:
* [1] there really are no free pages, or
* [2] only kernel "reserved" pages remain and
* reserved pages have not been requested.
* [3] only pagedaemon "reserved" pages remain and
* the requestor isn't the pagedaemon.
* we make kernel reserve pages available if called by a
* kernel thread.
*/
l = curlwp;
if (__predict_true(l != NULL) && (l->l_flag & LW_SYSTEM) != 0) {
flags |= UVM_PGA_USERESERVE;
}
again:
switch (strat) {
case UVM_PGA_STRAT_NORMAL:
/* Check freelists: descending priority (ascending id) order. */
for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
if (pg != NULL) {
goto gotit;
}
}
/* No pages free! Have pagedaemon free some memory. */
splx(s);
uvm_kick_pdaemon();
return NULL;
case UVM_PGA_STRAT_ONLY:
case UVM_PGA_STRAT_FALLBACK:
/* Attempt to allocate from the specified free list. */
KASSERT(free_list >= 0);
KASSERT(free_list < VM_NFREELIST);
pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
if (pg != NULL) {
goto gotit;
}
/* Fall back, if possible. */
if (strat == UVM_PGA_STRAT_FALLBACK) {
strat = UVM_PGA_STRAT_NORMAL;
goto again;
}
/* No pages free! Have pagedaemon free some memory. */
splx(s);
uvm_kick_pdaemon();
return NULL;
case UVM_PGA_STRAT_NUMA:
/*
* NUMA strategy (experimental): allocating from the correct
* bucket is more important than observing freelist
* priority. Look only to the current NUMA node; if that
* fails, we need to look to other NUMA nodes, so retry with
* the normal strategy.
*/
for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
pg = uvm_pgflcache_alloc(ucpu, lcv, color);
if (pg != NULL) {
CPU_COUNT(CPU_COUNT_CPUHIT, 1);
CPU_COUNT(CPU_COUNT_COLORHIT, 1);
goto gotit;
}
pg = uvm_pagealloc_pgb(ucpu, lcv,
ucpu->pgflbucket, &color, flags);
if (pg != NULL) {
goto gotit;
}
}
strat = UVM_PGA_STRAT_NORMAL;
goto again;
default:
panic("uvm_pagealloc_strat: bad strat %d", strat);
/* NOTREACHED */
}
gotit:
/*
* We now know which color we actually allocated from; set
* the next color accordingly.
*/
ucpu->pgflcolor = (color + 1) & uvmexp.colormask;
/*
* while still at IPL_VM, update allocation statistics.
*/
if (anon) {
CPU_COUNT(CPU_COUNT_ANONCLEAN, 1);
}
splx(s);
KASSERT(pg->flags == (PG_BUSY|PG_CLEAN|PG_FAKE));
/*
* assign the page to the object. as the page was free, we know
* that pg->uobject and pg->uanon are NULL. we only need to take
* the page's interlock if we are changing the values.
*/
if (anon != NULL || obj != NULL) {
mutex_enter(&pg->interlock);
}
pg->offset = off;
pg->uobject = obj;
pg->uanon = anon;
KASSERT(uvm_page_owner_locked_p(pg, true));
if (anon) {
anon->an_page = pg;
pg->flags |= PG_ANON;
mutex_exit(&pg->interlock);
} else if (obj) {
/*
* set PG_FILE|PG_AOBJ before the first uvm_pageinsert.
*/
if (UVM_OBJ_IS_VNODE(obj)) {
pg->flags |= PG_FILE;
} else if (UVM_OBJ_IS_AOBJ(obj)) {
pg->flags |= PG_AOBJ;
}
uvm_pageinsert_object(obj, pg);
mutex_exit(&pg->interlock);
error = uvm_pageinsert_tree(obj, pg);
if (error != 0) {
mutex_enter(&pg->interlock);
uvm_pageremove_object(obj, pg);
mutex_exit(&pg->interlock);
uvm_pagefree(pg);
return NULL;
}
}
#if defined(UVM_PAGE_TRKOWN)
pg->owner_tag = NULL;
#endif
UVM_PAGE_OWN(pg, "new alloc");
if (flags & UVM_PGA_ZERO) {
/* A zero'd page is not clean. */
if (obj != NULL || anon != NULL) {
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
}
pmap_zero_page(VM_PAGE_TO_PHYS(pg));
}
return(pg);
}
/*
* uvm_pagereplace: replace a page with another
*
* => object must be locked
* => page interlocks must be held
*/
void
uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
{
struct uvm_object *uobj = oldpg->uobject;
struct vm_page *pg __diagused;
uint64_t idx;
KASSERT((oldpg->flags & PG_TABLED) != 0);
KASSERT(uobj != NULL);
KASSERT((newpg->flags & PG_TABLED) == 0);
KASSERT(newpg->uobject == NULL);
KASSERT(rw_write_held(uobj->vmobjlock));
KASSERT(mutex_owned(&oldpg->interlock));
KASSERT(mutex_owned(&newpg->interlock));
newpg->uobject = uobj;
newpg->offset = oldpg->offset;
idx = newpg->offset >> PAGE_SHIFT;
pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg);
KASSERT(pg == oldpg);
if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) {
if ((newpg->flags & PG_CLEAN) != 0) {
uvm_obj_page_clear_dirty(newpg);
} else {
uvm_obj_page_set_dirty(newpg);
}
}
/*
* oldpg's PG_STAT is stable. newpg is not reachable by others yet.
*/
newpg->flags |=
(newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT);
uvm_pageinsert_object(uobj, newpg);
uvm_pageremove_object(uobj, oldpg);
}
/*
* uvm_pagerealloc: reallocate a page from one object to another
*
* => both objects must be locked
*/
int
uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
{
int error = 0;
/*
* remove it from the old object
*/
if (pg->uobject) {
uvm_pageremove_tree(pg->uobject, pg);
uvm_pageremove_object(pg->uobject, pg);
}
/*
* put it in the new object
*/
if (newobj) {
mutex_enter(&pg->interlock);
pg->uobject = newobj;
pg->offset = newoff;
if (UVM_OBJ_IS_VNODE(newobj)) {
pg->flags |= PG_FILE;
} else if (UVM_OBJ_IS_AOBJ(newobj)) {
pg->flags |= PG_AOBJ;
}
uvm_pageinsert_object(newobj, pg);
mutex_exit(&pg->interlock);
error = uvm_pageinsert_tree(newobj, pg);
if (error != 0) {
mutex_enter(&pg->interlock);
uvm_pageremove_object(newobj, pg);
mutex_exit(&pg->interlock);
}
}
return error;
}
/*
* uvm_pagefree: free page
*
* => erase page's identity (i.e. remove from object)
* => put page on free list
* => caller must lock owning object (either anon or uvm_object)
* => assumes all valid mappings of pg are gone
*/
void
uvm_pagefree(struct vm_page *pg)
{
struct pgfreelist *pgfl;
struct pgflbucket *pgb;
struct uvm_cpu *ucpu;
kmutex_t *lock;
int bucket, s;
bool locked;
#ifdef DEBUG
if (pg->uobject == (void *)0xdeadbeef &&
pg->uanon == (void *)0xdeadbeef) {
panic("uvm_pagefree: freeing free page %p", pg);
}
#endif /* DEBUG */
KASSERT((pg->flags & PG_PAGEOUT) == 0);
KASSERT(!(pg->flags & PG_FREE));
KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock));
KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
rw_write_held(pg->uanon->an_lock));
/*
* remove the page from the object's tree before acquiring any page
* interlocks: this can acquire locks to free radixtree nodes.
*/
if (pg->uobject != NULL) {
uvm_pageremove_tree(pg->uobject, pg);
}
/*
* if the page is loaned, resolve the loan instead of freeing.
*/
if (pg->loan_count) {
KASSERT(pg->wire_count == 0);
/*
* if the page is owned by an anon then we just want to
* drop anon ownership. the kernel will free the page when
* it is done with it. if the page is owned by an object,
* remove it from the object and mark it dirty for the benefit
* of possible anon owners.
*
* regardless of previous ownership, wakeup any waiters,
* unbusy the page, and we're done.
*/
uvm_pagelock(pg);
locked = true;
if (pg->uobject != NULL) {
uvm_pageremove_object(pg->uobject, pg);
pg->flags &= ~(PG_FILE|PG_AOBJ);
} else if (pg->uanon != NULL) {
if ((pg->flags & PG_ANON) == 0) {
pg->loan_count--;
} else {
const unsigned status = uvm_pagegetdirty(pg);
pg->flags &= ~PG_ANON;
cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
}
pg->uanon->an_page = NULL;
pg->uanon = NULL;
}
if (pg->pqflags & PQ_WANTED) {
wakeup(pg);
}
pg->pqflags &= ~PQ_WANTED;
pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1);
#ifdef UVM_PAGE_TRKOWN
pg->owner_tag = NULL;
#endif
KASSERT((pg->flags & PG_STAT) == 0);
if (pg->loan_count) {
KASSERT(pg->uobject == NULL);
if (pg->uanon == NULL) {
uvm_pagedequeue(pg);
}
uvm_pageunlock(pg);
return;
}
} else if (pg->uobject != NULL || pg->uanon != NULL ||
pg->wire_count != 0) {
uvm_pagelock(pg);
locked = true;
} else {
locked = false;
}
/*
* remove page from its object or anon.
*/
if (pg->uobject != NULL) {
uvm_pageremove_object(pg->uobject, pg);
} else if (pg->uanon != NULL) {
const unsigned int status = uvm_pagegetdirty(pg);
pg->uanon->an_page = NULL;
pg->uanon = NULL;
cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
}
/*
* if the page was wired, unwire it now.
*/
if (pg->wire_count) {
pg->wire_count = 0;
atomic_dec_uint(&uvmexp.wired);
}
if (locked) {
/*
* wake anyone waiting on the page.
*/
if ((pg->pqflags & PQ_WANTED) != 0) {
pg->pqflags &= ~PQ_WANTED;
wakeup(pg);
}
/*
* now remove the page from the queues.
*/
uvm_pagedequeue(pg);
uvm_pageunlock(pg);
} else {
KASSERT(!uvmpdpol_pageisqueued_p(pg));
}
/*
* and put on free queue
*/
#ifdef DEBUG
pg->uobject = (void *)0xdeadbeef;
pg->uanon = (void *)0xdeadbeef;
#endif /* DEBUG */
/* Try to send the page to the per-CPU cache. */
s = splvm();
ucpu = curcpu()->ci_data.cpu_uvm;
bucket = uvm_page_get_bucket(pg);
if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
splx(s);
return;
}
/* Didn't work. Never mind, send it to a global bucket. */
pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
pgb = pgfl->pgfl_buckets[bucket];
lock = &uvm_freelist_locks[bucket].lock;
mutex_spin_enter(lock);
/* PG_FREE must be set under lock because of uvm_pglistalloc(). */
pg->flags = PG_FREE;
LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
pgb->pgb_nfree++;
CPU_COUNT(CPU_COUNT_FREEPAGES, 1);
mutex_spin_exit(lock);
splx(s);
}
/*
* uvm_page_unbusy: unbusy an array of pages.
*
* => pages must either all belong to the same object, or all belong to anons.
* => if pages are object-owned, object must be locked.
* => if pages are anon-owned, anons must be locked.
* => caller must make sure that anon-owned pages are not PG_RELEASED.
*/
void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
struct vm_page *pg;
int i, pageout_done;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
pageout_done = 0;
for (i = 0; i < npgs; i++) {
pg = pgs[i];
if (pg == NULL || pg == PGO_DONTCARE) {
continue;
}
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(pg->flags & PG_BUSY);
if (pg->flags & PG_PAGEOUT) {
pg->flags &= ~PG_PAGEOUT;
pg->flags |= PG_RELEASED;
pageout_done++;
atomic_inc_uint(&uvmexp.pdfreed);
}
if (pg->flags & PG_RELEASED) {
UVMHIST_LOG(ubchist, "releasing pg %#jx",
(uintptr_t)pg, 0, 0, 0);
KASSERT(pg->uobject != NULL ||
(pg->uanon != NULL && pg->uanon->an_ref > 0));
pg->flags &= ~PG_RELEASED;
uvm_pagefree(pg);
} else {
UVMHIST_LOG(ubchist, "unbusying pg %#jx",
(uintptr_t)pg, 0, 0, 0);
KASSERT((pg->flags & PG_FAKE) == 0);
pg->flags &= ~PG_BUSY;
uvm_pagelock(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
UVM_PAGE_OWN(pg, NULL);
}
}
if (pageout_done != 0) {
uvm_pageout_done(pageout_done);
}
}
/*
* uvm_pagewait: wait for a busy page
*
* => page must be known PG_BUSY
* => object must be read or write locked
* => object will be unlocked on return
*/
void
uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
{
KASSERT(rw_lock_held(lock));
KASSERT((pg->flags & PG_BUSY) != 0);
KASSERT(uvm_page_owner_locked_p(pg, false));
mutex_enter(&pg->interlock);
pg->pqflags |= PQ_WANTED;
rw_exit(lock);
UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
}
/*
* uvm_pagewakeup: wake anyone waiting on a page
*
* => page interlock must be held
*/
void
uvm_pagewakeup(struct vm_page *pg)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
KASSERT(mutex_owned(&pg->interlock));
UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0);
if ((pg->pqflags & PQ_WANTED) != 0) {
wakeup(pg);
pg->pqflags &= ~PQ_WANTED;
}
}
/*
* uvm_pagewanted_p: return true if someone is waiting on the page
*
* => object must be write locked (lock out all concurrent access)
*/
bool
uvm_pagewanted_p(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, true));
return (atomic_load_relaxed(&pg->pqflags) & PQ_WANTED) != 0;
}
#if defined(UVM_PAGE_TRKOWN)
/*
* uvm_page_own: set or release page ownership
*
* => this is a debugging function that keeps track of who sets PG_BUSY
* and where they do it. it can be used to track down problems
* such as a process setting "PG_BUSY" and never releasing it.
* => page's object [if any] must be locked
* => if "tag" is NULL then we are releasing page ownership
*/
void
uvm_page_own(struct vm_page *pg, const char *tag)
{
KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
KASSERT(uvm_page_owner_locked_p(pg, true));
/* gain ownership? */
if (tag) {
KASSERT((pg->flags & PG_BUSY) != 0);
if (pg->owner_tag) {
printf("uvm_page_own: page %p already owned "
"by proc %d.%d [%s]\n", pg,
pg->owner, pg->lowner, pg->owner_tag);
panic("uvm_page_own");
}
pg->owner = curproc->p_pid;
pg->lowner = curlwp->l_lid;
pg->owner_tag = tag;
return;
}
/* drop ownership */
KASSERT((pg->flags & PG_BUSY) == 0);
if (pg->owner_tag == NULL) {
printf("uvm_page_own: dropping ownership of an non-owned "
"page (%p)\n", pg);
panic("uvm_page_own");
}
pg->owner_tag = NULL;
}
#endif
/*
* uvm_pagelookup: look up a page
*
* => caller should lock object to keep someone from pulling the page
* out from under it
*/
struct vm_page *
uvm_pagelookup(struct uvm_object *obj, voff_t off)
{
struct vm_page *pg;
KASSERT(db_active || rw_lock_held(obj->vmobjlock));
pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
KASSERT(pg == NULL || obj->uo_npages != 0);
KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
(pg->flags & PG_BUSY) != 0);
return pg;
}
/*
* uvm_pagewire: wire the page, thus removing it from the daemon's grasp
*
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pagewire(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(mutex_owned(&pg->interlock));
#if defined(READAHEAD_STATS)
if ((pg->flags & PG_READAHEAD) != 0) {
uvm_ra_hit.ev_count++;
pg->flags &= ~PG_READAHEAD;
}
#endif /* defined(READAHEAD_STATS) */
if (pg->wire_count == 0) {
uvm_pagedequeue(pg);
atomic_inc_uint(&uvmexp.wired);
}
pg->wire_count++;
KASSERT(pg->wire_count > 0); /* detect wraparound */
}
/*
* uvm_pageunwire: unwire the page.
*
* => activate if wire count goes to zero.
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pageunwire(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(pg->wire_count != 0);
KASSERT(!uvmpdpol_pageisqueued_p(pg));
KASSERT(mutex_owned(&pg->interlock));
pg->wire_count--;
if (pg->wire_count == 0) {
uvm_pageactivate(pg);
KASSERT(uvmexp.wired != 0);
atomic_dec_uint(&uvmexp.wired);
}
}
/*
* uvm_pagedeactivate: deactivate page
*
* => caller must lock objects
* => caller must check to make sure page is not wired
* => object that page belongs to must be locked (so we can adjust pg->flags)
* => caller must clear the reference on the page before calling
* => caller must hold pg->interlock
*/
void
uvm_pagedeactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
if (pg->wire_count == 0) {
KASSERT(uvmpdpol_pageisqueued_p(pg));
uvmpdpol_pagedeactivate(pg);
}
}
/*
* uvm_pageactivate: activate page
*
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pageactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
#if defined(READAHEAD_STATS)
if ((pg->flags & PG_READAHEAD) != 0) {
uvm_ra_hit.ev_count++;
pg->flags &= ~PG_READAHEAD;
}
#endif /* defined(READAHEAD_STATS) */
if (pg->wire_count == 0) {
uvmpdpol_pageactivate(pg);
}
}
/*
* uvm_pagedequeue: remove a page from any paging queue
*
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pagedequeue(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(mutex_owned(&pg->interlock));
if (uvmpdpol_pageisqueued_p(pg)) {
uvmpdpol_pagedequeue(pg);
}
}
/*
* uvm_pageenqueue: add a page to a paging queue without activating.
* used where a page is not really demanded (yet), e.g. read-ahead
*
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pageenqueue(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
uvmpdpol_pageenqueue(pg);
}
}
/*
* uvm_pagelock: acquire page interlock
*/
void
uvm_pagelock(struct vm_page *pg)
{
mutex_enter(&pg->interlock);
}
/*
* uvm_pagelock2: acquire two page interlocks
*/
void
uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
{
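/*
 * Take the two interlocks in a fixed (address) order so that two threads
 * locking the same pair cannot deadlock against each other.
 */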
if (pg1 < pg2) {
mutex_enter(&pg1->interlock);
mutex_enter(&pg2->interlock);
} else {
mutex_enter(&pg2->interlock);
mutex_enter(&pg1->interlock);
}
}
/*
* uvm_pageunlock: release page interlock, and if a page replacement intent
* is set on the page, pass it to uvmpdpol to make real.
*
* => caller must hold pg->interlock
*/
void
uvm_pageunlock(struct vm_page *pg)
{
if ((pg->pqflags & PQ_INTENT_SET) == 0 ||
(pg->pqflags & PQ_INTENT_QUEUED) != 0) {
mutex_exit(&pg->interlock);
return;
}
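/*
 * PQ_INTENT_QUEUED is set while the interlock is still held, so only one
 * of several concurrent unlockers hands the page to uvmpdpol_pagerealize().
 */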
pg->pqflags |= PQ_INTENT_QUEUED;
mutex_exit(&pg->interlock);
uvmpdpol_pagerealize(pg);
}
/*
* uvm_pageunlock2: release two page interlocks, and for both pages if a
* page replacement intent is set on the page, pass it to uvmpdpol to make
* real.
*
* => caller must hold pg->interlock
*/
void
uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
{
if ((pg1->pqflags & PQ_INTENT_SET) == 0 ||
(pg1->pqflags & PQ_INTENT_QUEUED) != 0) {
mutex_exit(&pg1->interlock);
pg1 = NULL;
} else {
pg1->pqflags |= PQ_INTENT_QUEUED;
mutex_exit(&pg1->interlock);
}
if ((pg2->pqflags & PQ_INTENT_SET) == 0 ||
(pg2->pqflags & PQ_INTENT_QUEUED) != 0) {
mutex_exit(&pg2->interlock);
pg2 = NULL;
} else {
pg2->pqflags |= PQ_INTENT_QUEUED;
mutex_exit(&pg2->interlock);
}
if (pg1 != NULL) {
uvmpdpol_pagerealize(pg1);
}
if (pg2 != NULL) {
uvmpdpol_pagerealize(pg2);
}
}
/*
* uvm_pagezero: zero fill a page
*
* => if page is part of an object then the object should be locked
* to protect pg->flags.
*/
void
uvm_pagezero(struct vm_page *pg)
{
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
pmap_zero_page(VM_PAGE_TO_PHYS(pg));
}
/*
* uvm_pagecopy: copy a page
*
* => if page is part of an object then the object should be locked
* to protect pg->flags.
*/
void
uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
{
uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
}
/*
* uvm_pageismanaged: test whether a page (specified by PA) is managed.
*/
bool
uvm_pageismanaged(paddr_t pa)
{
return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
}
/*
* uvm_page_lookup_freelist: look up the free list for the specified page
*/
int
uvm_page_lookup_freelist(struct vm_page *pg)
{
uvm_physseg_t upm;
upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
return uvm_physseg_get_free_list(upm);
}
/*
* uvm_page_owner_locked_p: return true if object associated with page is
* locked. this is a weak check for runtime assertions only.
*/
bool
uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
{
if (pg->uobject != NULL) {
return exclusive
? rw_write_held(pg->uobject->vmobjlock) : rw_lock_held(pg->uobject->vmobjlock);
}
if (pg->uanon != NULL) {
return exclusive
? rw_write_held(pg->uanon->an_lock) : rw_lock_held(pg->uanon->an_lock);
}
return true;
}
/*
* uvm_pagereadonly_p: return true if the page should be mapped read-only
*/
bool
uvm_pagereadonly_p(struct vm_page *pg)
{
struct uvm_object * const uobj = pg->uobject;
KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));
if ((pg->flags & PG_RDONLY) != 0) {
return true;
}
if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
return true;
}
if (uobj == NULL) {
return false;
}
return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
}
#ifdef PMAP_DIRECT
/*
* Call pmap to translate a physical address into a virtual one and run a
* callback on it. This avoids actually mapping the pages; the pmap most
* likely uses a direct map or equivalent.
*/
int
uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
int (*process)(void *, size_t, void *), void *arg)
{
int error = 0;
paddr_t pa;
size_t todo;
voff_t pgoff = (off & PAGE_MASK);
struct vm_page *pg;
KASSERT(npages > 0);
KASSERT(len > 0);
for (int i = 0; i < npages; i++) {
pg = pgs[i];
KASSERT(len > 0);
/*
* Caller is responsible for ensuring all the pages are
* available.
*/
KASSERT(pg != NULL);
KASSERT(pg != PGO_DONTCARE);
pa = VM_PAGE_TO_PHYS(pg);
todo = MIN(len, PAGE_SIZE - pgoff);
error = pmap_direct_process(pa, pgoff, todo, process, arg);
if (error)
break;
pgoff = 0;
len -= todo;
}
KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
return error;
}
#endif /* PMAP_DIRECT */
#if defined(DDB) || defined(DEBUGPRINT)
/*
* uvm_page_printit: actually print the page
*/
static const char page_flagbits[] = UVM_PGFLAGBITS;
static const char page_pqflagbits[] = UVM_PQFLAGBITS;
void
uvm_page_printit(struct vm_page *pg, bool full,
void (*pr)(const char *, ...))
{
struct vm_page *tpg;
struct uvm_object *uobj;
struct pgflbucket *pgb;
struct pgflist *pgl;
char pgbuf[128];
(*pr)("PAGE %p:\n", pg);
snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
(*pr)(" flags=%s\n", pgbuf);
snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
(*pr)(" pqflags=%s\n", pgbuf);
(*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n",
pg->uobject, pg->uanon, (long long)pg->offset);
(*pr)(" loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
uvm_page_get_freelist(pg));
(*pr)(" pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
#if defined(UVM_PAGE_TRKOWN)
if (pg->flags & PG_BUSY)
(*pr)(" owning process = %d.%d, tag=%s\n",
pg->owner, pg->lowner, pg->owner_tag);
else
(*pr)(" page not busy, no owner\n");
#else
(*pr)(" [page ownership tracking disabled]\n");
#endif
if (!full)
return;
/* cross-verify object/anon */
if ((pg->flags & PG_FREE) == 0) {
if (pg->flags & PG_ANON) {
if (pg->uanon == NULL || pg->uanon->an_page != pg)
(*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
(pg->uanon) ? pg->uanon->an_page : NULL);
else
(*pr)(" anon backpointer is OK\n");
} else {
uobj = pg->uobject;
if (uobj) {
(*pr)(" checking object list\n");
tpg = uvm_pagelookup(uobj, pg->offset);
if (tpg)
(*pr)(" page found on object list\n");
else
(*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
}
}
}
/* cross-verify page queue */
if (pg->flags & PG_FREE) {
int fl = uvm_page_get_freelist(pg);
int b = uvm_page_get_bucket(pg);
pgb = uvm.page_free[fl].pgfl_buckets[b];
pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
(*pr)(" checking pageq list\n");
LIST_FOREACH(tpg, pgl, pageq.list) {
if (tpg == pg) {
break;
}
}
if (tpg)
(*pr)(" page found on pageq list\n");
else
(*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
}
}
/*
* uvm_page_printall - print a summary of all managed pages
*/
void
uvm_page_printall(void (*pr)(const char *, ...))
{
uvm_physseg_t i;
paddr_t pfn;
struct vm_page *pg;
(*pr)("%18s %4s %4s %18s %18s"
#ifdef UVM_PAGE_TRKOWN
" OWNER"
#endif
"\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
for (i = uvm_physseg_get_first();
uvm_physseg_valid_p(i);
i = uvm_physseg_get_next(i)) {
for (pfn = uvm_physseg_get_start(i);
pfn < uvm_physseg_get_end(i);
pfn++) {
pg = PHYS_TO_VM_PAGE(ptoa(pfn));
(*pr)("%18p %04x %08x %18p %18p",
pg, pg->flags, pg->pqflags, pg->uobject,
pg->uanon);
#ifdef UVM_PAGE_TRKOWN
if (pg->flags & PG_BUSY)
(*pr)(" %d [%s]", pg->owner, pg->owner_tag);
#endif
(*pr)("\n");
}
}
}
/*
* uvm_page_print_freelists - print a summary of the freelists
*/
void
uvm_page_print_freelists(void (*pr)(const char *, ...))
{
struct pgfreelist *pgfl;
struct pgflbucket *pgb;
int fl, b, c;
(*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);
for (fl = 0; fl < VM_NFREELIST; fl++) {
pgfl = &uvm.page_free[fl];
(*pr)("freelist(%d) @ %p\n", fl, pgfl);
for (b = 0; b < uvm.bucketcount; b++) {
pgb = uvm.page_free[fl].pgfl_buckets[b];
(*pr)(" bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
b, pgb, pgb->pgb_nfree,
&uvm_freelist_locks[b].lock);
for (c = 0; c < uvmexp.ncolors; c++) {
(*pr)(" color(%d) @ %p, ", c,
&pgb->pgb_colors[c]);
(*pr)("first page = %p\n",
LIST_FIRST(&pgb->pgb_colors[c]));
}
}
}
}
#endif /* DDB || DEBUGPRINT */
/* $NetBSD: uvm_user.c,v 1.14 2011/02/02 15:13:34 chuck Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_user.c,v 1.1.2.1 1997/08/14 19:10:41 chuck Exp
*/
/*
* uvm_user.c: high level uvm_allocate/uvm_deallocate interface into vm.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_user.c,v 1.14 2011/02/02 15:13:34 chuck Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <uvm/uvm.h>
/*
* uvm_deallocate: deallocate memory (unmap)
*/
void
uvm_deallocate(struct vm_map *map, vaddr_t start, vsize_t size)
{
if (size == 0)
return;
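/*
 * Round outward to page boundaries so the whole of [start, start + size)
 * is covered by the unmap.
 */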
uvm_unmap(map, trunc_page(start), round_page(start + size));
}
/* $NetBSD: tty.c,v 1.312 2023/12/07 09:00:32 pgoyette Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tty.c 8.13 (Berkeley) 1/9/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tty.c,v 1.312 2023/12/07 09:00:32 pgoyette Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#endif
#define TTY_ALLOW_PRIVATE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ioctl.h>
#include <sys/proc.h>
#define TTYDEFCHARS
#include <sys/tty.h>
#undef TTYDEFCHARS
#include <sys/file.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/dkstat.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/syslog.h>
#include <sys/kmem.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/poll.h>
#include <sys/kprintf.h>
#include <sys/namei.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/intr.h>
#include <sys/ioctl_compat.h>
#include <sys/module.h>
#include <sys/bitops.h>
#include <sys/compat_stub.h>
#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/pserialize.h>
static int ttnread(struct tty *);
static void ttyblock(struct tty *);
static void ttyecho(int, struct tty *);
static void ttyrubo(struct tty *, int);
static void ttyprintf_nolock(struct tty *, const char *fmt, ...)
__printflike(2, 3);
static int proc_compare_wrapper(struct proc *, struct proc *);
static void ttysigintr(void *);
/* Symbolic sleep message strings. */
const char ttclos[] = "ttycls";
const char ttopen[] = "ttyopn";
const char ttybg[] = "ttybg";
const char ttyin[] = "ttyin";
const char ttyout[] = "ttyout";
/*
* Used to determine whether we still have a connection. This is true in
* one of 3 cases:
* 1) We have carrier.
* 2) It's a locally attached terminal, and we are therefore ignoring carrier.
* 3) We're using a flow control mechanism that overloads the carrier signal.
*/
#define CONNECTED(tp) (ISSET(tp->t_state, TS_CARR_ON) || \
ISSET(tp->t_cflag, CLOCAL | MDMBUF))
/*
* Table with character classes and parity. The 8th bit indicates parity,
* the 7th bit indicates the character is an alphameric or underscore (for
* ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits
* are 0 then the character needs no special processing on output; classes
* other than 0 might be translated or (not currently) require delays.
*/
#define E 0x00 /* Even parity. */
#define O 0x80 /* Odd parity. */
#define PARITY(c) (char_type[c] & O)
#define ALPHA 0x40 /* Alpha or underscore. */
#define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA)
#define CCLASSMASK 0x3f
#define CCLASS(c) (char_type[c] & CCLASSMASK)
#define BS BACKSPACE
#define CC CONTROL
#define CR RETURN
#define NA ORDINARY | ALPHA
#define NL NEWLINE
#define NO ORDINARY
#define TB TAB
#define VT VTAB
unsigned char const char_type[] = {
E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */
O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */
O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */
E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */
O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */
E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */
E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */
O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? */
O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */
E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */
E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */
O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */
E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */
O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */
O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */
E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */
/*
* Meta chars; should be settable per character set;
* for now, treat them all as normal characters.
*/
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
};
#undef BS
#undef CC
#undef CR
#undef NA
#undef NL
#undef NO
#undef TB
#undef VT
static struct ttylist_head tty_sigqueue = TAILQ_HEAD_INITIALIZER(tty_sigqueue);
static void *tty_sigsih;
struct ttylist_head ttylist = TAILQ_HEAD_INITIALIZER(ttylist);
int tty_count;
kmutex_t tty_lock;
kmutex_t constty_lock;
static struct pserialize *constty_psz;
static kcondvar_t ttyref_cv;
struct ptm_pty *ptm = NULL;
uint64_t tk_cancc;
uint64_t tk_nin;
uint64_t tk_nout;
uint64_t tk_rawcc;
static kauth_listener_t tty_listener;
#define TTY_MINQSIZE 0x00400
#define TTY_MAXQSIZE 0x10000
int tty_qsize = TTY_MINQSIZE;
static int
tty_get_qsize(int *qsize, int newsize)
{
if (newsize <= 0)
return EINVAL;
newsize = 1 << ilog2(newsize); /* Make it a power of two */
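/* ilog2() truncates, so this rounds down to the nearest power of two. */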
if (newsize < TTY_MINQSIZE || newsize > TTY_MAXQSIZE)
return EINVAL;
*qsize = newsize;
return 0;
}
static int
tty_set_qsize(struct tty *tp, int newsize)
{
struct clist rawq, canq, outq;
struct clist orawq, ocanq, ooutq;
clalloc(&rawq, newsize, 1);
clalloc(&canq, newsize, 1);
clalloc(&outq, newsize, 0);
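/*
 * The replacement clists are allocated up front, before tty_lock (a spin
 * lock) is taken; the swap itself is then quick, and the old clists are
 * freed only after the lock has been dropped.
 */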
mutex_spin_enter(&tty_lock);
if (tp->t_outq.c_cc != 0) {
mutex_spin_exit(&tty_lock);
clfree(&rawq);
clfree(&canq);
clfree(&outq);
return EBUSY;
}
orawq = tp->t_rawq;
ocanq = tp->t_canq;
ooutq = tp->t_outq;
tp->t_qsize = newsize;
tp->t_rawq = rawq;
tp->t_canq = canq;
tp->t_outq = outq;
ttsetwater(tp);
mutex_spin_exit(&tty_lock);
clfree(&orawq);
clfree(&ocanq);
clfree(&ooutq);
return 0;
}
static int
sysctl_kern_tty_qsize(SYSCTLFN_ARGS)
{
int newsize;
int error;
struct sysctlnode node;
node = *rnode;
node.sysctl_data = &newsize;
newsize = tty_qsize;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
return tty_get_qsize(&tty_qsize, newsize);
}
static void
sysctl_kern_tty_setup(void)
{
const struct sysctlnode *rnode, *cnode;
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "tkstat",
SYSCTL_DESCR("Number of characters sent and received "
"on ttys"),
NULL, 0, NULL, 0,
CTL_KERN, KERN_TKSTAT, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "nin",
SYSCTL_DESCR("Total number of tty input characters"),
NULL, 0, &tk_nin, 0,
CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_NIN, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "nout",
SYSCTL_DESCR("Total number of tty output characters"),
NULL, 0, &tk_nout, 0,
CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_NOUT, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "cancc",
SYSCTL_DESCR("Number of canonical tty input characters"),
NULL, 0, &tk_cancc, 0,
CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_CANCC, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "rawcc",
SYSCTL_DESCR("Number of raw tty input characters"),
NULL, 0, &tk_rawcc, 0,
CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_RAWCC, CTL_EOL);
sysctl_createv(NULL, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "tty", NULL,
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
sysctl_createv(NULL, 0, &rnode, &cnode,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "qsize",
SYSCTL_DESCR("TTY input and output queue size"),
sysctl_kern_tty_qsize, 0, &tty_qsize, 0,
CTL_CREATE, CTL_EOL);
}
/*
* ttylock(tp), ttyunlock(tp), ttylocked(tp)
*
* Exclusive lock on tty. Currently a single global lock.
*
* ttylocked is for positive DIAGNOSTIC assertions only.
*/
void
ttylock(struct tty *tp)
{
mutex_spin_enter(&tty_lock);
}
void
ttyunlock(struct tty *tp)
{
mutex_spin_exit(&tty_lock);
}
bool
ttylocked(struct tty *tp)
{
return mutex_owned(&tty_lock);
}
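/*
 * ttyopen:
 *
 *	Common open-time checks shared by tty drivers.  A dialout open
 *	marks the tty TS_DIALOUT and fails if the device is already open
 *	for non-dialout use; a blocking non-dialout open waits for
 *	carrier and for any dialout user to finish first.
 */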
int
ttyopen(struct tty *tp, int dialout, int nonblock)
{
int error;
error = 0;
mutex_spin_enter(&tty_lock);
if (dialout) {
/*
* If the device is already open for non-dialout, fail.
* Otherwise, set TS_DIALOUT to block any pending non-dialout
* opens.
*/
if (ISSET(tp->t_state, TS_ISOPEN) &&
!ISSET(tp->t_state, TS_DIALOUT)) {
error = EBUSY;
goto out;
}
SET(tp->t_state, TS_DIALOUT);
} else {
if (!nonblock) {
/*
* Wait for carrier. Also wait for any dialout
* processes to close the tty first.
*/
while (ISSET(tp->t_state, TS_DIALOUT) || !CONNECTED(tp)) {
tp->t_wopen++;
error = ttysleep(tp, &tp->t_rawcv, true, 0);
tp->t_wopen--;
if (error)
goto out;
}
} else {
/*
* Don't allow a non-blocking non-dialout open if the
* device is already open for dialout.
*/
if (ISSET(tp->t_state, TS_DIALOUT)) {
error = EBUSY;
goto out;
}
}
}
out:
mutex_spin_exit(&tty_lock);
return (error);
}
/*
* Initial open of tty, or (re)entry to standard tty line discipline.
*/
int
ttylopen(dev_t device, struct tty *tp)
{
mutex_spin_enter(&tty_lock);
tp->t_dev = device;
	if (!ISSET(tp->t_state, TS_ISOPEN)) {
		SET(tp->t_state, TS_ISOPEN);
memset(&tp->t_winsize, 0, sizeof(tp->t_winsize));
tp->t_flags = 0;
}
mutex_spin_exit(&tty_lock);
	if (tp->t_qsize != tty_qsize)
		tty_set_qsize(tp, tty_qsize);
return (0);
}
/*
* Interrupt any pending I/O and make it fail. Used before close to
* interrupt pending open/read/write/&c. and make it fail promptly.
*/
void
ttycancel(struct tty *tp)
{
mutex_spin_enter(&tty_lock);
tp->t_state |= TS_CANCEL;
cv_broadcast(&tp->t_outcv);
cv_broadcast(&tp->t_rawcv);
mutex_spin_exit(&tty_lock);
}
/*
* Handle close() on a tty line: flush and set to initial state,
* bumping generation number so that pending read/write calls
* can detect recycling of the tty.
*/
int
ttyclose(struct tty *tp)
{
struct session *sess;
/*
* Make sure this is not the constty. Without constty_lock it
* is always allowed to transition from nonnull to null.
*/
(void)atomic_cas_ptr(&constty, tp, NULL);
/*
* We don't know if this has _ever_ been the constty: another
* thread may have kicked it out as constty before we started
* to close.
*
* So we wait for all users that might be acquiring references
* to finish doing so -- after that, no more references can be
* made, at which point we can safely flush the tty, wait for
* the existing references to drain, and finally free or reuse
* the tty.
*/
pserialize_perform(constty_psz);
mutex_spin_enter(&tty_lock);
ttyflush(tp, FREAD | FWRITE);
tp->t_gen++;
tp->t_pgrp = NULL;
tp->t_state = 0;
sess = tp->t_session;
tp->t_session = NULL;
while (tp->t_refcnt)
cv_wait(&ttyref_cv, &tty_lock);
mutex_spin_exit(&tty_lock);
if (sess != NULL) {
mutex_enter(&proc_lock);
/* Releases proc_lock. */
proc_sessrele(sess);
}
return (0);
}
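/*
 * FLUSHQ(q): discard every character currently stored in the clist q.
 */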
#define FLUSHQ(q) { \
if ((q)->c_cc) \
ndflush(q, (q)->c_cc); \
}
/*
* tty_acquire(tp), tty_release(tp)
*
* Acquire a reference to tp that prevents it from being closed
* until released. Caller must guarantee tp has not yet been
* closed, e.g. by obtaining tp from constty during a pserialize
* read section. Caller must not hold tty_lock.
*/
void
tty_acquire(struct tty *tp)
{
unsigned refcnt __diagused;
refcnt = atomic_inc_uint_nv(&tp->t_refcnt);
KASSERT(refcnt < UINT_MAX);
}
void
tty_release(struct tty *tp)
{
unsigned old, new;
KDASSERT(mutex_ownable(&tty_lock));
do {
old = atomic_load_relaxed(&tp->t_refcnt);
if (old == 1) {
mutex_spin_enter(&tty_lock);
if (atomic_dec_uint_nv(&tp->t_refcnt) == 0)
cv_broadcast(&ttyref_cv);
mutex_spin_exit(&tty_lock);
return;
}
KASSERT(old != 0);
new = old - 1;
} while (atomic_cas_uint(&tp->t_refcnt, old, new) != old);
}
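/*
 * Typical use of the reference counting above (a sketch only; the
 * pserialize/atomic helpers shown follow the pattern used by constty
 * readers elsewhere in the kernel):
 *
 *	s = pserialize_read_enter();
 *	tp = atomic_load_consume(&constty);
 *	if (tp != NULL)
 *		tty_acquire(tp);
 *	pserialize_read_exit(s);
 *	if (tp != NULL) {
 *		...use tp...
 *		tty_release(tp);
 *	}
 */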
/*
* This macro is used in canonical mode input processing, where a read
* request shall not return unless a 'line delimiter' ('\n') or 'break'
* (EOF, EOL, EOL2) character (or a signal) has been received. As EOL2
* is an extension to the POSIX.1 defined set of special characters,
* recognize it only if IEXTEN is set in the set of local flags.
*/
#define TTBREAKC(c, lflg) \
((c) == '\n' || (((c) == cc[VEOF] || (c) == cc[VEOL] || \
((c) == cc[VEOL2] && ISSET(lflg, IEXTEN))) && (c) != _POSIX_VDISABLE))
/*
* ttyinput() helper.
* Call with the tty lock held.
*/
/* XXX static */ int
ttyinput_wlock(int c, struct tty *tp)
{
int iflag, lflag, i, error;
u_char *cc;
KASSERT(mutex_owned(&tty_lock));
/*
* If input is pending take it first.
*/
lflag = tp->t_lflag;
if (ISSET(lflag, PENDIN))
ttypend(tp);
/*
* Gather stats.
*/
if (ISSET(lflag, ICANON)) {
++tk_cancc;
++tp->t_cancc;
} else {
++tk_rawcc;
++tp->t_rawcc;
}
++tk_nin;
cc = tp->t_cc;
/*
* Handle exceptional conditions (break, parity, framing).
*/
iflag = tp->t_iflag;
if ((error = (ISSET(c, TTY_ERRORMASK))) != 0) {
CLR(c, TTY_ERRORMASK);
if (ISSET(error, TTY_FE) && c == 0) { /* Break. */
if (ISSET(iflag, IGNBRK))
return (0);
else if (ISSET(iflag, BRKINT)) {
ttyflush(tp, FREAD | FWRITE);
ttysig(tp, TTYSIG_PG1, SIGINT);
return (0);
} else if (ISSET(iflag, PARMRK))
goto parmrk;
} else if ((ISSET(error, TTY_PE) && ISSET(iflag, INPCK)) ||
ISSET(error, TTY_FE)) {
if (ISSET(iflag, IGNPAR))
return (0);
else if (ISSET(iflag, PARMRK)) {
parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
(void)putc(0 | TTY_QUOTE, &tp->t_rawq);
(void)putc(c | TTY_QUOTE, &tp->t_rawq);
return (0);
} else
c = 0;
}
} else if (c == 0377 &&
ISSET(iflag, ISTRIP|IGNPAR|INPCK|PARMRK) == (INPCK|PARMRK)) {
/* "Escape" a valid character of '\377'. */
(void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
(void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
goto endcase;
}
/*
* In tandem mode, check high water mark.
*/
if (ISSET(iflag, IXOFF) || ISSET(tp->t_cflag, CHWFLOW))
ttyblock(tp);
if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP))
CLR(c, 0x80);
if (!ISSET(lflag, EXTPROC)) {
/*
* Check for literal nexting very first
*/
if (ISSET(tp->t_state, TS_LNCH)) {
SET(c, TTY_QUOTE);
CLR(tp->t_state, TS_LNCH);
}
/*
* Scan for special characters. This code
* is really just a big case statement with
* non-constant cases. The bottom of the
* case statement is labeled ``endcase'', so goto
* it after a case match, or similar.
*/
/*
* Control chars which aren't controlled
* by ICANON, ISIG, or IXON.
*/
if (ISSET(lflag, IEXTEN)) {
if (CCEQ(cc[VLNEXT], c)) {
if (ISSET(lflag, ECHO)) {
if (ISSET(lflag, ECHOE)) {
(void)ttyoutput('^', tp);
(void)ttyoutput('\b', tp);
} else
ttyecho(c, tp);
}
SET(tp->t_state, TS_LNCH);
goto endcase;
}
if (CCEQ(cc[VDISCARD], c)) {
if (ISSET(lflag, FLUSHO))
CLR(tp->t_lflag, FLUSHO);
else {
ttyflush(tp, FWRITE);
ttyecho(c, tp);
if (tp->t_rawq.c_cc + tp->t_canq.c_cc)
ttyretype(tp);
SET(tp->t_lflag, FLUSHO);
}
goto startoutput;
}
}
/*
* Signals.
*/
if (ISSET(lflag, ISIG)) {
if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) {
if (!ISSET(lflag, NOFLSH))
ttyflush(tp, FREAD | FWRITE);
ttyecho(c, tp);
ttysig(tp, TTYSIG_PG1, CCEQ(cc[VINTR], c) ?
SIGINT : SIGQUIT);
goto endcase;
}
if (CCEQ(cc[VSUSP], c)) {
if (!ISSET(lflag, NOFLSH))
ttyflush(tp, FREAD);
ttyecho(c, tp);
ttysig(tp, TTYSIG_PG1, SIGTSTP);
goto endcase;
}
}
/*
* Handle start/stop characters.
*/
if (ISSET(iflag, IXON)) {
if (CCEQ(cc[VSTOP], c)) {
if (!ISSET(tp->t_state, TS_TTSTOP)) {
SET(tp->t_state, TS_TTSTOP);
cdev_stop(tp, 0);
return (0);
}
if (!CCEQ(cc[VSTART], c))
return (0);
/*
* if VSTART == VSTOP then toggle
*/
goto endcase;
}
if (CCEQ(cc[VSTART], c))
goto restartoutput;
}
/*
* IGNCR, ICRNL, & INLCR
*/
if (c == '\r') {
if (ISSET(iflag, IGNCR))
goto endcase;
else if (ISSET(iflag, ICRNL))
c = '\n';
} else if (c == '\n' && ISSET(iflag, INLCR))
c = '\r';
}
if (!ISSET(lflag, EXTPROC) && ISSET(lflag, ICANON)) {
/*
* From here on down canonical mode character
* processing takes place.
*/
/*
* erase (^H / ^?)
*/
if (CCEQ(cc[VERASE], c)) {
if (tp->t_rawq.c_cc)
ttyrub(unputc(&tp->t_rawq), tp);
goto endcase;
}
/*
* kill (^U)
*/
if (CCEQ(cc[VKILL], c)) {
if (ISSET(lflag, ECHOKE) &&
tp->t_rawq.c_cc == tp->t_rocount &&
!ISSET(lflag, ECHOPRT))
while (tp->t_rawq.c_cc)
ttyrub(unputc(&tp->t_rawq), tp);
else {
ttyecho(c, tp);
if (ISSET(lflag, ECHOK) ||
ISSET(lflag, ECHOKE))
ttyecho('\n', tp);
FLUSHQ(&tp->t_rawq);
tp->t_rocount = 0;
}
CLR(tp->t_state, TS_LOCAL);
goto endcase;
}
/*
* Extensions to the POSIX.1 GTI set of functions.
*/
if (ISSET(lflag, IEXTEN)) {
/*
* word erase (^W)
*/
if (CCEQ(cc[VWERASE], c)) {
int alt = ISSET(lflag, ALTWERASE);
int ctype;
/*
* erase whitespace
*/
while ((c = unputc(&tp->t_rawq)) == ' ' ||
c == '\t')
ttyrub(c, tp);
if (c == -1)
goto endcase;
/*
* erase last char of word and remember the
* next chars type (for ALTWERASE)
*/
ttyrub(c, tp);
c = unputc(&tp->t_rawq);
if (c == -1)
goto endcase;
if (c == ' ' || c == '\t') {
(void)putc(c, &tp->t_rawq);
goto endcase;
}
ctype = ISALPHA(c);
/*
* erase rest of word
*/
do {
ttyrub(c, tp);
c = unputc(&tp->t_rawq);
if (c == -1)
goto endcase;
} while (c != ' ' && c != '\t' &&
(alt == 0 || ISALPHA(c) == ctype));
(void)putc(c, &tp->t_rawq);
goto endcase;
}
/*
* reprint line (^R)
*/
if (CCEQ(cc[VREPRINT], c)) {
ttyretype(tp);
goto endcase;
}
/*
* ^T - kernel info and generate SIGINFO
*/
if (CCEQ(cc[VSTATUS], c)) {
ttysig(tp, TTYSIG_PG1, SIGINFO);
goto endcase;
}
}
}
/*
* Check for input buffer overflow
*/
if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= TTYHOG) {
if (ISSET(iflag, IMAXBEL)) {
if (tp->t_outq.c_cc < tp->t_hiwat)
(void)ttyoutput(CTRL('g'), tp);
} else
ttyflush(tp, FREAD | FWRITE);
goto endcase;
}
/*
* Put data char in q for user and
* wakeup on seeing a line delimiter.
*/
if (putc(c, &tp->t_rawq) >= 0) {
if (!ISSET(lflag, ICANON)) {
ttwakeup(tp);
ttyecho(c, tp);
goto endcase;
}
if (TTBREAKC(c, lflag)) {
tp->t_rocount = 0;
catq(&tp->t_rawq, &tp->t_canq);
ttwakeup(tp);
} else if (tp->t_rocount++ == 0)
tp->t_rocol = tp->t_column;
if (ISSET(tp->t_state, TS_ERASE)) {
/*
* end of prterase \.../
*/
CLR(tp->t_state, TS_ERASE);
(void)ttyoutput('/', tp);
}
i = tp->t_column;
ttyecho(c, tp);
if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) {
/*
* Place the cursor over the '^' of the ^D.
*/
i = uimin(2, tp->t_column - i);
while (i > 0) {
(void)ttyoutput('\b', tp);
i--;
}
}
}
endcase:
/*
* IXANY means allow any character to restart output.
*/
if (ISSET(tp->t_state, TS_TTSTOP) &&
!ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP]) {
return (0);
}
restartoutput:
CLR(tp->t_lflag, FLUSHO);
CLR(tp->t_state, TS_TTSTOP);
startoutput:
return (ttstart(tp));
}
/*
* Process input of a single character received on a tty.
*
* XXX - this is a hack; all drivers must be changed to acquire the
* lock before calling linesw->l_rint()
*/
int
ttyinput(int c, struct tty *tp)
{
int error;
/*
* Unless the receiver is enabled, drop incoming data.
*/
if (!ISSET(tp->t_cflag, CREAD))
return (0);
mutex_spin_enter(&tty_lock);
error = ttyinput_wlock(c, tp);
mutex_spin_exit(&tty_lock);
return (error);
}
/*
* Output a single character on a tty, doing output processing
* as needed (expanding tabs, newline processing, etc.).
* Returns < 0 if it succeeds, otherwise returns the char to resend.
* Must be recursive.
*
* Call with tty lock held.
*/
int
ttyoutput(int c, struct tty *tp)
{
long oflag;
int col, notout;
KASSERT(mutex_owned(&tty_lock));
oflag = tp->t_oflag;
if (!ISSET(oflag, OPOST)) {
tk_nout++;
tp->t_outcc++;
if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq))
return (c);
return (-1);
}
/*
* Do tab expansion if OXTABS is set. Special case if we do external
* processing, we don't do the tab expansion because we'll probably
* get it wrong. If tab expansion needs to be done, let it happen
* externally.
*/
CLR(c, ~TTY_CHARMASK);
if (c == '\t' &&
ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) {
c = 8 - (tp->t_column & 7);
if (ISSET(tp->t_lflag, FLUSHO)) {
notout = 0;
} else {
notout = b_to_q(" ", c, &tp->t_outq);
c -= notout;
tk_nout += c;
tp->t_outcc += c;
}
tp->t_column += c;
return (notout ? '\t' : -1);
}
if (c == CEOT && ISSET(oflag, ONOEOT))
return (-1);
/*
* Newline translation: if ONLCR is set,
* translate newline into "\r\n".
*/
if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) {
tk_nout++;
tp->t_outcc++;
if (!ISSET(tp->t_lflag, FLUSHO) && putc('\r', &tp->t_outq))
return (c);
}
/* If OCRNL is set, translate "\r" into "\n". */
else if (c == '\r' && ISSET(tp->t_oflag, OCRNL))
c = '\n';
/* If ONOCR is set, don't transmit CRs when on column 0. */
else if (c == '\r' && ISSET(tp->t_oflag, ONOCR) && tp->t_column == 0)
return (-1);
tk_nout++;
tp->t_outcc++;
if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq))
return (c);
col = tp->t_column;
switch (CCLASS(c)) {
case BACKSPACE:
if (col > 0)
--col;
break;
case CONTROL:
break;
case NEWLINE:
if (ISSET(tp->t_oflag, ONLCR | ONLRET))
col = 0;
break;
case RETURN:
col = 0;
break;
case ORDINARY:
++col;
break;
case TAB:
col = (col + 8) & ~7;
break;
}
tp->t_column = col;
return (-1);
}
/*
* Ioctls for all tty devices. Called after line-discipline specific ioctl
* has been called to do discipline-specific functions and/or reject any
* of these ioctl commands.
*/
/* ARGSUSED */
int
ttioctl(struct tty *tp, u_long cmd, void *data, int flag, struct lwp *l)
{
struct proc *p;
struct linesw *lp;
int s, error;
struct pathbuf *pb;
struct nameidata nd;
char infobuf[200];
KASSERT(l != NULL);
p = l->l_proc;
/* If the ioctl involves modification, hang if in the background. */
switch (cmd) {
case TIOCFLUSH:
case TIOCDRAIN:
case TIOCSBRK:
case TIOCCBRK:
case TIOCSTART:
case TIOCSETA:
case TIOCSETD:
case TIOCSLINED:
case TIOCSETAF:
case TIOCSETAW:
#ifdef notdef
case TIOCSPGRP:
case FIOSETOWN:
#endif
case TIOCSTAT:
case TIOCSTI:
case TIOCSWINSZ:
case TIOCSQSIZE:
case TIOCLBIC:
case TIOCLBIS:
case TIOCLSET:
case TIOCSETC:
case OTIOCSETD:
case TIOCSETN:
case TIOCSETP:
case TIOCSLTC:
mutex_spin_enter(&tty_lock);
		while (isbackground(curproc, tp) &&
		    p->p_pgrp->pg_jobc && (p->p_lflag & PL_PPWAIT) == 0 &&
		    !sigismasked(l, SIGTTOU)) {
mutex_spin_exit(&tty_lock);
mutex_enter(&proc_lock);
pgsignal(p->p_pgrp, SIGTTOU, 1);
mutex_exit(&proc_lock);
mutex_spin_enter(&tty_lock);
			error = ttypause(tp, hz);
			if (error) {
				mutex_spin_exit(&tty_lock);
				return (error);
			}
}
mutex_spin_exit(&tty_lock);
break;
}
switch (cmd) { /* Process the ioctl. */
case FIOASYNC: /* set/clear async i/o */
mutex_spin_enter(&tty_lock);
if (*(int *)data)
SET(tp->t_state, TS_ASYNC);
else
CLR(tp->t_state, TS_ASYNC);
mutex_spin_exit(&tty_lock);
break;
case FIONBIO: /* set/clear non-blocking i/o */
break; /* XXX: delete. */
case FIONREAD: /* get # bytes to read */
mutex_spin_enter(&tty_lock);
*(int *)data = ttnread(tp);
mutex_spin_exit(&tty_lock);
break;
	case FIONWRITE: /* get # bytes written but not yet sent */
mutex_spin_enter(&tty_lock);
*(int *)data = tp->t_outq.c_cc;
mutex_spin_exit(&tty_lock);
break;
	case FIONSPACE: /* get # bytes of free space in the output queue */
mutex_spin_enter(&tty_lock);
*(int *)data = tp->t_outq.c_cn - tp->t_outq.c_cc;
mutex_spin_exit(&tty_lock);
break;
case TIOCEXCL: /* set exclusive use of tty */
mutex_spin_enter(&tty_lock);
SET(tp->t_state, TS_XCLUDE);
mutex_spin_exit(&tty_lock);
break;
case TIOCFLUSH: { /* flush buffers */
int flags = *(int *)data;
if (flags == 0)
flags = FREAD | FWRITE;
else
flags &= FREAD | FWRITE;
mutex_spin_enter(&tty_lock);
ttyflush(tp, flags);
mutex_spin_exit(&tty_lock);
break;
}
case TIOCCONS: { /* become virtual console */
struct tty *ctp;
mutex_enter(&constty_lock);
error = 0;
ctp = atomic_load_relaxed(&constty);
if (*(int *)data) {
if (ctp != NULL && ctp != tp &&
ISSET(ctp->t_state, TS_CARR_ON | TS_ISOPEN) ==
(TS_CARR_ON | TS_ISOPEN)) {
error = EBUSY;
goto unlock_constty;
}
pb = pathbuf_create("/dev/console");
if (pb == NULL) {
error = ENOMEM;
goto unlock_constty;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pb);
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
goto unlock_constty;
}
error = VOP_ACCESS(nd.ni_vp, VREAD, l->l_cred);
vput(nd.ni_vp);
pathbuf_destroy(pb);
if (error)
goto unlock_constty;
KASSERT(atomic_load_relaxed(&constty) == ctp ||
atomic_load_relaxed(&constty) == NULL);
atomic_store_release(&constty, tp);
} else if (tp == ctp) {
atomic_store_relaxed(&constty, NULL);
}
unlock_constty: mutex_exit(&constty_lock);
if (error)
return error;
break;
}
case TIOCDRAIN: /* wait till output drained */
if ((error = ttywait(tp)) != 0)
return (error);
break;
case TIOCGETA: { /* get termios struct */
struct termios *t = (struct termios *)data;
memcpy(t, &tp->t_termios, sizeof(struct termios));
break;
}
case TIOCGETD: /* get line discipline (old) */
*(int *)data = tp->t_linesw->l_no;
break;
case TIOCGLINED: /* get line discipline (new) */
(void)strncpy((char *)data, tp->t_linesw->l_name,
TTLINEDNAMELEN - 1);
break;
case TIOCGWINSZ: /* get window size */
*(struct winsize *)data = tp->t_winsize;
break;
case TIOCGQSIZE:
*(int *)data = tp->t_qsize;
break;
case FIOGETOWN:
mutex_enter(&proc_lock);
if (tp->t_session != NULL && !isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
*(int *)data = tp->t_pgrp ? -tp->t_pgrp->pg_id : 0;
mutex_exit(&proc_lock);
break;
case TIOCGPGRP: /* get pgrp of tty */
mutex_enter(&proc_lock);
if (!isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
*(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
mutex_exit(&proc_lock);
break;
case TIOCGSID: /* get sid of tty */
mutex_enter(&proc_lock);
if (!isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
*(int *)data = tp->t_session->s_sid;
mutex_exit(&proc_lock);
break;
#ifdef TIOCHPCL
case TIOCHPCL: /* hang up on last close */
mutex_spin_enter(&tty_lock);
SET(tp->t_cflag, HUPCL);
mutex_spin_exit(&tty_lock);
break;
#endif
case TIOCNXCL: /* reset exclusive use of tty */
mutex_spin_enter(&tty_lock);
CLR(tp->t_state, TS_XCLUDE);
mutex_spin_exit(&tty_lock);
break;
case TIOCOUTQ: /* output queue size */
*(int *)data = tp->t_outq.c_cc;
break;
case TIOCSETA: /* set termios struct */
case TIOCSETAW: /* drain output, set */
case TIOCSETAF: { /* drn out, fls in, set */
struct termios *t = (struct termios *)data;
		if (cmd == TIOCSETAW || cmd == TIOCSETAF) {
			if ((error = ttywait(tp)) != 0)
				return (error);
			if (cmd == TIOCSETAF) {
				mutex_spin_enter(&tty_lock);
				ttyflush(tp, FREAD);
				mutex_spin_exit(&tty_lock);
			}
		}
s = spltty();
/*
* XXXSMP - some drivers call back on us from t_param(), so
* don't take the tty spin lock here.
* require t_param() to unlock upon callback?
*/
/* wanted here: mutex_spin_enter(&tty_lock); */
if (!ISSET(t->c_cflag, CIGNORE)) {
/*
* Set device hardware.
*/
if (tp->t_param && (error = (*tp->t_param)(tp, t))) {
/* wanted here: mutex_spin_exit(&tty_lock); */
splx(s);
return (error);
} else {
tp->t_cflag = t->c_cflag;
tp->t_ispeed = t->c_ispeed;
tp->t_ospeed = t->c_ospeed;
				if (t->c_ospeed == 0)
					ttysig(tp, TTYSIG_LEADER, SIGHUP);
}
ttsetwater(tp);
}
/* delayed lock acquiring */
mutex_spin_enter(&tty_lock);
		if (cmd != TIOCSETAF) {
			if (ISSET(t->c_lflag, ICANON) !=
			    ISSET(tp->t_lflag, ICANON)) {
if (ISSET(t->c_lflag, ICANON)) {
SET(tp->t_lflag, PENDIN);
ttwakeup(tp);
} else {
struct clist tq;
catq(&tp->t_rawq, &tp->t_canq);
tq = tp->t_rawq;
tp->t_rawq = tp->t_canq;
tp->t_canq = tq;
CLR(tp->t_lflag, PENDIN);
}
}
}
tp->t_iflag = t->c_iflag;
tp->t_oflag = t->c_oflag;
/*
* Make the EXTPROC bit read only.
*/
if (ISSET(tp->t_lflag, EXTPROC))
SET(t->c_lflag, EXTPROC);
else
CLR(t->c_lflag, EXTPROC);
tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN);
memcpy(tp->t_cc, t->c_cc, sizeof(t->c_cc));
mutex_spin_exit(&tty_lock);
splx(s);
break;
}
case TIOCSETD: /* set line discipline (old) */
lp = ttyldisc_lookup_bynum(*(int *)data);
goto setldisc;
case TIOCSLINED: { /* set line discipline (new) */
char *name = (char *)data;
dev_t device;
/* Null terminate to prevent buffer overflow */
name[TTLINEDNAMELEN - 1] = '\0';
lp = ttyldisc_lookup(name);
setldisc:
if (lp == NULL)
return (ENXIO);
if (lp != tp->t_linesw) {
device = tp->t_dev;
s = spltty();
(*tp->t_linesw->l_close)(tp, flag);
error = (*lp->l_open)(device, tp);
if (error) {
(void)(*tp->t_linesw->l_open)(device, tp);
splx(s);
ttyldisc_release(lp);
return (error);
}
ttyldisc_release(tp->t_linesw);
tp->t_linesw = lp;
splx(s);
} else {
/* Drop extra reference. */
ttyldisc_release(lp);
}
break;
}
case TIOCSTART: /* start output, like ^Q */
mutex_spin_enter(&tty_lock);
if (ISSET(tp->t_state, TS_TTSTOP) ||
ISSET(tp->t_lflag, FLUSHO)) {
CLR(tp->t_lflag, FLUSHO);
CLR(tp->t_state, TS_TTSTOP);
ttstart(tp);
}
mutex_spin_exit(&tty_lock);
break;
case TIOCSTI: /* simulate terminal input */
if ((error = kauth_authorize_device_tty(l->l_cred,
KAUTH_DEVICE_TTY_STI, tp)) != 0) {
if (!ISSET(flag, FREAD))
return EPERM;
if (!isctty(p, tp))
return EACCES;
if (tp->t_session->s_leader->p_cred != p->p_cred)
return error;
}
(*tp->t_linesw->l_rint)(*(u_char *)data, tp);
break;
case TIOCSTOP: /* stop output, like ^S */
{
mutex_spin_enter(&tty_lock);
		if (!ISSET(tp->t_state, TS_TTSTOP)) {
			SET(tp->t_state, TS_TTSTOP);
cdev_stop(tp, 0);
}
mutex_spin_exit(&tty_lock);
break;
}
case TIOCSCTTY: /* become controlling tty */
mutex_enter(&proc_lock);
mutex_spin_enter(&tty_lock);
/* Session ctty vnode pointer set in vnode layer. */
if (!SESS_LEADER(p) ||
((p->p_session->s_ttyvp || tp->t_session) &&
(tp->t_session != p->p_session))) {
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
return (EPERM);
}
/*
* `p_session' acquires a reference.
* But note that if `t_session' is set at this point,
* it must equal `p_session', in which case the session
* already has the correct reference count.
*/
if (tp->t_session == NULL) {
proc_sesshold(p->p_session);
}
tp->t_session = p->p_session;
tp->t_pgrp = p->p_pgrp;
p->p_session->s_ttyp = tp;
p->p_lflag |= PL_CONTROLT;
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
break;
case FIOSETOWN: { /* set pgrp of tty */
pid_t pgid = *(pid_t *)data;
struct pgrp *pgrp;
mutex_enter(&proc_lock);
if (tp->t_session != NULL && !isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
if (pgid < 0) {
if (pgid == INT_MIN) {
mutex_exit(&proc_lock);
return (EINVAL);
}
pgrp = pgrp_find(-pgid);
			if (pgrp == NULL) {
				mutex_exit(&proc_lock);
return (EINVAL);
}
} else {
struct proc *p1;
p1 = proc_find(pgid);
if (!p1) {
mutex_exit(&proc_lock);
return (ESRCH);
}
pgrp = p1->p_pgrp;
}
if (pgrp->pg_session != p->p_session) {
mutex_exit(&proc_lock);
return (EPERM);
}
mutex_spin_enter(&tty_lock);
tp->t_pgrp = pgrp;
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
break;
}
case TIOCSPGRP: { /* set pgrp of tty */
struct pgrp *pgrp;
pid_t pgid = *(pid_t *)data;
if (pgid == NO_PGID)
return EINVAL;
mutex_enter(&proc_lock);
if (!isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
pgrp = pgrp_find(pgid);
if (pgrp == NULL || pgrp->pg_session != p->p_session) {
mutex_exit(&proc_lock);
return (EPERM);
}
mutex_spin_enter(&tty_lock);
tp->t_pgrp = pgrp;
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
break;
}
case TIOCSTAT: /* get load avg stats */
mutex_enter(&proc_lock);
ttygetinfo(tp, 0, infobuf, sizeof(infobuf));
mutex_exit(&proc_lock);
mutex_spin_enter(&tty_lock);
ttyputinfo(tp, infobuf);
mutex_spin_exit(&tty_lock);
break;
case TIOCSWINSZ: /* set window size */
mutex_spin_enter(&tty_lock);
if (memcmp((void *)&tp->t_winsize, data,
sizeof(struct winsize))) {
tp->t_winsize = *(struct winsize *)data;
ttysig(tp, TTYSIG_PG1, SIGWINCH);
}
mutex_spin_exit(&tty_lock);
break;
case TIOCSQSIZE:
if ((error = tty_get_qsize(&s, *(int *)data)) == 0 &&
s != tp->t_qsize)
error = tty_set_qsize(tp, s);
return error;
case TIOCSBRK:
case TIOCCBRK:
case TIOCSDTR:
case TIOCCDTR:
case TIOCSFLAGS:
case TIOCGFLAGS:
case TIOCMSET:
case TIOCMGET:
case TIOCMBIS:
case TIOCMBIC:
/* Handled by the driver layer */
return EPASSTHROUGH;
case TIOCEXT:
case TIOCPTSNAME:
case TIOCGRANTPT:
case TIOCPKT:
case TIOCUCNTL:
case TIOCREMOTE:
case TIOCSIG:
/* for ptys */
return EPASSTHROUGH;
default:
/* Pass through various console ioctls */
switch (IOCGROUP(cmd)) {
case 'c': /* syscons console */
case 'v': /* usl console, video - where one letter */
case 'K': /* usl console, keyboard - aint enough */
case 'V': /* pcvt compat */
case 'W': /* wscons console */
return EPASSTHROUGH;
default:
break;
}
/* We may have to load the compat_60 module for this. */
(void)module_autoload("compat_60", MODULE_CLASS_EXEC);
MODULE_HOOK_CALL(tty_ttioctl_60_hook,
(tp, cmd, data, flag, l), enosys(), error);
if (error != EPASSTHROUGH)
return error;
/* We may have to load the compat_43 module for this. */
(void)module_autoload("compat_43", MODULE_CLASS_EXEC);
MODULE_HOOK_CALL(tty_ttioctl_43_hook,
(tp, cmd, data, flag, l), enosys(), error);
return error;
}
return (0);
}
int
ttpoll(struct tty *tp, int events, struct lwp *l)
{
int revents;
revents = 0;
mutex_spin_enter(&tty_lock);
if (events & (POLLIN | POLLRDNORM))
if (ttnread(tp) > 0)
revents |= events & (POLLIN | POLLRDNORM);
if (events & (POLLOUT | POLLWRNORM))
if (tp->t_outq.c_cc <= tp->t_lowat)
revents |= events & (POLLOUT | POLLWRNORM);
if (events & POLLHUP)
if (!CONNECTED(tp))
revents |= POLLHUP;
if (revents == 0) {
if (events & (POLLIN | POLLHUP | POLLRDNORM))
selrecord(l, &tp->t_rsel);
if (events & (POLLOUT | POLLWRNORM))
selrecord(l, &tp->t_wsel);
}
mutex_spin_exit(&tty_lock);
return (revents);
}
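/*
 * kqueue(2) filters for ttys: EVFILT_READ fires when ttnread() reports
 * pending input, EVFILT_WRITE when the output queue has drained to the
 * low water mark and the line is still connected.
 */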
static void
filt_ttyrdetach(struct knote *kn)
{
struct tty *tp;
tp = kn->kn_hook;
mutex_spin_enter(&tty_lock);
selremove_knote(&tp->t_rsel, kn);
mutex_spin_exit(&tty_lock);
}
static int
filt_ttyread(struct knote *kn, long hint)
{
struct tty *tp;
int rv;
tp = kn->kn_hook;
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_enter(&tty_lock);
kn->kn_data = ttnread(tp);
rv = kn->kn_data > 0;
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_exit(&tty_lock);
return rv;
}
static void
filt_ttywdetach(struct knote *kn)
{
struct tty *tp;
tp = kn->kn_hook;
mutex_spin_enter(&tty_lock);
selremove_knote(&tp->t_wsel, kn);
mutex_spin_exit(&tty_lock);
}
static int
filt_ttywrite(struct knote *kn, long hint)
{
struct tty *tp;
int canwrite;
tp = kn->kn_hook;
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_enter(&tty_lock);
kn->kn_data = tp->t_outq.c_cn - tp->t_outq.c_cc;
canwrite = (tp->t_outq.c_cc <= tp->t_lowat) && CONNECTED(tp);
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_exit(&tty_lock);
return (canwrite);
}
static const struct filterops ttyread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_ttyrdetach,
.f_event = filt_ttyread,
};
static const struct filterops ttywrite_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_ttywdetach,
.f_event = filt_ttywrite,
};
int
ttykqfilter(dev_t dev, struct knote *kn)
{
struct tty *tp;
struct selinfo *sip;
if ((tp = cdev_tty(dev)) == NULL)
return (ENXIO);
switch (kn->kn_filter) {
case EVFILT_READ:
sip = &tp->t_rsel;
kn->kn_fop = &ttyread_filtops;
break;
case EVFILT_WRITE:
sip = &tp->t_wsel;
kn->kn_fop = &ttywrite_filtops;
break;
default:
return EINVAL;
}
kn->kn_hook = tp;
mutex_spin_enter(&tty_lock);
selrecord_knote(sip, kn);
mutex_spin_exit(&tty_lock);
return (0);
}
/*
* Find the number of chars ready to be read from this tty.
* Call with the tty lock held.
*/
static int
ttnread(struct tty *tp)
{
int nread;
	KASSERT(mutex_owned(&tty_lock));
	if (ISSET(tp->t_lflag, PENDIN))
		ttypend(tp);
nread = tp->t_canq.c_cc;
if (!ISSET(tp->t_lflag, ICANON)) {
nread += tp->t_rawq.c_cc;
if (nread < tp->t_cc[VMIN] && !tp->t_cc[VTIME])
nread = 0;
}
return (nread);
}
/*
* Wait for output to drain, or if this times out, flush it.
*/
static int
ttywait_timo(struct tty *tp, int timo)
{
int error;
error = 0;
mutex_spin_enter(&tty_lock);
	while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
	    CONNECTED(tp) && tp->t_oproc) {
(*tp->t_oproc)(tp);
error = ttysleep(tp, &tp->t_outcv, true, timo);
if (error == EWOULDBLOCK)
ttyflush(tp, FWRITE);
if (error)
break;
}
mutex_spin_exit(&tty_lock);
return (error);
}
/*
* Wait for output to drain.
*/
int
ttywait(struct tty *tp)
{
return ttywait_timo(tp, 0);
}
/*
* Wait for output to drain (bounded by a timeout), then flush the input queue.
*/
int
ttywflush(struct tty *tp)
{
int error;
error = ttywait_timo(tp, 5 * hz);
if (error == 0 || error == EWOULDBLOCK) {
mutex_spin_enter(&tty_lock);
ttyflush(tp, FREAD);
mutex_spin_exit(&tty_lock);
}
return (error);
}
/*
* Flush tty read and/or write queues, notifying anyone waiting.
* Call with the tty lock held.
*/
void
ttyflush(struct tty *tp, int rw)
{
	KASSERT(mutex_owned(&tty_lock));
	if (rw & FREAD) {
		FLUSHQ(&tp->t_canq);
		FLUSHQ(&tp->t_rawq);
tp->t_rocount = 0;
tp->t_rocol = 0;
CLR(tp->t_state, TS_LOCAL);
ttwakeup(tp);
}
if (rw & FWRITE) {
CLR(tp->t_state, TS_TTSTOP);
cdev_stop(tp, rw);
FLUSHQ(&tp->t_outq);
cv_broadcast(&tp->t_outcv);
selnotify(&tp->t_wsel, 0, NOTE_SUBMIT);
}
}
/*
* Copy in the default termios characters.
*/
void
ttychars(struct tty *tp)
{
memcpy(tp->t_cc, ttydefchars, sizeof(ttydefchars));
}
/*
* Send stop character on input overflow.
* Call with the tty lock held.
*/
static void
ttyblock(struct tty *tp)
{
int total;
KASSERT(mutex_owned(&tty_lock));
total = tp->t_rawq.c_cc + tp->t_canq.c_cc;
if (tp->t_rawq.c_cc > TTYHOG) {
ttyflush(tp, FREAD | FWRITE);
CLR(tp->t_state, TS_TBLOCK);
}
/*
* Block further input iff: current input > threshold
* AND input is available to user program.
*/
if (total >= TTYHOG / 2 &&
!ISSET(tp->t_state, TS_TBLOCK) &&
(!ISSET(tp->t_lflag, ICANON) || tp->t_canq.c_cc > 0)) {
if (ISSET(tp->t_iflag, IXOFF) &&
tp->t_cc[VSTOP] != _POSIX_VDISABLE &&
putc(tp->t_cc[VSTOP], &tp->t_outq) == 0) {
SET(tp->t_state, TS_TBLOCK);
ttstart(tp);
}
/* Try to block remote output via hardware flow control. */
if (ISSET(tp->t_cflag, CHWFLOW) && tp->t_hwiflow &&
(*tp->t_hwiflow)(tp, 1) != 0)
SET(tp->t_state, TS_TBLOCK);
}
}
/*
* Delayed line discipline output
*/
void
ttrstrt(void *tp_arg)
{
struct tty *tp;
#ifdef DIAGNOSTIC
if (tp_arg == NULL)
panic("ttrstrt");
#endif
tp = tp_arg;
mutex_spin_enter(&tty_lock);
CLR(tp->t_state, TS_TIMEOUT);
ttstart(tp); /* XXX - Shouldn't this be tp->l_start(tp)? */
mutex_spin_exit(&tty_lock);
}
/*
* start a line discipline
* Always call with tty lock held?
*/
int
ttstart(struct tty *tp)
{
	if (tp->t_oproc != NULL)	/* XXX: Kludge for pty. */
		(*tp->t_oproc)(tp);
return (0);
}
/*
* "close" a line discipline
*/
int
ttylclose(struct tty *tp, int flag)
{
if (flag & FNONBLOCK) {
mutex_spin_enter(&tty_lock);
ttyflush(tp, FREAD | FWRITE);
mutex_spin_exit(&tty_lock);
} else
ttywflush(tp);
return (0);
}
/*
* Handle modem control transition on a tty.
* Flag indicates new state of carrier.
* Returns 0 if the line should be turned off, otherwise 1.
*/
int
ttymodem(struct tty *tp, int flag)
{
mutex_spin_enter(&tty_lock);
if (flag == 0) {
if (ISSET(tp->t_state, TS_CARR_ON)) {
/*
* Lost carrier.
*/
CLR(tp->t_state, TS_CARR_ON);
			if (ISSET(tp->t_state, TS_ISOPEN) && !CONNECTED(tp)) {
				ttysig(tp, TTYSIG_LEADER, SIGHUP);
ttyflush(tp, FREAD | FWRITE);
mutex_spin_exit(&tty_lock);
return (0);
}
}
} else {
if (!ISSET(tp->t_state, TS_CARR_ON)) {
/*
* Carrier now on.
*/
SET(tp->t_state, TS_CARR_ON);
ttwakeup(tp);
}
}
mutex_spin_exit(&tty_lock);
return (1);
}
/*
* Default modem control routine (for other line disciplines).
* Return argument flag, to turn off device on carrier drop.
*/
int
nullmodem(struct tty *tp, int flag)
{
mutex_spin_enter(&tty_lock);
if (flag)
SET(tp->t_state, TS_CARR_ON);
else {
CLR(tp->t_state, TS_CARR_ON);
if (!CONNECTED(tp)) {
ttysig(tp, TTYSIG_LEADER, SIGHUP);
mutex_spin_exit(&tty_lock);
return (0);
}
}
mutex_spin_exit(&tty_lock);
return (1);
}
/*
* Reinput pending characters after state switch.
*/
void
ttypend(struct tty *tp)
{
struct clist tq;
int c;
KASSERT(mutex_owned(&tty_lock));
CLR(tp->t_lflag, PENDIN);
SET(tp->t_state, TS_TYPEN);
tq = tp->t_rawq;
tp->t_rawq.c_cc = 0;
tp->t_rawq.c_cf = tp->t_rawq.c_cl = 0;
while ((c = getc(&tq)) >= 0)
ttyinput_wlock(c, tp);
CLR(tp->t_state, TS_TYPEN);
}
/*
* Process a read call on a tty device.
*/
int
ttread(struct tty *tp, struct uio *uio, int flag)
{
struct clist *qp;
u_char *cc;
struct proc *p;
int c, first, error, has_stime, last_cc;
long lflag, slp;
struct timeval now, stime;
if (uio->uio_resid == 0)
return 0;
stime.tv_usec = 0; /* XXX gcc */
stime.tv_sec = 0; /* XXX gcc */
cc = tp->t_cc;
p = curproc;
error = 0;
has_stime = 0;
last_cc = 0;
slp = 0;
loop:
mutex_spin_enter(&tty_lock);
lflag = tp->t_lflag;
/*
* take pending input first
*/
if (ISSET(lflag, PENDIN))
ttypend(tp);
/*
* Hang process if it's in the background.
*/
if (isbackground(p, tp)) {
if (sigismasked(curlwp, SIGTTIN) ||
p->p_lflag & PL_PPWAIT || p->p_pgrp->pg_jobc == 0) {
mutex_spin_exit(&tty_lock);
return (EIO);
}
mutex_spin_exit(&tty_lock);
mutex_enter(&proc_lock);
pgsignal(p->p_pgrp, SIGTTIN, 1);
mutex_exit(&proc_lock);
mutex_spin_enter(&tty_lock);
error = ttypause(tp, hz);
mutex_spin_exit(&tty_lock);
if (error)
return (error);
goto loop;
}
if (!ISSET(lflag, ICANON)) {
int m = cc[VMIN];
long t = cc[VTIME];
qp = &tp->t_rawq;
/*
* Check each of the four combinations.
* (m > 0 && t == 0) is the normal read case.
* It should be fairly efficient, so we check that and its
* companion case (m == 0 && t == 0) first.
* For the other two cases, we compute the target sleep time
* into slp.
*/
if (t == 0) {
if (qp->c_cc < m)
goto sleep;
goto read;
}
t *= hz; /* time in deca-ticks */
/*
* Time difference in deca-ticks, split division to avoid numeric overflow.
* Ok for hz < ~200kHz
*/
#define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 10 * hz + \
((t1).tv_usec - (t2).tv_usec) / 100 * hz / 1000)
if (m > 0) {
if (qp->c_cc <= 0)
goto sleep;
if (qp->c_cc >= m)
goto read;
if (!has_stime) {
/* first character, start timer */
has_stime = 1;
getmicrotime(&stime);
slp = t;
} else if (qp->c_cc > last_cc) {
/* got a character, restart timer */
getmicrotime(&stime);
slp = t;
} else {
/* nothing, check expiration */
getmicrotime(&now);
slp = t - diff(now, stime);
}
} else { /* m == 0 */
if (qp->c_cc > 0)
goto read;
if (!has_stime) {
has_stime = 1;
getmicrotime(&stime);
slp = t;
} else {
getmicrotime(&now);
slp = t - diff(now, stime);
}
}
last_cc = qp->c_cc;
#undef diff
if (slp > 0) {
/*
* Convert deca-ticks back to ticks.
* Rounding down may make us wake up just short
* of the target, so we round up.
* Maybe we should do 'slp/10 + 1' because the
			 * first tick may be almost immediate.
* However it is more useful for a program that sets
* VTIME=10 to wakeup every second not every 1.01
* seconds (if hz=100).
*/
slp = (slp + 9)/ 10;
goto sleep;
}
} else if ((qp = &tp->t_canq)->c_cc <= 0) {
int carrier;
sleep:
/*
* If there is no input, sleep on rawq
* awaiting hardware receipt and notification.
* If we have data, we don't need to check for carrier.
*/
carrier = CONNECTED(tp);
if (!carrier && ISSET(tp->t_state, TS_ISOPEN)) {
mutex_spin_exit(&tty_lock);
return (0); /* EOF */
}
if (!has_stime || slp <= 0) {
if (flag & IO_NDELAY) {
mutex_spin_exit(&tty_lock);
return (EWOULDBLOCK);
}
}
error = ttysleep(tp, &tp->t_rawcv, true, slp);
mutex_spin_exit(&tty_lock);
/* VMIN == 0: any quantity read satisfies */
if (cc[VMIN] == 0 && error == EWOULDBLOCK)
return (0);
if (error && error != EWOULDBLOCK)
return (error);
goto loop;
}
read:
/*
* Input present, check for input mapping and processing.
*/
first = 1;
while ((c = getc(qp)) >= 0) {
/*
* delayed suspend (^Y)
*/
if (CCEQ(cc[VDSUSP], c) &&
ISSET(lflag, IEXTEN|ISIG) == (IEXTEN|ISIG)) {
ttysig(tp, TTYSIG_PG1, SIGTSTP);
if (first) {
error = ttypause(tp, hz);
if (error)
break;
mutex_spin_exit(&tty_lock);
goto loop;
}
break;
}
/*
* Interpret EOF only in canonical mode.
*/
if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON))
break;
/*
* Give user character.
*/
mutex_spin_exit(&tty_lock);
error = ureadc(c, uio);
mutex_spin_enter(&tty_lock);
if (error)
break;
if (uio->uio_resid == 0)
break;
/*
* In canonical mode check for a "break character"
* marking the end of a "line of input".
*/
if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag))
break;
first = 0;
}
/*
* Look to unblock output now that (presumably)
* the input queue has gone down.
*/
if (ISSET(tp->t_state, TS_TBLOCK) && tp->t_rawq.c_cc < TTYHOG / 5) {
if (ISSET(tp->t_iflag, IXOFF) &&
cc[VSTART] != _POSIX_VDISABLE &&
putc(cc[VSTART], &tp->t_outq) == 0) {
CLR(tp->t_state, TS_TBLOCK);
ttstart(tp);
}
/* Try to unblock remote output via hardware flow control. */
if (ISSET(tp->t_cflag, CHWFLOW) && tp->t_hwiflow &&
(*tp->t_hwiflow)(tp, 0) != 0)
CLR(tp->t_state, TS_TBLOCK);
}
mutex_spin_exit(&tty_lock);
return (error);
}
/*
* Check the output queue on tp for space for a kernel message (from uprintf
* or tprintf). Allow some space over the normal hiwater mark so we don't
* lose messages due to normal flow control, but don't let the tty run amok.
* Sleeps here are not interruptible, but we return prematurely if new signals
* arrive.
* Call with tty lock held.
*/
static int
ttycheckoutq_wlock(struct tty *tp)
{
int hiwat;
KASSERT(mutex_owned(&tty_lock));
hiwat = tp->t_hiwat;
if (tp->t_outq.c_cc > hiwat + 200)
if (tp->t_outq.c_cc > hiwat) {
ttstart(tp);
return (0);
}
return (1);
}
int
ttycheckoutq(struct tty *tp)
{
int r;
mutex_spin_enter(&tty_lock);
r = ttycheckoutq_wlock(tp);
mutex_spin_exit(&tty_lock);
return (r);
}
/*
* Process a write call on a tty device.
*/
int
ttwrite(struct tty *tp, struct uio *uio, int flag)
{
u_char *cp;
struct proc *p;
int cc, cc0, ce, i, hiwat, error;
u_char obuf[OBUFSIZ];
cp = NULL;
hiwat = tp->t_hiwat;
error = 0;
cc0 = cc = 0;
loop:
mutex_spin_enter(&tty_lock);
if (!CONNECTED(tp)) {
if (ISSET(tp->t_state, TS_ISOPEN)) {
mutex_spin_exit(&tty_lock);
return (EIO);
} else if (flag & IO_NDELAY) {
mutex_spin_exit(&tty_lock);
error = EWOULDBLOCK;
goto out;
} else {
/* Sleep awaiting carrier. */
error = ttysleep(tp, &tp->t_rawcv, true, 0);
mutex_spin_exit(&tty_lock);
if (error)
goto out;
goto loop;
}
}
/*
* Hang the process if it's in the background.
*/
p = curproc;
if (isbackground(p, tp) &&
ISSET(tp->t_lflag, TOSTOP) && (p->p_lflag & PL_PPWAIT) == 0 &&
!sigismasked(curlwp, SIGTTOU)) {
if (p->p_pgrp->pg_jobc == 0) {
error = EIO;
mutex_spin_exit(&tty_lock);
goto out;
}
mutex_spin_exit(&tty_lock);
mutex_enter(&proc_lock);
pgsignal(p->p_pgrp, SIGTTOU, 1);
mutex_exit(&proc_lock);
mutex_spin_enter(&tty_lock);
error = ttypause(tp, hz);
mutex_spin_exit(&tty_lock);
if (error)
goto out;
goto loop;
}
mutex_spin_exit(&tty_lock);
/*
* Process the user's data in at most OBUFSIZ chunks. Perform any
* output translation. Keep track of high water mark, sleep on
* overflow awaiting device aid in acquiring new space.
*/
while (uio->uio_resid > 0 || cc > 0) {
if (ISSET(tp->t_lflag, FLUSHO)) {
uio->uio_resid = 0;
return (0);
}
if (tp->t_outq.c_cc > hiwat)
goto ovhiwat;
/*
* Grab a hunk of data from the user, unless we have some
* leftover from last time.
*/
if (cc == 0) {
uioskip(cc0, uio);
cc0 = cc = uimin(uio->uio_resid, OBUFSIZ);
cp = obuf;
error = uiopeek(cp, cc, uio);
if (error) {
cc = 0;
goto out;
}
}
/*
* If nothing fancy need be done, grab those characters we
* can handle without any of ttyoutput's processing and
* just transfer them to the output q. For those chars
* which require special processing (as indicated by the
* bits in char_type), call ttyoutput. After processing
* a hunk of data, look for FLUSHO so ^O's will take effect
* immediately.
*/
mutex_spin_enter(&tty_lock);
while (cc > 0) {
if (!ISSET(tp->t_oflag, OPOST))
ce = cc;
else {
ce = cc - scanc((u_int)cc, cp, char_type,
CCLASSMASK);
/*
* If ce is zero, then we're processing
* a special character through ttyoutput.
*/
if (ce == 0) {
tp->t_rocount = 0;
if (ttyoutput(*cp, tp) >= 0) {
/* out of space */
mutex_spin_exit(&tty_lock);
goto overfull;
}
cp++;
cc--;
if (ISSET(tp->t_lflag, FLUSHO) ||
tp->t_outq.c_cc > hiwat) {
mutex_spin_exit(&tty_lock);
goto ovhiwat;
}
continue;
}
}
/*
* A bunch of normal characters have been found.
* Transfer them en masse to the output queue and
* continue processing at the top of the loop.
* If there are any further characters in this
* <= OBUFSIZ chunk, the first should be a character
* requiring special handling by ttyoutput.
*/
tp->t_rocount = 0;
i = b_to_q(cp, ce, &tp->t_outq);
ce -= i;
tp->t_column += ce;
cp += ce, cc -= ce, tk_nout += ce;
tp->t_outcc += ce;
if (i > 0) {
/* out of space */
mutex_spin_exit(&tty_lock);
goto overfull;
}
if (ISSET(tp->t_lflag, FLUSHO) ||
tp->t_outq.c_cc > hiwat)
break;
}
ttstart(tp);
mutex_spin_exit(&tty_lock);
}
out:
KASSERTMSG(error || cc == 0, "error=%d cc=%d", error, cc);
KASSERTMSG(cc0 >= cc, "cc0=%d cc=%d", cc0, cc);
uioskip(cc0 - cc, uio);
return (error);
overfull:
/*
* Since we are using ring buffers, if we can't insert any more into
* the output queue, we can assume the ring is full and that someone
* forgot to set the high water mark correctly. We set it and then
* proceed as normal.
*/
hiwat = tp->t_outq.c_cc - 1;
ovhiwat:
mutex_spin_enter(&tty_lock);
ttstart(tp);
/*
* This can only occur if FLUSHO is set in t_lflag,
* or if ttstart/oproc is synchronous (or very fast).
*/
if (tp->t_outq.c_cc <= hiwat) {
mutex_spin_exit(&tty_lock);
goto loop;
}
if (flag & IO_NDELAY) {
mutex_spin_exit(&tty_lock);
error = EWOULDBLOCK;
goto out;
}
error = ttysleep(tp, &tp->t_outcv, true, 0);
mutex_spin_exit(&tty_lock);
if (error)
goto out;
goto loop;
}
/*
* Try to pull more output from the producer. Return non-zero if
* there is output ready to be sent.
*/
bool
ttypull(struct tty *tp)
{
/* XXXSMP not yet KASSERT(mutex_owned(&tty_lock)); */
if (tp->t_outq.c_cc <= tp->t_lowat) {
cv_broadcast(&tp->t_outcv);
selnotify(&tp->t_wsel, 0, NOTE_SUBMIT);
}
return tp->t_outq.c_cc != 0;
}
/*
* Rubout one character from the rawq of tp
* as cleanly as possible.
* Called with tty lock held.
*/
void
ttyrub(int c, struct tty *tp)
{
u_char *cp;
int savecol, tabc;
KASSERT(mutex_owned(&tty_lock));
if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC))
return;
CLR(tp->t_lflag, FLUSHO);
if (ISSET(tp->t_lflag, ECHOE)) {
if (tp->t_rocount == 0) {
/*
* Screwed by ttwrite; retype
*/
ttyretype(tp);
return;
}
if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE))
ttyrubo(tp, 2);
else {
CLR(c, ~TTY_CHARMASK);
switch (CCLASS(c)) {
case ORDINARY:
ttyrubo(tp, 1);
break;
case BACKSPACE:
case CONTROL:
case NEWLINE:
case RETURN:
case VTAB:
if (ISSET(tp->t_lflag, ECHOCTL))
ttyrubo(tp, 2);
break;
case TAB:
if (tp->t_rocount < tp->t_rawq.c_cc) {
ttyretype(tp);
return;
}
savecol = tp->t_column;
SET(tp->t_state, TS_CNTTB);
SET(tp->t_lflag, FLUSHO);
tp->t_column = tp->t_rocol;
for (cp = firstc(&tp->t_rawq, &tabc); cp;
cp = nextc(&tp->t_rawq, cp, &tabc))
ttyecho(tabc, tp);
CLR(tp->t_lflag, FLUSHO);
CLR(tp->t_state, TS_CNTTB);
/* savecol will now be length of the tab. */
savecol -= tp->t_column;
tp->t_column += savecol;
if (savecol > 8)
savecol = 8; /* overflow screw */
while (--savecol >= 0)
(void)ttyoutput('\b', tp);
break;
default: /* XXX */
(void)printf("ttyrub: would panic c = %d, "
"val = %d\n", c, CCLASS(c));
}
}
} else if (ISSET(tp->t_lflag, ECHOPRT)) {
if (!ISSET(tp->t_state, TS_ERASE)) {
SET(tp->t_state, TS_ERASE);
(void)ttyoutput('\\', tp);
}
ttyecho(c, tp);
} else
ttyecho(tp->t_cc[VERASE], tp);
--tp->t_rocount;
}
/*
* Back over cnt characters, erasing them.
* Called with tty lock held.
*/
static void
ttyrubo(struct tty *tp, int cnt)
{
KASSERT(mutex_owned(&tty_lock));
while (cnt-- > 0) {
(void)ttyoutput('\b', tp);
(void)ttyoutput(' ', tp);
(void)ttyoutput('\b', tp);
}
}
/*
* ttyretype --
* Reprint the rawq line. Note, it is assumed that c_cc has already
* been checked.
*
* Called with tty lock held.
*/
void
ttyretype(struct tty *tp)
{
u_char *cp;
int c;
KASSERT(mutex_owned(&tty_lock));
/* Echo the reprint character. */
if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE)
ttyecho(tp->t_cc[VREPRINT], tp);
(void)ttyoutput('\n', tp);
for (cp = firstc(&tp->t_canq, &c); cp; cp = nextc(&tp->t_canq, cp, &c))
ttyecho(c, tp);
for (cp = firstc(&tp->t_rawq, &c); cp; cp = nextc(&tp->t_rawq, cp, &c))
ttyecho(c, tp);
CLR(tp->t_state, TS_ERASE);
tp->t_rocount = tp->t_rawq.c_cc;
tp->t_rocol = 0;
}
/*
* Echo a typed character to the terminal.
* Called with tty lock held.
*/
static void
ttyecho(int c, struct tty *tp)
{
KASSERT(mutex_owned(&tty_lock));
if (!ISSET(tp->t_state, TS_CNTTB))
CLR(tp->t_lflag, FLUSHO);
if ((!ISSET(tp->t_lflag, ECHO) &&
(!ISSET(tp->t_lflag, ECHONL) || c != '\n')) ||
ISSET(tp->t_lflag, EXTPROC))
return;
if (((ISSET(tp->t_lflag, ECHOCTL) &&
(ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n')) ||
ISSET(c, TTY_CHARMASK) == 0177)) {
(void)ttyoutput('^', tp);
CLR(c, ~TTY_CHARMASK);
if (c == 0177)
c = '?';
else
c += 'A' - 1;
}
(void)ttyoutput(c, tp);
}
/*
* Wake up any readers on a tty.
* Called with tty lock held.
*/
void
ttwakeup(struct tty *tp)
{
	KASSERT(mutex_owned(&tty_lock));
selnotify(&tp->t_rsel, 0, NOTE_SUBMIT);
	if (ISSET(tp->t_state, TS_ASYNC))
		ttysig(tp, TTYSIG_PG2, SIGIO);
cv_broadcast(&tp->t_rawcv);
}
/*
* Look up a code for a specified speed in a conversion table;
* used by drivers to map software speed values to hardware parameters.
*/
int
ttspeedtab(int speed, const struct speedtab *table)
{
for (; table->sp_speed != -1; table++)
if (table->sp_speed == speed)
return (table->sp_code);
return (-1);
}
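/*
 * Example (hypothetical driver table; real drivers define their own
 * hardware-specific codes and terminate the table with sp_speed == -1):
 *
 *	static const struct speedtab foo_speedtab[] = {
 *		{    300, FOO_B300    },
 *		{   9600, FOO_B9600   },
 *		{ 115200, FOO_B115200 },
 *		{     -1, -1          },
 *	};
 *
 *	code = ttspeedtab(tp->t_ospeed, foo_speedtab);
 *	if (code < 0)
 *		return (EINVAL);	- speed not supported
 */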
/*
* Set tty hi and low water marks.
*
* Try to arrange the dynamics so there's about one second
* from hi to low water.
*/
void
ttsetwater(struct tty *tp)
{
int cps, x;
/* XXX not yet KASSERT(mutex_owned(&tty_lock)); */
#define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? l : (x))
cps = tp->t_ospeed / 10;
tp->t_lowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT);
x += cps;
x = CLAMP(x, TTMAXHIWAT, TTMINHIWAT);
tp->t_hiwat = roundup(x, TTROUND);
#undef CLAMP
}
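/*
 * For example, at 9600 baud cps is 960, so the low water mark is
 * roughly cps/2 (subject to the TTMINLOWAT/TTMAXLOWAT clamps) and the
 * high water mark roughly low + cps, clamped and rounded up to TTROUND:
 * about one second's worth of output between the two marks.
 */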
/*
* Prepare report on state of foreground process group.
* Call with &proc_lock held.
*/
void
ttygetinfo(struct tty *tp, int fromsig, char *buf, size_t bufsz)
{
struct lwp *l;
struct proc *p, *pick = NULL;
struct timeval utime, stime;
int tmp;
fixpt_t pctcpu = 0;
const char *msg = NULL;
char lmsg[100];
long rss;
bool again = false;
KASSERT(mutex_owned(&proc_lock));
*buf = '\0';
retry:
if (tp->t_session == NULL)
msg = "not a controlling terminal\n";
else if (tp->t_pgrp == NULL)
msg = "no foreground process group\n";
else if ((p = LIST_FIRST(&tp->t_pgrp->pg_members)) == NULL)
msg = "empty foreground process group\n";
else {
/* Pick interesting process. */
for (; p != NULL; p = LIST_NEXT(p, p_pglist)) {
struct proc *oldpick;
if (pick == NULL) {
pick = p;
continue;
}
if (pick->p_lock < p->p_lock) {
mutex_enter(pick->p_lock);
mutex_enter(p->p_lock);
} else if (pick->p_lock > p->p_lock) {
mutex_enter(p->p_lock);
mutex_enter(pick->p_lock);
} else
mutex_enter(p->p_lock);
oldpick = pick;
if (proc_compare_wrapper(pick, p))
pick = p;
mutex_exit(p->p_lock);
if (p->p_lock != oldpick->p_lock)
mutex_exit(oldpick->p_lock);
}
if (pick != NULL) {
mutex_enter(pick->p_lock);
if (P_ZOMBIE(pick)) {
mutex_exit(pick->p_lock);
pick = NULL;
if (!again) {
again = true;
goto retry;
}
msg = "found only zombie processes\n";
}
if (pick && fromsig &&
(SIGACTION_PS(pick->p_sigacts, SIGINFO).sa_flags &
SA_NOKERNINFO)) {
mutex_exit(pick->p_lock);
return;
}
}
}
/* Print load average. */
tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT;
snprintf(lmsg, sizeof(lmsg), "load: %d.%02d ", tmp / 100, tmp % 100);
strlcat(buf, lmsg, bufsz);
if (pick == NULL) {
strlcat(buf, msg, bufsz);
return;
}
snprintf(lmsg, sizeof(lmsg), " cmd: %s %d [", pick->p_comm,
pick->p_pid);
strlcat(buf, lmsg, bufsz);
KASSERT(mutex_owned(pick->p_lock));
LIST_FOREACH(l, &pick->p_lwps, l_sibling) {
const char *lp;
lwp_lock(l);
#ifdef LWP_PC
#define FMT_RUN "%#"PRIxVADDR
#define VAL_RUNNING (vaddr_t)LWP_PC(l)
#define VAL_RUNNABLE (vaddr_t)LWP_PC(l)
#else
#define FMT_RUN "%s"
#define VAL_RUNNING "running"
#define VAL_RUNNABLE "runnable"
#endif
switch (l->l_stat) {
case LSONPROC:
snprintf(lmsg, sizeof(lmsg), FMT_RUN"/%d", VAL_RUNNING,
cpu_index(l->l_cpu));
lp = lmsg;
break;
case LSRUN:
snprintf(lmsg, sizeof(lmsg), FMT_RUN, VAL_RUNNABLE);
lp = lmsg;
break;
default:
lp = l->l_wchan ? l->l_wmesg : "iowait";
break;
}
strlcat(buf, lp, bufsz);
strlcat(buf, LIST_NEXT(l, l_sibling) != NULL ? " " : "] ",
bufsz);
pctcpu += l->l_pctcpu;
lwp_unlock(l);
}
pctcpu += pick->p_pctcpu;
calcru(pick, &utime, &stime, NULL, NULL);
mutex_exit(pick->p_lock);
/* Round up and print user+system time, %CPU and RSS. */
utime.tv_usec += 5000;
if (utime.tv_usec >= 1000000) {
utime.tv_sec += 1;
utime.tv_usec -= 1000000;
}
stime.tv_usec += 5000;
if (stime.tv_usec >= 1000000) {
stime.tv_sec += 1;
stime.tv_usec -= 1000000;
}
#define pgtok(a) (((u_long) ((a) * PAGE_SIZE) / 1024))
tmp = (pctcpu * 10000 + FSCALE / 2) >> FSHIFT;
if (pick->p_stat == SIDL || P_ZOMBIE(pick))
rss = 0;
else
rss = pgtok(vm_resident_count(pick->p_vmspace));
snprintf(lmsg, sizeof(lmsg), "%ld.%02ldu %ld.%02lds %d%% %ldk",
(long)utime.tv_sec, (long)utime.tv_usec / 10000,
(long)stime.tv_sec, (long)stime.tv_usec / 10000,
tmp / 100, rss);
strlcat(buf, lmsg, bufsz);
}
/*
* Print report on state of foreground process group.
* Call with tty_lock held.
*/
void
ttyputinfo(struct tty *tp, char *buf)
{
KASSERT(mutex_owned(&tty_lock));
if (ttycheckoutq_wlock(tp) == 0)
return;
ttyprintf_nolock(tp, "%s\n", buf);
tp->t_rocount = 0; /* so pending input will be retyped if BS */
}
/*
* Returns 1 if p2 has a better chance of being the active foreground process
* in a terminal instead of p1.
*/
static int
proc_compare_wrapper(struct proc *p1, struct proc *p2)
{
lwp_t *l1, *l2;
KASSERT(mutex_owned(p1->p_lock));
KASSERT(mutex_owned(p2->p_lock));
l1 = LIST_FIRST(&p1->p_lwps);
l2 = LIST_FIRST(&p2->p_lwps);
return proc_compare(p1, l1, p2, l2);
}
/*
* Output char to tty; console putchar style.
* Can be called with the tty lock held through the kprintf() machinery.
*/
int
tputchar(int c, int flags, struct tty *tp)
{
int r = 0;
if ((flags & NOLOCK) == 0)
mutex_spin_enter(&tty_lock);
if (!CONNECTED(tp)) {
r = -1;
goto out;
}
if (c == '\n')
(void)ttyoutput('\r', tp);
(void)ttyoutput(c, tp);
ttstart(tp);
out:
if ((flags & NOLOCK) == 0)
mutex_spin_exit(&tty_lock);
return (r);
}
/*
* Sleep on chan, returning ERESTART if tty changed while we napped and
* returning any errors (e.g. EINTR/EWOULDBLOCK) reported by
* cv_timedwait(_sig).
* If the tty is revoked, restarting a pending call will redo validation done
* at the start of the call.
*
* Must be called with the tty lock held.
*/
int
ttysleep(struct tty *tp, kcondvar_t *cv, bool catch_p, int timo)
{
int error;
short gen;
KASSERT(mutex_owned(&tty_lock));
gen = tp->t_gen;
if (ISSET(tp->t_state, TS_CANCEL))
error = ERESTART;
else if (cv == NULL)
error = kpause("ttypause", catch_p, timo, &tty_lock);
else if (catch_p)
error = cv_timedwait_sig(cv, &tty_lock, timo);
else
error = cv_timedwait(cv, &tty_lock, timo);
if (error != 0)
return (error);
return (tp->t_gen == gen ? 0 : ERESTART);
}
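/*
 * Pause on the tty for up to timo ticks; a plain timeout is not
 * treated as an error.
 */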
int
ttypause(struct tty *tp, int timo)
{
int error;
error = ttysleep(tp, NULL, true, timo);
if (error == EWOULDBLOCK)
error = 0;
return error;
}
/*
* Attach a tty to the tty list.
*
* This should be called ONLY once per real tty (including pty's).
* E.g., on the sparc, the keyboard and mouse have struct tty's that are
* distinctly NOT usable as tty's, and thus should not be attached to
* the ttylist. This is why this call is not done from tty_alloc().
*
* Device drivers should attach tty's at a similar time that they are
* allocated, or, for the case of statically allocated struct tty's
* either in the attach or (first) open routine.
*/
void
tty_attach(struct tty *tp)
{
mutex_spin_enter(&tty_lock);
TAILQ_INSERT_TAIL(&ttylist, tp, tty_link);
++tty_count;
mutex_spin_exit(&tty_lock);
}
/*
* Remove a tty from the tty list.
*/
void
tty_detach(struct tty *tp)
{
mutex_spin_enter(&tty_lock);
--tty_count;
#ifdef DIAGNOSTIC
if (tty_count < 0)
panic("tty_detach: tty_count < 0");
#endif
TAILQ_REMOVE(&ttylist, tp, tty_link);
mutex_spin_exit(&tty_lock);
}
/*
* Allocate a tty structure and its associated buffers.
*/
struct tty *
tty_alloc(void)
{
struct tty *tp;
int i;
tp = kmem_zalloc(sizeof(*tp), KM_SLEEP);
callout_init(&tp->t_rstrt_ch, 0);
callout_setfunc(&tp->t_rstrt_ch, ttrstrt, tp);
tp->t_qsize = tty_qsize;
clalloc(&tp->t_rawq, tp->t_qsize, 1);
cv_init(&tp->t_rawcv, "ttyraw");
cv_init(&tp->t_rawcvf, "ttyrawf");
clalloc(&tp->t_canq, tp->t_qsize, 1);
cv_init(&tp->t_cancv, "ttycan");
cv_init(&tp->t_cancvf, "ttycanf");
/* output queue doesn't need quoting */
clalloc(&tp->t_outq, tp->t_qsize, 0);
cv_init(&tp->t_outcv, "ttyout");
cv_init(&tp->t_outcvf, "ttyoutf");
/* Set default line discipline. */
tp->t_linesw = ttyldisc_default();
tp->t_dev = NODEV;
selinit(&tp->t_rsel);
selinit(&tp->t_wsel);
for (i = 0; i < TTYSIG_COUNT; i++) {
sigemptyset(&tp->t_sigs[i]);
}
return tp;
}
/*
* Free a tty structure and its buffers.
*
* Be sure to call tty_detach() for any tty that has been
* tty_attach()ed.
*/
void
tty_free(struct tty *tp)
{
int i;
mutex_enter(&proc_lock);
mutex_enter(&tty_lock);
for (i = 0; i < TTYSIG_COUNT; i++)
sigemptyset(&tp->t_sigs[i]);
if (tp->t_sigcount != 0)
TAILQ_REMOVE(&tty_sigqueue, tp, t_sigqueue);
mutex_exit(&tty_lock);
mutex_exit(&proc_lock);
callout_halt(&tp->t_rstrt_ch, NULL);
callout_destroy(&tp->t_rstrt_ch);
ttyldisc_release(tp->t_linesw);
clfree(&tp->t_rawq);
clfree(&tp->t_canq);
clfree(&tp->t_outq);
cv_destroy(&tp->t_rawcv);
cv_destroy(&tp->t_rawcvf);
cv_destroy(&tp->t_cancv);
cv_destroy(&tp->t_cancvf);
cv_destroy(&tp->t_outcv);
cv_destroy(&tp->t_outcvf);
seldestroy(&tp->t_rsel);
seldestroy(&tp->t_wsel);
kmem_free(tp, sizeof(*tp));
}
/*
* tty_unit: map dev_t to tty unit number, as with TTUNIT
*
* => defined as function for use with struct cdevsw::d_devtounit
* => not for drivers with different unit numbering, e.g. TTUNIT(d) >> 4
*/
int
tty_unit(dev_t dev)
{
return TTUNIT(dev);
}
/*
* ttyprintf_nolock: send a message to a specific tty, without locking.
*
* => should be used only by tty driver or anything that knows the
* underlying tty will not be revoked(2)'d away. [otherwise,
* use tprintf]
*/
static void
ttyprintf_nolock(struct tty *tp, const char *fmt, ...)
{
va_list ap;
/* No mutex needed; going to process TTY. */
va_start(ap, fmt);
kprintf(fmt, TOTTY|NOLOCK, tp, NULL, ap);
va_end(ap);
}
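/*
 * kauth(9) device-scope listener: allow KAUTH_DEVICE_TTY_OPEN on a tty
 * that is not yet open, or is open but not exclusively; otherwise defer
 * the decision to the security model.
 */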
static int
tty_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct tty *tty;
int result;
result = KAUTH_RESULT_DEFER;
if (action != KAUTH_DEVICE_TTY_OPEN)
return result;
tty = arg0;
/* If it's not opened, we allow. */
if ((tty->t_state & TS_ISOPEN) == 0)
result = KAUTH_RESULT_ALLOW;
else {
/*
* If it's opened, we can only allow if it's not exclusively
* opened; otherwise, that's a privileged operation and we
* let the secmodel handle it.
*/
if ((tty->t_state & TS_XCLUDE) == 0)
result = KAUTH_RESULT_ALLOW;
}
return result;
}
/*
* Initialize the tty subsystem.
*/
void
tty_init(void)
{
mutex_init(&tty_lock, MUTEX_DEFAULT, IPL_VM);
mutex_init(&constty_lock, MUTEX_DEFAULT, IPL_NONE);
constty_psz = pserialize_create();
cv_init(&ttyref_cv, "ttyref");
tty_sigsih = softint_establish(SOFTINT_CLOCK, ttysigintr, NULL);
KASSERT(tty_sigsih != NULL);
tty_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
tty_listener_cb, NULL);
sysctl_kern_tty_setup();
}
/*
* Send a signal from a tty to its process group or session leader.
* Handoff to the target is deferred to a soft interrupt.
*/
void
ttysig(struct tty *tp, enum ttysigtype st, int sig)
{
sigset_t *sp;
/* XXXSMP not yet KASSERT(mutex_owned(&tty_lock)); */
sp = &tp->t_sigs[st];
if (sigismember(sp, sig))
return;
sigaddset(sp, sig);
	if (tp->t_sigcount++ == 0)
		TAILQ_INSERT_TAIL(&tty_sigqueue, tp, t_sigqueue);
softint_schedule(tty_sigsih);
}
/*
* Deliver deferred signals from ttys. Note that the process groups
* and sessions associated with the ttys may have changed from when
* the signal was originally sent, but in practice it should not matter.
* For signals produced as a result of a syscall, the soft interrupt
* will fire before the syscall returns to the user.
*/
static void
ttysigintr(void *cookie)
{
struct tty *tp;
enum ttysigtype st;
struct pgrp *pgrp;
struct session *sess;
int sig, lflag;
char infobuf[200];
mutex_enter(&proc_lock);
mutex_spin_enter(&tty_lock);
while ((tp = TAILQ_FIRST(&tty_sigqueue)) != NULL) {
KASSERT(tp->t_sigcount > 0);
for (st = TTYSIG_PG1; st < TTYSIG_COUNT; st++) {
if ((sig = firstsig(&tp->t_sigs[st])) != 0)
break;
}
KASSERT(st < TTYSIG_COUNT);
sigdelset(&tp->t_sigs[st], sig);
if (--tp->t_sigcount == 0)
TAILQ_REMOVE(&tty_sigqueue, tp, t_sigqueue);
pgrp = tp->t_pgrp;
sess = tp->t_session;
lflag = tp->t_lflag;
if (sig == SIGINFO) {
if (ISSET(tp->t_state, TS_SIGINFO)) {
/* Via ioctl: ignore tty option. */
tp->t_state &= ~TS_SIGINFO;
lflag |= ISIG;
}
if (!ISSET(lflag, NOKERNINFO)) {
mutex_spin_exit(&tty_lock);
ttygetinfo(tp, 1, infobuf, sizeof(infobuf));
mutex_spin_enter(&tty_lock);
ttyputinfo(tp, infobuf);
}
if (!ISSET(lflag, ISIG))
continue;
}
mutex_spin_exit(&tty_lock);
KASSERT(sig != 0);
switch (st) {
case TTYSIG_PG1:
if (pgrp != NULL)
pgsignal(pgrp, sig, 1);
break;
case TTYSIG_PG2:
if (pgrp != NULL)
pgsignal(pgrp, sig, sess != NULL);
break;
case TTYSIG_LEADER:
if (sess != NULL && sess->s_leader != NULL)
psignal(sess->s_leader, sig);
break;
default:
/* NOTREACHED */
break;
}
mutex_spin_enter(&tty_lock);
}
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
}
unsigned char
tty_getctrlchar(struct tty *tp, unsigned which)
{
KASSERT(which < NCCS);
return tp->t_cc[which];
}
void
tty_setctrlchar(struct tty *tp, unsigned which, unsigned char val)
{
KASSERT(which < NCCS);
tp->t_cc[which] = val;
}
int
tty_try_xonxoff(struct tty *tp, unsigned char c)
{
const struct cdevsw *cdev;
if (tp->t_iflag & IXON) {
if (c == tp->t_cc[VSTOP] && tp->t_cc[VSTOP] != _POSIX_VDISABLE) {
if ((tp->t_state & TS_TTSTOP) == 0) {
tp->t_state |= TS_TTSTOP;
cdev = cdevsw_lookup(tp->t_dev);
if (cdev != NULL)
(*cdev->d_stop)(tp, 0);
}
return 0;
}
if (c == tp->t_cc[VSTART] && tp->t_cc[VSTART] != _POSIX_VDISABLE) {
tp->t_state &= ~TS_TTSTOP;
if (tp->t_oproc != NULL) {
mutex_spin_enter(&tty_lock); /* XXX */
(*tp->t_oproc)(tp);
mutex_spin_exit(&tty_lock); /* XXX */
}
return 0;
}
}
return EAGAIN;
}
/* $NetBSD: cprng_fast.c,v 1.19 2023/08/05 11:39:18 riastradh Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.19 2023/08/05 11:39:18 riastradh Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/bitops.h>
#include <sys/cprng.h>
#include <sys/cpu.h>
#include <sys/entropy.h>
#include <sys/evcnt.h>
#include <sys/kmem.h>
#include <sys/percpu.h>
#include <sys/pserialize.h>
#include <crypto/chacha/chacha.h>
#define CPRNG_FAST_SEED_BYTES CHACHA_STREAM_KEYBYTES
struct cprng_fast {
/* 128-bit vector unit generates 256 bytes at once */
uint8_t buf[256];
uint8_t key[CPRNG_FAST_SEED_BYTES];
uint8_t nonce[CHACHA_STREAM_NONCEBYTES];
unsigned i;
struct evcnt *reseed_evcnt;
unsigned epoch;
};
static void cprng_fast_init_cpu(void *, void *, struct cpu_info *);
static void cprng_fast_reseed(struct cprng_fast **, unsigned);
static void cprng_fast_seed(struct cprng_fast *, const void *);
static void cprng_fast_buf(struct cprng_fast *, void *, unsigned);
static void cprng_fast_buf_short(void *, size_t);
static void cprng_fast_buf_long(void *, size_t);
static percpu_t *cprng_fast_percpu __read_mostly;
void
cprng_fast_init(void)
{
cprng_fast_percpu = percpu_create(sizeof(struct cprng_fast),
cprng_fast_init_cpu, NULL, NULL);
}
static void
cprng_fast_init_cpu(void *p, void *arg __unused, struct cpu_info *ci)
{
struct cprng_fast *const cprng = p;
cprng->epoch = 0;
cprng->reseed_evcnt = kmem_alloc(sizeof(*cprng->reseed_evcnt),
KM_SLEEP);
evcnt_attach_dynamic(cprng->reseed_evcnt, EVCNT_TYPE_MISC, NULL,
ci->ci_cpuname, "cprng_fast reseed");
}
static int
cprng_fast_get(struct cprng_fast **cprngp)
{
struct cprng_fast *cprng;
unsigned epoch;
int s;
KASSERT(!cpu_intr_p());
KASSERT(pserialize_not_in_read_section());
*cprngp = cprng = percpu_getref(cprng_fast_percpu);
s = splsoftserial();
epoch = entropy_epoch();
if (__predict_false(cprng->epoch != epoch)) {
splx(s);
cprng_fast_reseed(cprngp, epoch);
s = splsoftserial();
}
return s;
}
static void
cprng_fast_put(struct cprng_fast *cprng, int s)
{
KASSERT((cprng == percpu_getref(cprng_fast_percpu)) &&
(percpu_putref(cprng_fast_percpu), true));
splx(s);
percpu_putref(cprng_fast_percpu);
}
static void
cprng_fast_reseed(struct cprng_fast **cprngp, unsigned epoch)
{
struct cprng_fast *cprng;
uint8_t seed[CPRNG_FAST_SEED_BYTES];
int s;
/*
* Drop the percpu(9) reference to extract a fresh seed from
* the entropy pool. cprng_strong may sleep on an adaptive
* lock, which invalidates our percpu(9) reference.
*
* This may race with reseeding in another thread, which is no
* big deal -- worst case, we rewind the entropy epoch here and
* cause the next caller to reseed again, and in the end we
* just reseed a couple more times than necessary.
*/
percpu_putref(cprng_fast_percpu);
cprng_strong(kern_cprng, seed, sizeof(seed), 0);
*cprngp = cprng = percpu_getref(cprng_fast_percpu);
s = splsoftserial();
cprng_fast_seed(cprng, seed);
cprng->epoch = epoch;
cprng->reseed_evcnt->ev_count++;
splx(s);
explicit_memset(seed, 0, sizeof(seed));
}
/* CPRNG algorithm */
static void
cprng_fast_seed(struct cprng_fast *cprng, const void *seed)
{
(void)memset(cprng->buf, 0, sizeof cprng->buf);
(void)memcpy(cprng->key, seed, sizeof cprng->key);
(void)memset(cprng->nonce, 0, sizeof cprng->nonce);
cprng->i = sizeof cprng->buf;
}
static void
cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned len)
{
uint8_t *p = buf;
unsigned n = len, n0;
KASSERT(cprng->i <= sizeof(cprng->buf));
KASSERT(len <= sizeof(cprng->buf));
n0 = MIN(n, sizeof(cprng->buf) - cprng->i);
memcpy(p, &cprng->buf[cprng->i], n0);
if ((n -= n0) == 0) {
cprng->i += n0;
KASSERT(cprng->i <= sizeof(cprng->buf));
return;
}
p += n0;
le64enc(cprng->nonce, 1 + le64dec(cprng->nonce));
chacha_stream(cprng->buf, sizeof(cprng->buf), 0, cprng->nonce,
cprng->key, 8);
memcpy(p, cprng->buf, n);
cprng->i = n;
}
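/*
* Worked example (illustrative): with cprng->i == 200, a 100-byte
* request copies the last 56 bytes from buf[200..255], increments the
* nonce, regenerates the whole 256-byte block with one chacha_stream()
* call, copies the first 44 bytes of the fresh block, and leaves
* cprng->i == 44.
*/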
/* Public API */
static void
cprng_fast_buf_short(void *buf, size_t len)
{
struct cprng_fast *cprng;
int s;
KASSERT(len <= sizeof(cprng->buf));
s = cprng_fast_get(&cprng);
cprng_fast_buf(cprng, buf, len);
cprng_fast_put(cprng, s);
}
static void
cprng_fast_buf_long(void *buf, size_t len)
{
uint8_t seed[CHACHA_STREAM_KEYBYTES];
uint8_t nonce[CHACHA_STREAM_NONCEBYTES] = {0};
CTASSERT(sizeof(seed) <= sizeof(((struct cprng_fast *)0)->buf));
#if SIZE_MAX >= 0x3fffffffff
/* >=256 GB is not reasonable */
KASSERT(len <= 0x3fffffffff);
#endif
cprng_fast_buf_short(seed, sizeof seed);
chacha_stream(buf, len, 0, nonce, seed, 8);
(void)explicit_memset(seed, 0, sizeof seed);
}
uint32_t
cprng_fast32(void)
{
uint32_t v;
cprng_fast_buf_short(&v, sizeof v);
return v;
}
uint64_t
cprng_fast64(void)
{
uint64_t v;
cprng_fast_buf_short(&v, sizeof v);
return v;
}
size_t
cprng_fast(void *buf, size_t len)
{
/*
* We don't want to hog the CPU, so we use the short version,
* to generate output without preemption, only if we can do it
* with at most one ChaCha call.
*/
if (len <= sizeof(((struct cprng_fast *)0)->buf))
cprng_fast_buf_short(buf, len);
else
cprng_fast_buf_long(buf, len);
return len; /* hysterical raisins */
}
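/*
* Example usage (illustrative sketch, not part of this file): callers
* needing pseudorandom words or buffers simply do
*
*     uint32_t r = cprng_fast32();
*     uint8_t tmp[64];
*     (void)cprng_fast(tmp, sizeof(tmp));
*
* where "r" and "tmp" are hypothetical locals; the return value of
* cprng_fast() is always the requested length.
*/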
/* $NetBSD: tty_subr.c,v 1.43 2019/12/27 09:41:51 msaitoh Exp $ */
/*
* Copyright (c) 1993, 1994 Theo de Raadt
* All rights reserved.
*
* Per Lindqvist <pgd@compuram.bbt.se> supplied an almost fully working
* set of true clist functions that this is very loosely based on.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tty_subr.c,v 1.43 2019/12/27 09:41:51 msaitoh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/kmem.h>
/*
* At compile time, choose:
* There are two ways the TTY_QUOTE bit can be stored. If QBITS is
* defined we allocate an array of bits -- 1/8th as much memory but
* setbit(), clrbit(), and isset() take more CPU. If QBITS is
* undefined, we just use an array of bytes.
*
* If TTY_QUOTE functionality isn't required by a line discipline,
* it can free c_cq and set it to NULL. This speeds things up,
* and also does not use any extra memory. This is useful for (say)
* a SLIP line discipline that wants a 32K ring buffer for data
* but doesn't need quoting.
*/
#define QBITS
#ifdef QBITS
#define QMEM(n) ((((n)-1)/NBBY)+1)
#else
#define QMEM(n) (n)
#endif
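/*
* Worked example (illustrative): with QBITS defined and NBBY == 8,
* QMEM(1024) is ((1024 - 1) / 8) + 1 == 128, so a 1024-byte clist
* needs only 128 bytes of quote storage; with QBITS undefined it
* would need a full 1024 bytes, one per character.
*/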
#ifdef QBITS
static void clrbits(u_char *, unsigned int, unsigned int);
#endif
/*
* Initialize a particular clist. Ok, they are really ring buffers,
* of the specified length, with/without quoting support.
*/
int
clalloc(struct clist *clp, int size, int quot)
{
clp->c_cs = kmem_zalloc(size, KM_SLEEP);
if (quot)
clp->c_cq = kmem_zalloc(QMEM(size), KM_SLEEP);
else
clp->c_cq = NULL;
clp->c_cf = clp->c_cl = NULL;
clp->c_ce = clp->c_cs + size;
clp->c_cn = size;
clp->c_cc = 0;
return (0);
}
void
clfree(struct clist *clp)
{
if (clp->c_cs)
kmem_free(clp->c_cs, clp->c_cn);
if (clp->c_cq)
kmem_free(clp->c_cq, QMEM(clp->c_cn));
clp->c_cs = clp->c_cq = NULL;
}
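/*
* Example usage (illustrative sketch, not part of this file): a line
* discipline with a hypothetical 1024-byte queue "q" might do
*
*     struct clist q;
*     int c;
*
*     clalloc(&q, 1024, 1);                  with quoting support
*     (void)putc('x' | TTY_QUOTE, &q);       store a quoted character
*     c = getc(&q);                          c == ('x' | TTY_QUOTE)
*     clfree(&q);
*/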
/*
* Get a character from a clist.
*/
int
getc(struct clist *clp)
{
int c = -1;
int s;
s = spltty();
if (clp->c_cc == 0)
goto out;
c = *clp->c_cf & 0xff;
if (clp->c_cq) {
#ifdef QBITS
if (isset(clp->c_cq, clp->c_cf - clp->c_cs) )
c |= TTY_QUOTE;
#else
if (*(clp->c_cf - clp->c_cs + clp->c_cq))
c |= TTY_QUOTE;
#endif
}
*clp->c_cf = 0; /* wipe out to avoid information disclosure */
if (++clp->c_cf == clp->c_ce)
clp->c_cf = clp->c_cs;
if (--clp->c_cc == 0)
clp->c_cf = clp->c_cl = (u_char *)0;
out:
splx(s);
return c;
}
/*
* Copy clist to buffer.
* Return number of bytes moved.
*/
int
q_to_b(struct clist *clp, u_char *cp, int count)
{
int cc;
u_char *p = cp;
int s;
s = spltty();
/* optimize this while loop */
while (count > 0 && clp->c_cc > 0) {
cc = clp->c_cl - clp->c_cf;
if (clp->c_cf >= clp->c_cl)
cc = clp->c_ce - clp->c_cf;
if (cc > count)
cc = count;
memcpy(p, clp->c_cf, cc);
count -= cc;
p += cc;
clp->c_cc -= cc;
clp->c_cf += cc;
if (clp->c_cf == clp->c_ce)
clp->c_cf = clp->c_cs;
}
if (clp->c_cc == 0)
clp->c_cf = clp->c_cl = (u_char *)0;
splx(s);
return p - cp;
}
/*
* Return count of contiguous characters in clist.
* Stop counting if flag&character is non-null.
*/
int
ndqb(struct clist *clp, int flag)
{
int count = 0;
int i;
int cc;
int s;
s = spltty();
if ((cc = clp->c_cc) == 0)
goto out;
if (flag == 0) {
count = clp->c_cl - clp->c_cf;
if (count <= 0)
count = clp->c_ce - clp->c_cf;
goto out;
}
i = clp->c_cf - clp->c_cs;
if (flag & TTY_QUOTE) {
while (cc-- > 0 && !(clp->c_cs[i++] & (flag & ~TTY_QUOTE) ||
isset(clp->c_cq, i))) {
count++;
if (i == clp->c_cn)
break;
}
} else {
while (cc-- > 0 && !(clp->c_cs[i++] & flag)) {
count++;
if (i == clp->c_cn)
break;
}
}
out:
splx(s);
return count;
}
/*
* Flush count bytes from clist.
*/
void
ndflush(struct clist *clp, int count)
{
int cc;
int s;
s = spltty();
if (count == clp->c_cc) {
clp->c_cc = 0;
clp->c_cf = clp->c_cl = (u_char *)0;
goto out;
}
/* optimize this while loop */
while (count > 0 && clp->c_cc > 0) {
cc = clp->c_cl - clp->c_cf;
if (clp->c_cf >= clp->c_cl)
cc = clp->c_ce - clp->c_cf;
if (cc > count)
cc = count;
count -= cc;
clp->c_cc -= cc;
clp->c_cf += cc;
if (clp->c_cf == clp->c_ce)
clp->c_cf = clp->c_cs;
}
if (clp->c_cc == 0)
clp->c_cf = clp->c_cl = (u_char *)0;
out:
splx(s);
}
/*
* Put a character into the output queue.
*/
int
putc(int c, struct clist *clp)
{
int i;
int s;
s = spltty();
if (clp->c_cc == clp->c_cn)
goto out;
if (clp->c_cc == 0) {
if (!clp->c_cs) {
#if defined(DIAGNOSTIC) || 1
printf("putc: required clalloc\n");
#endif
if (clalloc(clp, clp->c_cn, 1)) {
out:
splx(s);
return -1;
}
}
clp->c_cf = clp->c_cl = clp->c_cs;
}
*clp->c_cl = c & 0xff;
i = clp->c_cl - clp->c_cs;
if (clp->c_cq) {
#ifdef QBITS
if (c & TTY_QUOTE)
setbit(clp->c_cq, i);
else
clrbit(clp->c_cq, i);
#else
clp->c_cq[i] = (c & TTY_QUOTE) ? 1 : 0;
#endif
}
clp->c_cc++;
clp->c_cl++;
if (clp->c_cl == clp->c_ce)
clp->c_cl = clp->c_cs;
splx(s);
return 0;
}
#ifdef QBITS
/*
* optimized version of
*
* for (i = 0; i < len; i++)
* clrbit(cp, off + i);
*/
static void
clrbits(u_char *cp, unsigned int off, unsigned int len)
{
unsigned int sbi, ebi;
u_char *scp, *ecp;
unsigned int end;
unsigned char mask;
scp = cp + off / NBBY;
sbi = off % NBBY;
end = off + len + NBBY - 1;
ecp = cp + end / NBBY - 1;
ebi = end % NBBY + 1;
if (scp >= ecp) {
mask = ((1 << len) - 1) << sbi;
*scp &= ~mask;
} else {
mask = (1 << sbi) - 1;
*scp++ &= mask;
mask = (1 << ebi) - 1;
*ecp &= ~mask;
while (scp < ecp)
*scp++ = 0x00;
}
}
#endif
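/*
* Worked example (illustrative): clrbits(cp, 5, 10) clears bits 5..14.
* With NBBY == 8, scp == cp, sbi == 5, end == 22, ecp == cp + 1 and
* ebi == 7: the first byte is ANDed with (1 << 5) - 1 == 0x1f, which
* keeps bits 0..4 and clears bits 5..7; the last byte is ANDed with
* ~((1 << 7) - 1) == ~0x7f, clearing bits 8..14 of the array; no whole
* bytes lie in between, so the clearing loop does not run.
*/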
/*
* Copy buffer to clist.
* Return number of bytes not transferred.
*/
int
b_to_q(const u_char *cp, int count, struct clist *clp)
{
int cc;
const u_char *p = cp;
int s;
if (count <= 0)
return 0;
s = spltty();
if (clp->c_cc == clp->c_cn)
goto out;
if (clp->c_cc == 0) {
if (!clp->c_cs) {
#if defined(DIAGNOSTIC) || 1
printf("b_to_q: required clalloc\n");
#endif
if (clalloc(clp, clp->c_cn, 1))
goto out;
}
clp->c_cf = clp->c_cl = clp->c_cs;
}
/* optimize this while loop */
while (count > 0 && clp->c_cc < clp->c_cn) {
cc = clp->c_ce - clp->c_cl;
if (clp->c_cf > clp->c_cl)
cc = clp->c_cf - clp->c_cl;
if (cc > count)
cc = count;
memcpy(clp->c_cl, p, cc);
if (clp->c_cq) {
#ifdef QBITS
clrbits(clp->c_cq, clp->c_cl - clp->c_cs, cc);
#else
memset(clp->c_cl - clp->c_cs + clp->c_cq, 0, cc);
#endif
}
p += cc;
count -= cc;
clp->c_cc += cc;
clp->c_cl += cc;
if (clp->c_cl == clp->c_ce)
clp->c_cl = clp->c_cs;
}
out:
splx(s);
return count;
}
static int tty_global_cc;
/*
* Given a non-NULL pointer into the clist return the pointer
* to the next character in the list or return NULL if no more chars.
*
* Callers must not allow getc's to happen between the firstc() and
* subsequent nextc() calls, or the pointer becomes invalid. Note that
* interrupts are NOT masked.
*/
u_char *
nextc(struct clist *clp, u_char *cp, int *c)
{
if (clp->c_cf == cp) {
/*
* First time initialization.
*/
tty_global_cc = clp->c_cc;
}
if (tty_global_cc == 0 || cp == NULL)
return NULL;
if (--tty_global_cc == 0)
return NULL;
if (++cp == clp->c_ce)
cp = clp->c_cs;
*c = *cp & 0xff;
if (clp->c_cq) {
#ifdef QBITS
if (isset(clp->c_cq, cp - clp->c_cs))
*c |= TTY_QUOTE;
#else
if (*(cp - clp->c_cs + clp->c_cq))
*c |= TTY_QUOTE;
#endif
}
return cp;
}
/*
* Return a pointer to the first character in the clist, or NULL if the
* clist is empty.
*
* Callers must not allow getc's to happen between the firstc() and
* subsequent nextc() calls, or the pointer becomes invalid. Note that
* interrupts are NOT masked.
*
* *c is set to the character at the returned pointer.
*/
u_char *
firstc(struct clist *clp, int *c)
{
u_char *cp;
tty_global_cc = clp->c_cc;
if (tty_global_cc == 0)
return NULL;
cp = clp->c_cf;
*c = *cp & 0xff;
if (clp->c_cq) {
#ifdef QBITS
if (isset(clp->c_cq, cp - clp->c_cs))
*c |= TTY_QUOTE;
#else
if (*(cp - clp->c_cs + clp->c_cq))
*c |= TTY_QUOTE;
#endif
}
return clp->c_cf;
}
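/*
* Example traversal (illustrative sketch, not part of this file):
* walk a clist "q" without consuming it, assuming no getc() calls
* intervene:
*
*     u_char *cp;
*     int c;
*
*     for (cp = firstc(&q, &c); cp != NULL; cp = nextc(&q, cp, &c)) {
*             examine c here, including its TTY_QUOTE bit
*     }
*/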
/*
* Remove the last character in the clist and return it.
*/
int
unputc(struct clist *clp)
{
unsigned int c = -1;
int s;
s = spltty();
if (clp->c_cc == 0)
goto out;
if (clp->c_cl == clp->c_cs)
clp->c_cl = clp->c_ce - 1;
else
--clp->c_cl;
clp->c_cc--;
c = *clp->c_cl & 0xff;
if (clp->c_cq) {
#ifdef QBITS
if (isset(clp->c_cq, clp->c_cl - clp->c_cs))
c |= TTY_QUOTE;
#else
if (*(clp->c_cl - clp->c_cs + clp->c_cq))
c |= TTY_QUOTE;
#endif
}
if (clp->c_cc == 0)
clp->c_cf = clp->c_cl = (u_char *)0;
out:
splx(s);
return c;
}
/*
* Put the chars in the from queue on the end of the to queue.
*/
void
catq(struct clist *from, struct clist *to)
{
int c;
while ((c = getc(from)) != -1)
putc(c, to);
}
/* $NetBSD: in_pcb.c,v 1.202 2022/11/04 09:05:41 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Copyright (c) 1998, 2011 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Coyote Point Systems, Inc.
* This code is derived from software contributed to The NetBSD Foundation
* by Public Access Networks Corporation ("Panix"). It was developed under
* contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in_pcb.c,v 1.202 2022/11/04 09:05:41 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/once.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/uidinfo.h>
#include <sys/domain.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/portalgo.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <netinet/tcp_vtw.h>
struct in_addr zeroin_addr;
#define INPCBHASH_PORT(table, lport) \
&(table)->inpt_porthashtbl[ntohs(lport) & (table)->inpt_porthash]
#define INPCBHASH_BIND(table, laddr, lport) \
&(table)->inpt_bindhashtbl[ \
((ntohl((laddr).s_addr) + ntohs(lport))) & (table)->inpt_bindhash]
#define INPCBHASH_CONNECT(table, faddr, fport, laddr, lport) \
&(table)->inpt_connecthashtbl[ \
((ntohl((faddr).s_addr) + ntohs(fport)) + \
(ntohl((laddr).s_addr) + ntohs(lport))) & (table)->inpt_connecthash]
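/*
* Worked example (illustrative): for a socket bound to 192.0.2.1 port
* 80, INPCBHASH_BIND adds the host-order address 0xc0000201 to the
* host-order port 80 and masks the sum with inpt_bindhash, which
* hashinit(9) sets to the (power-of-two) table size minus one.
* INPCBHASH_CONNECT additionally folds in the foreign address and
* port, so established connections spread over inpt_connecthashtbl
* independently of their local binding.
*/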
int anonportmin = IPPORT_ANONMIN;
int anonportmax = IPPORT_ANONMAX;
int lowportmin = IPPORT_RESERVEDMIN;
int lowportmax = IPPORT_RESERVEDMAX;
static pool_cache_t in4pcb_pool_cache;
#ifdef INET6
static pool_cache_t in6pcb_pool_cache;
#endif
static int
inpcb_poolinit(void)
{
in4pcb_pool_cache = pool_cache_init(sizeof(struct in4pcb), coherency_unit,
0, 0, "in4pcbpl", NULL, IPL_NET, NULL, NULL, NULL);
#ifdef INET6
in6pcb_pool_cache = pool_cache_init(sizeof(struct in6pcb), coherency_unit,
0, 0, "in6pcbpl", NULL, IPL_NET, NULL, NULL, NULL);
#endif
return 0;
}
void
inpcb_init(struct inpcbtable *table, int bindhashsize, int connecthashsize)
{
static ONCE_DECL(control);
TAILQ_INIT(&table->inpt_queue);
table->inpt_porthashtbl = hashinit(bindhashsize, HASH_LIST, true,
&table->inpt_porthash);
table->inpt_bindhashtbl = hashinit(bindhashsize, HASH_LIST, true,
&table->inpt_bindhash);
table->inpt_connecthashtbl = hashinit(connecthashsize, HASH_LIST, true,
&table->inpt_connecthash);
table->inpt_lastlow = IPPORT_RESERVEDMAX;
table->inpt_lastport = (in_port_t)anonportmax;
RUN_ONCE(&control, inpcb_poolinit);
}
/*
* inpcb_create: construct a new PCB and associate it with a given socket.
* Sets the PCB state to INP_ATTACHED and makes PCB globally visible.
*/
int
inpcb_create(struct socket *so, void *v)
{
struct inpcbtable *table = v;
struct inpcb *inp;
int s;
#ifdef INET6
KASSERT(soaf(so) == AF_INET || soaf(so) == AF_INET6);
if (soaf(so) == AF_INET)
inp = pool_cache_get(in4pcb_pool_cache, PR_NOWAIT);
else
inp = pool_cache_get(in6pcb_pool_cache, PR_NOWAIT);
#else
KASSERT(soaf(so) == AF_INET);
inp = pool_cache_get(in4pcb_pool_cache, PR_NOWAIT);
#endif
if (inp == NULL)
return ENOBUFS;
if (soaf(so) == AF_INET)
memset(inp, 0, sizeof(struct in4pcb));
#ifdef INET6
else
memset(inp, 0, sizeof(struct in6pcb));
#endif
inp->inp_af = soaf(so);
inp->inp_table = table;
inp->inp_socket = so;
inp->inp_portalgo = PORTALGO_DEFAULT;
inp->inp_bindportonsend = false;
if (inp->inp_af == AF_INET) {
in4p_errormtu(inp) = -1;
in4p_prefsrcip(inp).s_addr = INADDR_ANY;
}
#ifdef INET6
else {
in6p_hops6(inp) = -1; /* use kernel default */
if (ip6_v6only)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
}
#endif
#if defined(IPSEC)
if (ipsec_enabled) {
int error = ipsec_init_pcbpolicy(so, &inp->inp_sp);
if (error != 0) {
#ifdef INET6
if (inp->inp_af == AF_INET)
pool_cache_put(in4pcb_pool_cache, inp);
else
pool_cache_put(in6pcb_pool_cache, inp);
#else
KASSERT(inp->inp_af == AF_INET);
pool_cache_put(in4pcb_pool_cache, inp);
#endif
return error;
}
inp->inp_sp->sp_inp = inp;
}
#endif
so->so_pcb = inp;
s = splsoftnet();
TAILQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue);
LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), inp,
inp_lhash);
inpcb_set_state(inp, INP_ATTACHED);
splx(s);
return 0;
}
static int
inpcb_set_port(struct sockaddr_in *sin, struct inpcb *inp, kauth_cred_t cred)
{
struct inpcbtable *table = inp->inp_table;
struct socket *so = inp->inp_socket;
in_port_t *lastport;
in_port_t lport = 0;
enum kauth_network_req req;
int error;
if (inp->inp_flags & INP_LOWPORT) {
#ifndef IPNOPRIVPORTS
req = KAUTH_REQ_NETWORK_BIND_PRIVPORT;
#else
req = KAUTH_REQ_NETWORK_BIND_PORT;
#endif
lastport = &table->inpt_lastlow;
} else {
req = KAUTH_REQ_NETWORK_BIND_PORT;
lastport = &table->inpt_lastport;
}
/* XXX-kauth: KAUTH_REQ_NETWORK_BIND_AUTOASSIGN_{,PRIV}PORT */
error = kauth_authorize_network(cred, KAUTH_NETWORK_BIND, req, so, sin,
NULL);
if (error)
return EACCES;
/*
* Use RFC6056 randomized port selection
*/
error = portalgo_randport(&lport, inp, cred);
if (error)
return error;
inp->inp_flags |= INP_ANONPORT;
*lastport = lport;
lport = htons(lport);
inp->inp_lport = lport;
inpcb_set_state(inp, INP_BOUND);
return 0;
}
int
inpcb_bindableaddr(const struct inpcb *inp, struct sockaddr_in *sin,
kauth_cred_t cred)
{
int error = EADDRNOTAVAIL;
struct ifaddr *ifa = NULL;
int s;
if (sin->sin_family != AF_INET)
return EAFNOSUPPORT;
s = pserialize_read_enter();
if (IN_MULTICAST(sin->sin_addr.s_addr)) {
/* Always succeed; port reuse handled in inpcb_bind_port(). */
} else if (!in_nullhost(sin->sin_addr)) {
struct in_ifaddr *ia;
ia = in_get_ia(sin->sin_addr);
/* check for broadcast addresses */
if (ia == NULL) {
ifa = ifa_ifwithaddr(sintosa(sin));
if (ifa != NULL)
ia = ifatoia(ifa);
else if ((inp->inp_flags & INP_BINDANY) != 0) {
error = 0;
goto error;
}
}
if (ia == NULL)
goto error;
if (ia->ia4_flags & IN_IFF_DUPLICATED)
goto error;
}
error = 0;
error:
pserialize_read_exit(s);
return error;
}
static int
inpcb_bind_addr(struct inpcb *inp, struct sockaddr_in *sin, kauth_cred_t cred)
{
int error;
error = inpcb_bindableaddr(inp, sin, cred);
if (error == 0)
in4p_laddr(inp) = sin->sin_addr;
return error;
}
static int
inpcb_bind_port(struct inpcb *inp, struct sockaddr_in *sin, kauth_cred_t cred)
{
struct inpcbtable *table = inp->inp_table;
struct socket *so = inp->inp_socket;
int reuseport = (so->so_options & SO_REUSEPORT);
int wild = 0, error;
if (IN_MULTICAST(sin->sin_addr.s_addr)) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
* allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if (so->so_options & (SO_REUSEADDR | SO_REUSEPORT))
reuseport = SO_REUSEADDR|SO_REUSEPORT;
}
if (sin->sin_port == 0) {
error = inpcb_set_port(sin, inp, cred);
if (error)
return error;
} else {
struct inpcb *t;
vestigial_inpcb_t vestige;
#ifdef INET6
struct inpcb *t6;
struct in6_addr mapped;
#endif
enum kauth_network_req req;
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
wild = 1;
#ifndef IPNOPRIVPORTS
if (ntohs(sin->sin_port) < IPPORT_RESERVED)
req = KAUTH_REQ_NETWORK_BIND_PRIVPORT;
else
#endif /* !IPNOPRIVPORTS */
req = KAUTH_REQ_NETWORK_BIND_PORT;
error = kauth_authorize_network(cred, KAUTH_NETWORK_BIND, req,
so, sin, NULL);
if (error)
return EACCES;
#ifdef INET6
in6_in_2_v4mapin6(&sin->sin_addr, &mapped);
t6 = in6pcb_lookup_local(table, &mapped, sin->sin_port, wild, &vestige);
if (t6 && (reuseport & t6->inp_socket->so_options) == 0)
return EADDRINUSE;
if (!t6 && vestige.valid) {
if (!!reuseport != !!vestige.reuse_port) {
return EADDRINUSE;
}
}
#endif
/* XXX-kauth */
if (so->so_uidinfo->ui_uid && !IN_MULTICAST(sin->sin_addr.s_addr)) {
t = inpcb_lookup_local(table, sin->sin_addr, sin->sin_port, 1, &vestige);
/*
* XXX: investigate ramifications of loosening this
* restriction so that as long as both ports have
* SO_REUSEPORT allow the bind
*/
if (t &&
(!in_nullhost(sin->sin_addr) ||
!in_nullhost(in4p_laddr(t)) ||
(t->inp_socket->so_options & SO_REUSEPORT) == 0)
&& (so->so_uidinfo->ui_uid != t->inp_socket->so_uidinfo->ui_uid)) {
return EADDRINUSE;
}
if (!t && vestige.valid) {
if ((!in_nullhost(sin->sin_addr)
|| !in_nullhost(vestige.laddr.v4)
|| !vestige.reuse_port)
&& so->so_uidinfo->ui_uid != vestige.uid) {
return EADDRINUSE;
}
}
}
t = inpcb_lookup_local(table, sin->sin_addr, sin->sin_port, wild, &vestige);
if (t && (reuseport & t->inp_socket->so_options) == 0)
return EADDRINUSE;
if (!t
&& vestige.valid
&& !(reuseport && vestige.reuse_port))
return EADDRINUSE;
inp->inp_lport = sin->sin_port;
inpcb_set_state(inp, INP_BOUND);
}
LIST_REMOVE(inp, inp_lhash);
LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), inp,
inp_lhash);
return 0;
}
/*
* inpcb_bind: assign a local IP address and port number to the PCB.
*
* If the address is not a wildcard, verify that it corresponds to a
* local interface. If a port is specified and it is privileged, then
* check the permission. Check whether the address or port is in use,
* and if so, whether we can re-use them.
*/
int
inpcb_bind(void *v, struct sockaddr_in *sin, struct lwp *l)
{
struct inpcb *inp = v;
struct sockaddr_in lsin;
int error;
if (inp->inp_af != AF_INET)
return EINVAL;
if (inp->inp_lport || !in_nullhost(in4p_laddr(inp)))
return EINVAL;
if (NULL != sin) {
if (sin->sin_len != sizeof(*sin))
return EINVAL;
} else {
lsin = *((const struct sockaddr_in *)
inp->inp_socket->so_proto->pr_domain->dom_sa_any);
sin = &lsin;
}
/* Bind address. */
error = inpcb_bind_addr(inp, sin, l->l_cred);
if (error)
return error;
/* Bind port. */
error = inpcb_bind_port(inp, sin, l->l_cred);
if (error) {
in4p_laddr(inp).s_addr = INADDR_ANY;
return error;
}
return 0;
}
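/*
* Example caller (illustrative sketch, not part of this file): a
* protocol's bind entry point would typically pass its PCB, the
* user-supplied name and the requesting lwp straight through, e.g.
*
*     struct inpcb *inp = sotoinpcb(so);
*     error = inpcb_bind(inp, (struct sockaddr_in *)nam, l);
*
* where "so", "nam" and "l" stand for the socket, sockaddr and lwp
* arguments of a hypothetical usrreq handler.
*/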
/*
* inpcb_connect: connect from a socket to a specified address, i.e.,
* assign a foreign IP address and port number to the PCB.
*
* Both address and port must be specified in the name argument.
* If there is no local address for this socket yet, then pick one.
*/
int
inpcb_connect(void *v, struct sockaddr_in *sin, struct lwp *l)
{
struct inpcb *inp = v;
vestigial_inpcb_t vestige;
int error;
struct in_addr laddr;
if (inp->inp_af != AF_INET)
return EINVAL;
if (sin->sin_len != sizeof (*sin))
return EINVAL;
if (sin->sin_family != AF_INET)
return EAFNOSUPPORT;
if (sin->sin_port == 0)
return EADDRNOTAVAIL;
if (IN_MULTICAST(sin->sin_addr.s_addr) &&
inp->inp_socket->so_type == SOCK_STREAM)
return EADDRNOTAVAIL;
if (!IN_ADDRLIST_READER_EMPTY()) {
/*
* If the destination address is INADDR_ANY,
* use any local address (likely loopback).
* If the supplied address is INADDR_BROADCAST,
* use the broadcast address of an interface
* which supports broadcast. (loopback does not)
*/
if (in_nullhost(sin->sin_addr)) {
/* XXX racy */
sin->sin_addr =
IN_ADDRLIST_READER_FIRST()->ia_addr.sin_addr;
} else if (sin->sin_addr.s_addr == INADDR_BROADCAST) {
struct in_ifaddr *ia;
int s = pserialize_read_enter();
IN_ADDRLIST_READER_FOREACH(ia) {
if (ia->ia_ifp->if_flags & IFF_BROADCAST) {
sin->sin_addr =
ia->ia_broadaddr.sin_addr;
break;
}
}
pserialize_read_exit(s);
}
}
/*
* If we haven't bound which network number to use as ours,
* we will use the number of the outgoing interface.
* This depends on having done a routing lookup, which
* we will probably have to do anyway, so we might
* as well do it now. On the other hand if we are
* sending to multiple destinations we may have already
* done the lookup, so see if we can use the route
* from before. In any case, we only
* chose a port number once, even if sending to multiple
* destinations.
*/
if (in_nullhost(in4p_laddr(inp))) {
int xerror;
struct in_ifaddr *ia, *_ia;
int s;
struct psref psref;
int bound;
bound = curlwp_bind();
ia = in_selectsrc(sin, &inp->inp_route,
inp->inp_socket->so_options, inp->inp_moptions, &xerror,
&psref);
if (ia == NULL) {
curlwp_bindx(bound);
if (xerror == 0)
xerror = EADDRNOTAVAIL;
return xerror;
}
s = pserialize_read_enter();
_ia = in_get_ia(IA_SIN(ia)->sin_addr);
if (_ia == NULL && (inp->inp_flags & INP_BINDANY) == 0) {
pserialize_read_exit(s);
ia4_release(ia, &psref);
curlwp_bindx(bound);
return EADDRNOTAVAIL;
}
pserialize_read_exit(s);
laddr = IA_SIN(ia)->sin_addr;
ia4_release(ia, &psref);
curlwp_bindx(bound);
} else
laddr = in4p_laddr(inp);
if (inpcb_lookup(inp->inp_table, sin->sin_addr, sin->sin_port,
laddr, inp->inp_lport, &vestige) != NULL ||
vestige.valid) {
return EADDRINUSE;
}
if (in_nullhost(in4p_laddr(inp))) {
if (inp->inp_lport == 0) {
error = inpcb_bind(inp, NULL, l);
/*
* This used to ignore the return value
* completely, but we need to check for an
* ephemeral port shortage, and for attempts
* to request low ports by non-root users.
*/
if (error != 0)
return error;
}
in4p_laddr(inp) = laddr;
}
in4p_faddr(inp) = sin->sin_addr;
inp->inp_fport = sin->sin_port;
/* Late bind, if needed */
if (inp->inp_bindportonsend) {
struct sockaddr_in lsin = *((const struct sockaddr_in *)
inp->inp_socket->so_proto->pr_domain->dom_sa_any);
lsin.sin_addr = in4p_laddr(inp);
lsin.sin_port = 0;
if ((error = inpcb_bind_port(inp, &lsin, l->l_cred)) != 0)
return error;
}
inpcb_set_state(inp, INP_CONNECTED);
#if defined(IPSEC)
if (ipsec_enabled && inp->inp_socket->so_type == SOCK_STREAM)
ipsec_pcbconn(inp->inp_sp);
#endif
return 0;
}
/*
* inpcb_disconnect: remove any foreign IP/port association.
*
* Note: destroys the PCB if socket was closed.
*/
void
inpcb_disconnect(void *v)
{
struct inpcb *inp = v;
if (inp->inp_af != AF_INET)
return;
in4p_faddr(inp) = zeroin_addr;
inp->inp_fport = 0;
inpcb_set_state(inp, INP_BOUND);
#if defined(IPSEC)
if (ipsec_enabled)
ipsec_pcbdisconn(inp->inp_sp);
#endif
if (inp->inp_socket->so_state & SS_NOFDREF)
inpcb_destroy(inp);
}
/*
* inpcb_destroy: destroy PCB as well as the associated socket.
*/
void
inpcb_destroy(void *v)
{
struct inpcb *inp = v;
struct socket *so = inp->inp_socket;
int s;
KASSERT(inp->inp_af == AF_INET || inp->inp_af == AF_INET6);
#if defined(IPSEC)
if (ipsec_enabled)
ipsec_delete_pcbpolicy(inp);
#endif
so->so_pcb = NULL;
s = splsoftnet();
inpcb_set_state(inp, INP_ATTACHED);
LIST_REMOVE(inp, inp_lhash);
TAILQ_REMOVE(&inp->inp_table->inpt_queue, inp, inp_queue);
splx(s);
if (inp->inp_options) {
m_free(inp->inp_options);
}
rtcache_free(&inp->inp_route);
ip_freemoptions(inp->inp_moptions);
#ifdef INET6
if (inp->inp_af == AF_INET6) {
if (in6p_outputopts(inp) != NULL) {
ip6_clearpktopts(in6p_outputopts(inp), -1);
free(in6p_outputopts(inp), M_IP6OPT);
}
ip6_freemoptions(in6p_moptions(inp));
}
#endif
sofree(so); /* drops the socket's lock */
#ifdef INET6
if (inp->inp_af == AF_INET)
pool_cache_put(in4pcb_pool_cache, inp);
else
pool_cache_put(in6pcb_pool_cache, inp);
#else
KASSERT(inp->inp_af == AF_INET);
pool_cache_put(in4pcb_pool_cache, inp);
#endif
mutex_enter(softnet_lock); /* reacquire the softnet_lock */
}
/*
* inpcb_fetch_sockaddr: fetch the local IP address and port number.
*/
void
inpcb_fetch_sockaddr(struct inpcb *inp, struct sockaddr_in *sin)
{
if (inp->inp_af != AF_INET)
return;
sockaddr_in_init(sin, &in4p_laddr(inp), inp->inp_lport);
}
/*
* inpcb_fetch_peeraddr: fetch the foreign IP address and port number.
*/
void
inpcb_fetch_peeraddr(struct inpcb *inp, struct sockaddr_in *sin)
{
if (inp->inp_af != AF_INET)
return;
sockaddr_in_init(sin, &in4p_faddr(inp), inp->inp_fport);
}
/*
* inpcb_notify: pass some notification to all connections of a protocol
* associated with destination address. The local address and/or port
* numbers may be specified to limit the search. The "usual action" will
* be taken, depending on the command.
*
* The caller must filter any commands that are not interesting (e.g.,
* no error in the map). Call the protocol specific routine (if any) to
* report any errors for each matching socket.
*
* Must be called at splsoftnet.
*/
int
inpcb_notify(struct inpcbtable *table, struct in_addr faddr, u_int fport_arg,
struct in_addr laddr, u_int lport_arg, int errno,
void (*notify)(struct inpcb *, int))
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t fport = fport_arg, lport = lport_arg;
int nmatch;
if (in_nullhost(faddr) || notify == NULL)
return 0;
nmatch = 0;
head = INPCBHASH_CONNECT(table, faddr, fport, laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET)
continue;
if (in_hosteq(in4p_faddr(inp), faddr) &&
inp->inp_fport == fport &&
inp->inp_lport == lport &&
in_hosteq(in4p_laddr(inp), laddr)) {
(*notify)(inp, errno);
nmatch++;
}
}
return nmatch;
}
void
inpcb_notifyall(struct inpcbtable *table, struct in_addr faddr, int errno,
void (*notify)(struct inpcb *, int))
{
struct inpcb *inp;
if (in_nullhost(faddr) || notify == NULL)
return;
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET)
continue;
if (in_hosteq(in4p_faddr(inp), faddr))
(*notify)(inp, errno);
}
}
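/*
* Example caller (illustrative sketch, not part of this file): a
* datagram protocol's error handling might fan an ICMP error out to
* every matching PCB with a callback roughly like the following; the
* names "proto_notify" and "proto_pcbtable" are hypothetical.
*
*     static void
*     proto_notify(struct inpcb *inp, int errno)
*     {
*             inp->inp_socket->so_error = errno;
*             sorwakeup(inp->inp_socket);
*             sowwakeup(inp->inp_socket);
*     }
*
*     inpcb_notifyall(&proto_pcbtable, faddr, ECONNREFUSED, proto_notify);
*/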
void
in_purgeifmcast(struct ip_moptions *imo, struct ifnet *ifp)
{
int i, gap;
/* The owner of imo should be protected by solock */
KASSERT(ifp != NULL);
if (imo == NULL)
return;
/*
* Unselect the outgoing interface if it is being
* detached.
*/
if (imo->imo_multicast_if_index == ifp->if_index)
imo->imo_multicast_if_index = 0;
/*
* Drop multicast group membership if we joined
* through the interface being detached.
*/
for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) {
if (imo->imo_membership[i]->inm_ifp == ifp) {
in_delmulti(imo->imo_membership[i]);
gap++;
} else if (gap != 0)
imo->imo_membership[i - gap] = imo->imo_membership[i];
}
imo->imo_num_memberships -= gap;
}
void
inpcb_purgeif0(struct inpcbtable *table, struct ifnet *ifp)
{
struct inpcb *inp;
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
bool need_unlock = false;
if (inp->inp_af != AF_INET)
continue;
/* The caller holds either one of inps' lock */
if (!inp_locked(inp)) {
inp_lock(inp);
need_unlock = true;
}
in_purgeifmcast(inp->inp_moptions, ifp);
if (need_unlock)
inp_unlock(inp);
}
}
void
inpcb_purgeif(struct inpcbtable *table, struct ifnet *ifp)
{
struct rtentry *rt;
struct inpcb *inp;
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET)
continue;
if ((rt = rtcache_validate(&inp->inp_route)) != NULL &&
rt->rt_ifp == ifp) {
rtcache_unref(rt, &inp->inp_route);
inpcb_rtchange(inp, 0);
} else
rtcache_unref(rt, &inp->inp_route);
}
}
/*
* inpcb_losing: check for alternatives when higher level complains about
* service problems. For now, invalidate cached routing information.
* If the route was created dynamically (by a redirect), time to try a
* default gateway again.
*/
void
inpcb_losing(struct inpcb *inp)
{
struct rtentry *rt;
struct rt_addrinfo info;
if (inp->inp_af != AF_INET)
return;
if ((rt = rtcache_validate(&inp->inp_route)) == NULL)
return;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = rtcache_getdst(&inp->inp_route);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
if (rt->rt_flags & RTF_DYNAMIC) {
int error;
struct rtentry *nrt;
error = rtrequest(RTM_DELETE, rt_getkey(rt),
rt->rt_gateway, rt_mask(rt), rt->rt_flags, &nrt);
rtcache_unref(rt, &inp->inp_route);
if (error == 0) {
rt_newmsg_dynamic(RTM_DELETE, nrt);
rt_free(nrt);
}
} else
rtcache_unref(rt, &inp->inp_route);
/*
* A new route can be allocated
* the next time output is attempted.
*/
rtcache_free(&inp->inp_route);
}
/*
* inpcb_rtchange: after a routing change, flush old routing.
* A new route can be allocated the next time output is attempted.
*/
void
inpcb_rtchange(struct inpcb *inp, int errno)
{
if (inp->inp_af != AF_INET)
return;
rtcache_free(&inp->inp_route);
/* XXX SHOULD NOTIFY HIGHER-LEVEL PROTOCOLS */
}
/*
* inpcb_lookup_local: find a PCB by looking at the local port and matching
* the local address or resolving the wildcards. Primarily used to detect
* when the local address is already in use.
*/
struct inpcb *
inpcb_lookup_local(struct inpcbtable *table, struct in_addr laddr,
u_int lport_arg, int lookup_wildcard, vestigial_inpcb_t *vp)
{
struct inpcbhead *head;
struct inpcb *inp;
struct inpcb *match = NULL;
int matchwild = 3;
int wildcard;
in_port_t lport = lport_arg;
if (vp)
vp->valid = 0;
head = INPCBHASH_PORT(table, lport);
LIST_FOREACH(inp, head, inp_lhash) {
if (inp->inp_af != AF_INET)
continue;
if (inp->inp_lport != lport)
continue;
/*
* check if inp's faddr and laddr match with ours.
* our faddr is considered null.
* count the number of wildcard matches. (0 - 2)
*
* null null match
* A null wildcard match
* null B wildcard match
* A B non match
* A A match
*/
wildcard = 0;
if (!in_nullhost(in4p_faddr(inp)))
wildcard++;
if (in_nullhost(in4p_laddr(inp))) {
if (!in_nullhost(laddr))
wildcard++;
} else {
if (in_nullhost(laddr))
wildcard++;
else {
if (!in_hosteq(in4p_laddr(inp), laddr))
continue;
}
}
if (wildcard && !lookup_wildcard)
continue;
/*
* prefer an address with less wildcards.
*/
if (wildcard < matchwild) {
match = inp;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
if (match && matchwild == 0)
return match;
if (vp && table->vestige) {
void *state = (*table->vestige->init_ports4)(laddr, lport_arg, lookup_wildcard);
vestigial_inpcb_t better;
bool has_better = false;
while (table->vestige
&& (*table->vestige->next_port4)(state, vp)) {
if (vp->lport != lport)
continue;
wildcard = 0;
if (!in_nullhost(vp->faddr.v4))
wildcard++;
if (in_nullhost(vp->laddr.v4)) {
if (!in_nullhost(laddr))
wildcard++;
} else {
if (in_nullhost(laddr))
wildcard++;
else {
if (!in_hosteq(vp->laddr.v4, laddr))
continue;
}
}
if (wildcard && !lookup_wildcard)
continue;
if (wildcard < matchwild) {
better = *vp;
has_better = true;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
if (has_better) {
*vp = better;
return 0;
}
}
return match;
}
#ifdef DIAGNOSTIC
int inpcb_notifymiss = 0;
#endif
/*
* inpcb_lookup: perform a full 4-tuple PCB lookup.
*/
struct inpcb *
inpcb_lookup(struct inpcbtable *table,
struct in_addr faddr, u_int fport_arg,
struct in_addr laddr, u_int lport_arg,
vestigial_inpcb_t *vp)
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t fport = fport_arg, lport = lport_arg;
if (vp)
vp->valid = 0;
head = INPCBHASH_CONNECT(table, faddr, fport, laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET)
continue;
if (in_hosteq(in4p_faddr(inp), faddr) &&
inp->inp_fport == fport &&
inp->inp_lport == lport &&
in_hosteq(in4p_laddr(inp), laddr))
goto out;
}
if (vp && table->vestige) {
if ((*table->vestige->lookup4)(faddr, fport_arg,
laddr, lport_arg, vp))
return 0;
}
#ifdef DIAGNOSTIC
if (inpcb_notifymiss) {
printf("inpcb_lookup: faddr=%08x fport=%d laddr=%08x lport=%d\n",
ntohl(faddr.s_addr), ntohs(fport),
ntohl(laddr.s_addr), ntohs(lport));
}
#endif
return 0;
out:
/* Move this PCB to the head of hash chain. */
if (inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
return inp;
}
/*
* inpcb_lookup_bound: find a PCB by looking at the local address and port.
* Primarily used to find the listening (i.e., already bound) socket.
*/
struct inpcb *
inpcb_lookup_bound(struct inpcbtable *table,
struct in_addr laddr, u_int lport_arg)
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t lport = lport_arg;
head = INPCBHASH_BIND(table, laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET)
continue;
if (inp->inp_lport == lport &&
in_hosteq(in4p_laddr(inp), laddr))
goto out;
}
head = INPCBHASH_BIND(table, zeroin_addr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET)
continue;
if (inp->inp_lport == lport &&
in_hosteq(in4p_laddr(inp), zeroin_addr))
goto out;
}
#ifdef DIAGNOSTIC
if (inpcb_notifymiss) {
printf("inpcb_lookup_bound: laddr=%08x lport=%d\n",
ntohl(laddr.s_addr), ntohs(lport));
}
#endif
return 0;
out:
/* Move this PCB to the head of hash chain. */
if (inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
return inp;
}
void
inpcb_set_state(struct inpcb *inp, int state)
{
#ifdef INET6
if (inp->inp_af == AF_INET6) {
in6pcb_set_state(inp, state);
return;
}
#else
if (inp->inp_af != AF_INET)
return;
#endif
if (inp->inp_state > INP_ATTACHED)
LIST_REMOVE(inp, inp_hash);
switch (state) {
case INP_BOUND:
LIST_INSERT_HEAD(INPCBHASH_BIND(inp->inp_table,
in4p_laddr(inp), inp->inp_lport), inp,
inp_hash);
break;
case INP_CONNECTED:
LIST_INSERT_HEAD(INPCBHASH_CONNECT(inp->inp_table,
in4p_faddr(inp), inp->inp_fport,
in4p_laddr(inp), inp->inp_lport), inp,
inp_hash);
break;
}
inp->inp_state = state;
}
struct rtentry *
inpcb_rtentry(struct inpcb *inp)
{
struct route *ro;
union {
struct sockaddr dst;
struct sockaddr_in dst4;
} u;
#ifdef INET6
if (inp->inp_af == AF_INET6)
return in6pcb_rtentry(inp);
#endif
if (inp->inp_af != AF_INET)
return NULL;
ro = &inp->inp_route;
sockaddr_in_init(&u.dst4, &in4p_faddr(inp), 0);
return rtcache_lookup(ro, &u.dst);
}
void
inpcb_rtentry_unref(struct rtentry *rt, struct inpcb *inp)
{
rtcache_unref(rt, &inp->inp_route);
}
/* $NetBSD: exec_aout.c,v 1.41 2019/11/20 19:37:53 pgoyette Exp $ */
/*
* Copyright (c) 1993, 1994 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_aout.c,v 1.41 2019/11/20 19:37:53 pgoyette Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/exec.h>
#include <sys/exec_aout.h>
#include <sys/resourcevar.h>
#include <sys/module.h>
#include <uvm/uvm_extern.h>
MODULE(MODULE_CLASS_EXEC, exec_aout, NULL);
static struct execsw exec_aout_execsw = {
.es_hdrsz = sizeof(struct exec),
.es_makecmds = exec_aout_makecmds,
.u = {
.elf_probe_func = NULL,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_ANY,
.es_arglen = 0,
.es_copyargs = copyargs,
.es_setregs = NULL,
.es_coredump = coredump_netbsd,
.es_setup_stack = exec_setup_stack,
};
static int
exec_aout_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return exec_add(&exec_aout_execsw, 1);
case MODULE_CMD_FINI:
return exec_remove(&exec_aout_execsw, 1);
default:
return ENOTTY;
}
}
/*
* exec_aout_makecmds(): Check if it's an a.out-format executable.
*
* Given a lwp pointer and an exec package pointer, see if the referent
* of the epp is in a.out format. First check 'standard' magic numbers for
* this architecture. If that fails, try a CPU-dependent hook.
*
* This function, in the former case, or the hook, in the latter, is
* responsible for creating a set of vmcmds which can be used to build
* the process's vm space and inserting them into the exec package.
*/
int
exec_aout_makecmds(struct lwp *l, struct exec_package *epp)
{
u_long midmag, magic;
u_short mid;
int error;
struct exec *execp = epp->ep_hdr;
if (epp->ep_hdrvalid < sizeof(struct exec))
return ENOEXEC;
midmag = ntohl(execp->a_midmag);
mid = (midmag >> 16) & 0x3ff;
magic = midmag & 0xffff;
midmag = mid << 16 | magic;
switch (midmag) {
case (MID_MACHINE << 16) | ZMAGIC:
error = exec_aout_prep_zmagic(l, epp);
break;
case (MID_MACHINE << 16) | NMAGIC:
error = exec_aout_prep_nmagic(l, epp);
break;
case (MID_MACHINE << 16) | OMAGIC:
error = exec_aout_prep_omagic(l, epp);
break;
default:
error = cpu_exec_aout_makecmds(l, epp);
}
if (error)
kill_vmcmds(&epp->ep_vmcmds);
else
epp->ep_flags &= ~EXEC_TOPDOWN_VM;
return error;
}
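/*
* Worked example (illustrative): on i386, where MID_MACHINE is
* MID_I386 (134) and ZMAGIC is 0413 (0x010b), a native demand-paged
* binary has mid == 134 and magic == 0x010b after the shifts and
* masks above, so the reassembled midmag is 0x0086010b and the switch
* dispatches to exec_aout_prep_zmagic().
*/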
/*
* exec_aout_prep_zmagic(): Prepare a 'native' ZMAGIC binary's exec package
*
* First, set up the various offsets/lengths in the exec package.
*
* Then, mark the text image busy (so it can be demand paged) or error
* out if this is not possible. Finally, set up vmcmds for the
* text, data, bss, and stack segments.
*/
int
exec_aout_prep_zmagic(struct lwp *l, struct exec_package *epp)
{
struct exec *execp = epp->ep_hdr;
int error;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
error = vn_marktext(epp->ep_vp);
if (error)
return (error);
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, round_page(execp->a_text),
epp->ep_taddr, epp->ep_vp, 0, VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, round_page(execp->a_data),
epp->ep_daddr, epp->ep_vp, execp->a_text,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
if (execp->a_bss > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, execp->a_bss,
epp->ep_daddr + execp->a_data, NULLVP, 0,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* exec_aout_prep_nmagic(): Prepare a 'native' NMAGIC binary's exec package
*/
int
exec_aout_prep_nmagic(struct lwp *l, struct exec_package *epp)
{
struct exec *execp = epp->ep_hdr;
long bsize, baddr;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = roundup(epp->ep_taddr + execp->a_text, AOUT_LDPGSZ);
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, sizeof(struct exec),
VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_data,
epp->ep_daddr, epp->ep_vp, execp->a_text + sizeof(struct exec),
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = round_page(epp->ep_daddr + execp->a_data);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* exec_aout_prep_omagic(): Prepare a 'native' OMAGIC binary's exec package
*/
int
exec_aout_prep_omagic(struct lwp *l, struct exec_package *epp)
{
struct exec *execp = epp->ep_hdr;
long dsize, bsize, baddr;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
/* set up command for text and data segments */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn,
execp->a_text + execp->a_data, epp->ep_taddr, epp->ep_vp,
sizeof(struct exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = round_page(epp->ep_daddr + execp->a_data);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/*
* Make sure (# of pages) mapped above equals (vm_tsize + vm_dsize);
* obreak(2) relies on this fact. Both `vm_tsize' and `vm_dsize' are
* computed (in execve(2)) by rounding *up* `ep_tsize' and `ep_dsize'
* respectively to page boundaries.
* Compensate `ep_dsize' for the amount of data covered by the last
* text page.
*/
dsize = epp->ep_dsize + execp->a_text - round_page(execp->a_text);
epp->ep_dsize = (dsize > 0) ? dsize : 0;
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
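/*
* Worked example (illustrative): with 4 KB pages, a_text == 6000,
* a_data == 3000 and a_bss == 1000, ep_dsize starts out as 4000.
* round_page(6000) == 8192, so the last text page already covers
* 8192 - 6000 == 2192 bytes of data, and ep_dsize is trimmed to
* 4000 + 6000 - 8192 == 1808 so that the rounded-up text and data
* sizes add up to the number of pages actually mapped.
*/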
/* $NetBSD: uvm_page_array.c,v 1.9 2020/05/26 21:52:12 ad Exp $ */
/*-
* Copyright (c)2011 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_page_array.c,v 1.9 2020/05/26 21:52:12 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
#include <uvm/uvm_page.h>
#include <uvm/uvm_page_array.h>
/*
* uvm_page_array_init: initialize the array.
*/
void
uvm_page_array_init(struct uvm_page_array *ar, struct uvm_object *uobj,
unsigned int flags)
{
ar->ar_idx = 0;
ar->ar_npages = 0;
ar->ar_uobj = uobj;
ar->ar_flags = flags;
}
/*
* uvm_page_array_fini: clean up the array.
*/
void
uvm_page_array_fini(struct uvm_page_array *ar)
{
/*
* currently nothing to do.
*/
#if defined(DIAGNOSTIC)
/*
* poison to trigger assertion in uvm_page_array_peek to
* detect usage errors.
*/
ar->ar_npages = 1;
ar->ar_idx = 1000;
#endif /* defined(DIAGNOSTIC) */
}
/*
* uvm_page_array_clear: forget the cached pages and initialize the array.
*/
void
uvm_page_array_clear(struct uvm_page_array *ar)
{
KASSERT(ar->ar_idx <= ar->ar_npages);
ar->ar_idx = 0;
ar->ar_npages = 0;
}
/*
* uvm_page_array_peek: return the next cached page.
*/
struct vm_page *
uvm_page_array_peek(struct uvm_page_array *ar)
{
KASSERT(ar->ar_idx <= ar->ar_npages);
if (ar->ar_idx == ar->ar_npages) {
return NULL;
}
return ar->ar_pages[ar->ar_idx];
}
/*
* uvm_page_array_advance: advance the array to the next cached page
*/
void
uvm_page_array_advance(struct uvm_page_array *ar)
{
KASSERT(ar->ar_idx <= ar->ar_npages);
ar->ar_idx++;
KASSERT(ar->ar_idx <= ar->ar_npages);
}
/*
* uvm_page_array_fill: lookup pages and keep them cached.
*
* return 0 on success. in that case, the results are cached in the array
* so that they will be picked up by later uvm_page_array_peek calls.
*
* nwant is the number of pages to fetch. a caller should treat it as a hint;
* nwant == 0 means the caller has no specific preference.
*
* return ENOENT if no pages are found.
*
* called with object lock held.
*/
int
uvm_page_array_fill(struct uvm_page_array *ar, voff_t off, unsigned int nwant)
{
unsigned int npages;
#if defined(DEBUG)
unsigned int i;
#endif /* defined(DEBUG) */
unsigned int maxpages = __arraycount(ar->ar_pages);
struct uvm_object *uobj = ar->ar_uobj;
const int flags = ar->ar_flags;
const bool dense = (flags & UVM_PAGE_ARRAY_FILL_DENSE) != 0;
const bool backward = (flags & UVM_PAGE_ARRAY_FILL_BACKWARD) != 0;
int error = 0;
if (nwant != 0 && nwant < maxpages) {
maxpages = nwant;
}
#if 0 /* called from DDB for "show obj/f" without lock */
KASSERT(rw_lock_held(uobj->vmobjlock));
#endif
KASSERT(uvm_page_array_peek(ar) == NULL);
if ((flags & UVM_PAGE_ARRAY_FILL_DIRTY) != 0) {
unsigned int tagmask = UVM_PAGE_DIRTY_TAG;
if ((flags & UVM_PAGE_ARRAY_FILL_WRITEBACK) != 0) {
tagmask |= UVM_PAGE_WRITEBACK_TAG;
}
npages =
(backward ? radix_tree_gang_lookup_tagged_node_reverse :
radix_tree_gang_lookup_tagged_node)(
&uobj->uo_pages, off >> PAGE_SHIFT, (void **)ar->ar_pages,
maxpages, dense, tagmask);
} else {
npages =
(backward ? radix_tree_gang_lookup_node_reverse :
radix_tree_gang_lookup_node)(
&uobj->uo_pages, off >> PAGE_SHIFT, (void **)ar->ar_pages,
maxpages, dense);
}
if (npages == 0) {
if (flags != 0) {
/*
* if dense or looking for tagged entries (or
* working backwards), fail right away.
*/
npages = 0;
} else {
/*
* there's nothing else to be found with the current
* set of arguments, in the current version of the
* tree.
*
* minimize repeated tree lookups by "finding" a
* null pointer, in case the caller keeps looping (a
* common use case).
*/
npages = 1;
ar->ar_pages[0] = NULL;
}
error = ENOENT;
}
KASSERT(npages <= maxpages);
ar->ar_npages = npages;
ar->ar_idx = 0;
#if defined(DEBUG)
for (i = 0; error == 0 && i < ar->ar_npages; i++) {
struct vm_page * const pg = ar->ar_pages[i];
KASSERT(pg != NULL);
KDASSERT(pg->uobject == uobj);
if (backward) {
KDASSERT(pg->offset <= off);
KDASSERT(i == 0 ||
pg->offset < ar->ar_pages[i - 1]->offset);
} else {
KDASSERT(pg->offset >= off);
KDASSERT(i == 0 ||
pg->offset > ar->ar_pages[i - 1]->offset);
}
}
#endif /* defined(DEBUG) */
return error;
}
/*
* uvm_page_array_fill_and_peek:
* same as uvm_page_array_peek except that, if the array is empty, try to fill
* it first.
*/
struct vm_page *
uvm_page_array_fill_and_peek(struct uvm_page_array *ar, voff_t off,
unsigned int nwant)
{
int error;
if (ar->ar_idx != ar->ar_npages) {
return ar->ar_pages[ar->ar_idx];
}
error = uvm_page_array_fill(ar, off, nwant);
if (error != 0) {
return NULL;
}
return uvm_page_array_peek(ar);
}
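/*
 * Example (illustrative sketch only, not part of the original file): a
 * typical consumer walks an object's pages with the API above, calling
 * uvm_page_array_fill_and_peek() and uvm_page_array_advance() in a loop.
 * This assumes the caller holds the object lock across the lookups;
 * "process_page" is a hypothetical stand-in for the caller's per-page work.
 *
 *	struct uvm_page_array a;
 *	struct vm_page *pg;
 *	voff_t off = startoff;
 *
 *	uvm_page_array_init(&a, uobj, 0);
 *	while ((pg = uvm_page_array_fill_and_peek(&a, off, 0)) != NULL) {
 *		process_page(pg);
 *		off = pg->offset + PAGE_SIZE;
 *		uvm_page_array_advance(&a);
 *	}
 *	uvm_page_array_fini(&a);
 */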
/* $NetBSD: subr_pool.c,v 1.290 2023/04/09 12:21:59 riastradh Exp $ */
/*
* Copyright (c) 1997, 1999, 2000, 2002, 2007, 2008, 2010, 2014, 2015, 2018,
* 2020, 2021 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
* Simulation Facility, NASA Ames Research Center; by Andrew Doran, and by
* Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_pool.c,v 1.290 2023/04/09 12:21:59 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_lockdebug.h"
#include "opt_pool.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/bitops.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/vmem.h>
#include <sys/pool.h>
#include <sys/syslog.h>
#include <sys/debug.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/xcall.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/asan.h>
#include <sys/msan.h>
#include <sys/fault.h>
#include <uvm/uvm_extern.h>
/*
* Pool resource management utility.
*
* Memory is allocated in pages which are split into pieces according to
* the pool item size. Each page is kept on one of three lists in the
* pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
* for empty, full and partially-full pages respectively. The individual
* pool items are on a linked list headed by `ph_itemlist' in each page
* header. The memory for building the page list is either taken from
* the allocated pages themselves (for small pool items) or taken from
* an internal pool of page headers (`phpool').
*/
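/*
 * Example (illustrative sketch only): how a consumer of this facility
 * typically uses the pool(9) API implemented below. The pool name
 * "examplepl" and struct example are hypothetical.
 *
 *	static struct pool example_pool;
 *
 *	pool_init(&example_pool, sizeof(struct example), 0, 0, 0,
 *	    "examplepl", NULL, IPL_NONE);
 *
 *	struct example *e = pool_get(&example_pool, PR_WAITOK);
 *	...
 *	pool_put(&example_pool, e);
 *
 *	pool_destroy(&example_pool);
 */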
/* List of all pools. Non static as needed by 'vmstat -m' */
TAILQ_HEAD(, pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);
/* Private pool for page header structures */
#define PHPOOL_MAX 8
static struct pool phpool[PHPOOL_MAX];
#define PHPOOL_FREELIST_NELEM(idx) \
(((idx) == 0) ? BITMAP_MIN_SIZE : BITMAP_SIZE * (1 << (idx)))
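/*
 * Worked example (illustrative, assuming LP64, where BITMAP_MIN_SIZE below
 * works out to 64 and BITMAP_SIZE to 32): PHPOOL_FREELIST_NELEM yields
 * 64, 64, 128, 256, 512, 1024, 2048 and 4096 items for indices 0 through 7,
 * i.e. each phpool past index 1 doubles the number of free-list bits its
 * page headers can track.
 */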
#if !defined(KMSAN) && (defined(DIAGNOSTIC) || defined(KASAN))
#define POOL_REDZONE
#endif
#if defined(POOL_QUARANTINE)
#define POOL_NOCACHE
#endif
#ifdef POOL_REDZONE
# ifdef KASAN
# define POOL_REDZONE_SIZE 8
# else
# define POOL_REDZONE_SIZE 2
# endif
static void pool_redzone_init(struct pool *, size_t);
static void pool_redzone_fill(struct pool *, void *);
static void pool_redzone_check(struct pool *, void *);
static void pool_cache_redzone_check(pool_cache_t, void *);
#else
# define pool_redzone_init(pp, sz) __nothing
# define pool_redzone_fill(pp, ptr) __nothing
# define pool_redzone_check(pp, ptr) __nothing
# define pool_cache_redzone_check(pc, ptr) __nothing
#endif
#ifdef KMSAN
static inline void pool_get_kmsan(struct pool *, void *);
static inline void pool_put_kmsan(struct pool *, void *);
static inline void pool_cache_get_kmsan(pool_cache_t, void *);
static inline void pool_cache_put_kmsan(pool_cache_t, void *);
#else
#define pool_get_kmsan(pp, ptr) __nothing
#define pool_put_kmsan(pp, ptr) __nothing
#define pool_cache_get_kmsan(pc, ptr) __nothing
#define pool_cache_put_kmsan(pc, ptr) __nothing
#endif
#ifdef POOL_QUARANTINE
static void pool_quarantine_init(struct pool *);
static void pool_quarantine_flush(struct pool *);
static bool pool_put_quarantine(struct pool *, void *,
struct pool_pagelist *);
#else
#define pool_quarantine_init(a) __nothing
#define pool_quarantine_flush(a) __nothing
#define pool_put_quarantine(a, b, c) false
#endif
#ifdef POOL_NOCACHE
static bool pool_cache_put_nocache(pool_cache_t, void *);
#else
#define pool_cache_put_nocache(a, b) false
#endif
#define NO_CTOR __FPTRCAST(int (*)(void *, void *, int), nullop)
#define NO_DTOR __FPTRCAST(void (*)(void *, void *), nullop)
#define pc_has_pser(pc) (((pc)->pc_roflags & PR_PSERIALIZE) != 0)
#define pc_has_ctor(pc) ((pc)->pc_ctor != NO_CTOR)
#define pc_has_dtor(pc) ((pc)->pc_dtor != NO_DTOR)
#define pp_has_pser(pp) (((pp)->pr_roflags & PR_PSERIALIZE) != 0)
#define pool_barrier() xc_barrier(0)
/*
* Pool backend allocators.
*
* Each pool has a backend allocator that handles allocation, deallocation,
* and any additional draining that might be needed.
*
* We provide two standard allocators:
*
* pool_allocator_kmem - the default when no allocator is specified
*
* pool_allocator_nointr - used for pools that will not be accessed
* in interrupt context.
*/
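/*
 * Example (illustrative sketch only): a pool may also supply its own
 * backend allocator by filling in a struct pool_allocator and passing it
 * to pool_init(). The callbacks "example_page_alloc"/"example_page_free"
 * are hypothetical; pa_pagesz == 0 means pool_init() falls back to
 * PAGE_SIZE.
 *
 *	static void *example_page_alloc(struct pool *, int);
 *	static void example_page_free(struct pool *, void *);
 *
 *	static struct pool_allocator example_allocator = {
 *		.pa_alloc = example_page_alloc,
 *		.pa_free = example_page_free,
 *		.pa_pagesz = 0,
 *	};
 *
 *	pool_init(&example_pool, sizeof(struct example), 0, 0, 0,
 *	    "examplepl", &example_allocator, IPL_VM);
 */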
void *pool_page_alloc(struct pool *, int);
void pool_page_free(struct pool *, void *);
static void *pool_page_alloc_meta(struct pool *, int);
static void pool_page_free_meta(struct pool *, void *);
struct pool_allocator pool_allocator_kmem = {
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 0
};
struct pool_allocator pool_allocator_nointr = {
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 0
};
struct pool_allocator pool_allocator_meta = {
.pa_alloc = pool_page_alloc_meta,
.pa_free = pool_page_free_meta,
.pa_pagesz = 0
};
#define POOL_ALLOCATOR_BIG_BASE 13
static struct pool_allocator pool_allocator_big[] = {
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 0),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 1),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 2),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 3),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 4),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 5),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 6),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 7),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 8),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 9),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 10),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 11),
}
};
static int pool_bigidx(size_t);
/* # of seconds to retain page after last use */
int pool_inactive_time = 10;
/* Next candidate for drainage (see pool_drain()) */
static struct pool *drainpp;
/* This lock protects both pool_head and drainpp. */
static kmutex_t pool_head_lock;
static kcondvar_t pool_busy;
/* This lock protects initialization of a potentially shared pool allocator */
static kmutex_t pool_allocator_lock;
static unsigned int poolid_counter = 0;
typedef uint32_t pool_item_bitmap_t;
#define BITMAP_SIZE (CHAR_BIT * sizeof(pool_item_bitmap_t))
#define BITMAP_MASK (BITMAP_SIZE - 1)
#define BITMAP_MIN_SIZE (CHAR_BIT * sizeof(((struct pool_item_header *)NULL)->ph_u2))
struct pool_item_header {
/* Page headers */
LIST_ENTRY(pool_item_header)
ph_pagelist; /* pool page list */
union {
/* !PR_PHINPAGE */
struct {
SPLAY_ENTRY(pool_item_header)
phu_node; /* off-page page headers */
} phu_offpage;
/* PR_PHINPAGE */
struct {
unsigned int phu_poolid;
} phu_onpage;
} ph_u1;
void * ph_page; /* this page's address */
uint32_t ph_time; /* last referenced */
uint16_t ph_nmissing; /* # of chunks in use */
uint16_t ph_off; /* start offset in page */
union {
/* !PR_USEBMAP */
struct {
LIST_HEAD(, pool_item)
phu_itemlist; /* chunk list for this page */
} phu_normal;
/* PR_USEBMAP */
struct {
pool_item_bitmap_t phu_bitmap[1];
} phu_notouch;
} ph_u2;
};
#define ph_node ph_u1.phu_offpage.phu_node
#define ph_poolid ph_u1.phu_onpage.phu_poolid
#define ph_itemlist ph_u2.phu_normal.phu_itemlist
#define ph_bitmap ph_u2.phu_notouch.phu_bitmap
#define PHSIZE ALIGN(sizeof(struct pool_item_header))
CTASSERT(offsetof(struct pool_item_header, ph_u2) +
BITMAP_MIN_SIZE / CHAR_BIT == sizeof(struct pool_item_header));
#if defined(DIAGNOSTIC) && !defined(KASAN)
#define POOL_CHECK_MAGIC
#endif
struct pool_item {
#ifdef POOL_CHECK_MAGIC
u_int pi_magic;
#endif
#define PI_MAGIC 0xdeaddeadU
/* Other entries use only this list entry */
LIST_ENTRY(pool_item) pi_list;
};
#define POOL_NEEDS_CATCHUP(pp) \
((pp)->pr_nitems < (pp)->pr_minitems || \
(pp)->pr_npages < (pp)->pr_minpages)
#define POOL_OBJ_TO_PAGE(pp, v) \
(void *)((uintptr_t)v & pp->pr_alloc->pa_pagemask)
/*
* Pool cache management.
*
* Pool caches provide a way for constructed objects to be cached by the
* pool subsystem. This can lead to performance improvements by avoiding
* needless object construction/destruction; it is deferred until absolutely
* necessary.
*
* Caches are grouped into cache groups. Each cache group references up
* to PCG_NUMOBJECTS constructed objects. When a cache allocates an
* object from the pool, it calls the object's constructor and places it
* into a cache group. When a cache group frees an object back to the
* pool, it first calls the object's destructor. This allows the object
* to persist in constructed form while freed to the cache.
*
* The pool references each cache, so that when a pool is drained by the
* pagedaemon, it can drain each individual cache as well. Each time a
* cache is drained, the most idle cache group is freed to the pool in
* its entirety.
*
* Pool caches are laid on top of pools. By layering them, we can avoid
* the complexity of cache management for pools which would not benefit
* from it.
*/
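/*
 * Example (illustrative sketch only): consumers use the pool_cache(9)
 * interface implemented below roughly as follows; the cache name, the
 * struct and the constructor/destructor are hypothetical.
 *
 *	static pool_cache_t example_cache;
 *
 *	example_cache = pool_cache_init(sizeof(struct example),
 *	    coherency_unit, 0, 0, "examplecache", NULL, IPL_NONE,
 *	    example_ctor, example_dtor, NULL);
 *
 *	struct example *e = pool_cache_get(example_cache, PR_WAITOK);
 *	...
 *	pool_cache_put(example_cache, e);
 *
 *	pool_cache_destroy(example_cache);
 */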
static struct pool pcg_normal_pool;
static struct pool pcg_large_pool;
static struct pool cache_pool;
static struct pool cache_cpu_pool;
static pcg_t *volatile pcg_large_cache __cacheline_aligned;
static pcg_t *volatile pcg_normal_cache __cacheline_aligned;
/* List of all caches. */
TAILQ_HEAD(,pool_cache) pool_cache_head =
TAILQ_HEAD_INITIALIZER(pool_cache_head);
int pool_cache_disable; /* global disable for caching */
static const pcg_t pcg_dummy; /* zero sized: always empty, yet always full */
static bool pool_cache_put_slow(pool_cache_t, pool_cache_cpu_t *, int,
void *);
static bool pool_cache_get_slow(pool_cache_t, pool_cache_cpu_t *, int,
void **, paddr_t *, int);
static void pool_cache_cpu_init1(struct cpu_info *, pool_cache_t);
static int pool_cache_invalidate_groups(pool_cache_t, pcg_t *);
static void pool_cache_invalidate_cpu(pool_cache_t, u_int);
static void pool_cache_transfer(pool_cache_t);
static int pool_pcg_get(pcg_t *volatile *, pcg_t **);
static int pool_pcg_put(pcg_t *volatile *, pcg_t *);
static pcg_t * pool_pcg_trunc(pcg_t *volatile *);
static int pool_catchup(struct pool *);
static void pool_prime_page(struct pool *, void *,
struct pool_item_header *);
static void pool_update_curpage(struct pool *);
static int pool_grow(struct pool *, int);
static void *pool_allocator_alloc(struct pool *, int);
static void pool_allocator_free(struct pool *, void *);
static void pool_print_pagelist(struct pool *, struct pool_pagelist *,
void (*)(const char *, ...) __printflike(1, 2));
static void pool_print1(struct pool *, const char *,
void (*)(const char *, ...) __printflike(1, 2));
static int pool_chk_page(struct pool *, const char *,
struct pool_item_header *);
/* -------------------------------------------------------------------------- */
static inline unsigned int
pr_item_bitmap_index(const struct pool *pp, const struct pool_item_header *ph,
const void *v)
{
const char *cp = v;
unsigned int idx;
KASSERT(pp->pr_roflags & PR_USEBMAP);
idx = (cp - (char *)ph->ph_page - ph->ph_off) / pp->pr_size;
if (__predict_false(idx >= pp->pr_itemsperpage)) {
panic("%s: [%s] %u >= %u", __func__, pp->pr_wchan, idx,
pp->pr_itemsperpage);
}
return idx;
}
static inline void
pr_item_bitmap_put(const struct pool *pp, struct pool_item_header *ph,
void *obj)
{
unsigned int idx = pr_item_bitmap_index(pp, ph, obj);
pool_item_bitmap_t *bitmap = ph->ph_bitmap + (idx / BITMAP_SIZE);
pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK);
if (__predict_false((*bitmap & mask) != 0)) {
panic("%s: [%s] %p already freed", __func__, pp->pr_wchan, obj);
}
*bitmap |= mask;
}
static inline void *
pr_item_bitmap_get(const struct pool *pp, struct pool_item_header *ph)
{
pool_item_bitmap_t *bitmap = ph->ph_bitmap;
unsigned int idx;
int i;
for (i = 0; ; i++) {
int bit;
KASSERT((i * BITMAP_SIZE) < pp->pr_itemsperpage);
bit = ffs32(bitmap[i]);
if (bit) {
pool_item_bitmap_t mask;
bit--;
idx = (i * BITMAP_SIZE) + bit;
mask = 1U << bit;
KASSERT((bitmap[i] & mask) != 0);
bitmap[i] &= ~mask;
break;
}
}
KASSERT(idx < pp->pr_itemsperpage);
return (char *)ph->ph_page + ph->ph_off + idx * pp->pr_size;
}
static inline void
pr_item_bitmap_init(const struct pool *pp, struct pool_item_header *ph)
{
pool_item_bitmap_t *bitmap = ph->ph_bitmap;
const int n = howmany(pp->pr_itemsperpage, BITMAP_SIZE);
int i;
for (i = 0; i < n; i++) {
bitmap[i] = (pool_item_bitmap_t)-1;
}
}
/* -------------------------------------------------------------------------- */
static inline void
pr_item_linkedlist_put(const struct pool *pp, struct pool_item_header *ph,
void *obj)
{
struct pool_item *pi = obj;
KASSERT(!pp_has_pser(pp));
#ifdef POOL_CHECK_MAGIC
pi->pi_magic = PI_MAGIC;
#endif
if (pp->pr_redzone) {
/*
* Mark the pool_item as valid. The rest is already
* invalid.
*/
kasan_mark(pi, sizeof(*pi), sizeof(*pi), 0);
}
LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
}
static inline void *
pr_item_linkedlist_get(struct pool *pp, struct pool_item_header *ph)
{
struct pool_item *pi;
void *v;
v = pi = LIST_FIRST(&ph->ph_itemlist);
if (__predict_false(v == NULL)) {
mutex_exit(&pp->pr_lock);
panic("%s: [%s] page empty", __func__, pp->pr_wchan);
}
KASSERTMSG((pp->pr_nitems > 0),
"%s: [%s] nitems %u inconsistent on itemlist",
__func__, pp->pr_wchan, pp->pr_nitems);
#ifdef POOL_CHECK_MAGIC
KASSERTMSG((pi->pi_magic == PI_MAGIC),
"%s: [%s] free list modified: "
"magic=%x; page %p; item addr %p", __func__,
pp->pr_wchan, pi->pi_magic, ph->ph_page, pi);
#endif
/*
* Remove from item list.
*/
LIST_REMOVE(pi, pi_list);
return v;
}
/* -------------------------------------------------------------------------- */
static inline void
pr_phinpage_check(struct pool *pp, struct pool_item_header *ph, void *page,
void *object)
{
if (__predict_false((void *)ph->ph_page != page)) {
panic("%s: [%s] item %p not part of pool", __func__,
pp->pr_wchan, object);
}
if (__predict_false((char *)object < (char *)page + ph->ph_off)) {
panic("%s: [%s] item %p below item space", __func__,
pp->pr_wchan, object);
}
if (__predict_false(ph->ph_poolid != pp->pr_poolid)) {
panic("%s: [%s] item %p poolid %u != %u", __func__,
pp->pr_wchan, object, ph->ph_poolid, pp->pr_poolid);
}
}
static inline void
pc_phinpage_check(pool_cache_t pc, void *object)
{
struct pool_item_header *ph;
struct pool *pp;
void *page;
pp = &pc->pc_pool;
page = POOL_OBJ_TO_PAGE(pp, object);
ph = (struct pool_item_header *)page;
pr_phinpage_check(pp, ph, page, object);
}
/* -------------------------------------------------------------------------- */
static inline int
phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
{
/*
* We consider a pool_item_header with a smaller ph_page to be bigger. This
* unnatural ordering is for the benefit of pr_find_pagehead.
*/
if (a->ph_page < b->ph_page)
return 1;
else if (a->ph_page > b->ph_page)
return -1;
else
return 0;
}
SPLAY_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
SPLAY_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);
static inline struct pool_item_header *
pr_find_pagehead_noalign(struct pool *pp, void *v)
{
struct pool_item_header *ph, tmp;
tmp.ph_page = (void *)(uintptr_t)v;
ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
if (ph == NULL) {
ph = SPLAY_ROOT(&pp->pr_phtree);
if (ph != NULL && phtree_compare(&tmp, ph) >= 0) {
ph = SPLAY_NEXT(phtree, &pp->pr_phtree, ph);
}
KASSERT(ph == NULL || phtree_compare(&tmp, ph) < 0);
}
return ph;
}
/*
* Return the pool page header based on item address.
*/
static inline struct pool_item_header *
pr_find_pagehead(struct pool *pp, void *v)
{
struct pool_item_header *ph, tmp;
if ((pp->pr_roflags & PR_NOALIGN) != 0) {
ph = pr_find_pagehead_noalign(pp, v);
} else {
void *page = POOL_OBJ_TO_PAGE(pp, v);
if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
ph = (struct pool_item_header *)page;
pr_phinpage_check(pp, ph, page, v);
} else {
tmp.ph_page = page;
ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
}
}
KASSERT(ph == NULL || ((pp->pr_roflags & PR_PHINPAGE) != 0) ||
((char *)ph->ph_page <= (char *)v &&
(char *)v < (char *)ph->ph_page + pp->pr_alloc->pa_pagesz));
return ph;
}
static void
pr_pagelist_free(struct pool *pp, struct pool_pagelist *pq)
{
struct pool_item_header *ph;
while ((ph = LIST_FIRST(pq)) != NULL) {
LIST_REMOVE(ph, ph_pagelist);
pool_allocator_free(pp, ph->ph_page);
if ((pp->pr_roflags & PR_PHINPAGE) == 0)
pool_put(pp->pr_phpool, ph);
}
}
/*
* Remove a page from the pool.
*/
static inline void
pr_rmpage(struct pool *pp, struct pool_item_header *ph,
struct pool_pagelist *pq)
{
KASSERT(mutex_owned(&pp->pr_lock));
/*
* If the page was idle, decrement the idle page count.
*/
if (ph->ph_nmissing == 0) {
KASSERT(pp->pr_nidle != 0);
KASSERTMSG((pp->pr_nitems >= pp->pr_itemsperpage),
"%s: [%s] nitems=%u < itemsperpage=%u", __func__,
pp->pr_wchan, pp->pr_nitems, pp->pr_itemsperpage);
pp->pr_nidle--;
}
pp->pr_nitems -= pp->pr_itemsperpage;
/*
* Unlink the page from the pool and queue it for release.
*/
LIST_REMOVE(ph, ph_pagelist);
if (pp->pr_roflags & PR_PHINPAGE) {
if (__predict_false(ph->ph_poolid != pp->pr_poolid)) {
panic("%s: [%s] ph %p poolid %u != %u",
__func__, pp->pr_wchan, ph, ph->ph_poolid,
pp->pr_poolid);
}
} else {
SPLAY_REMOVE(phtree, &pp->pr_phtree, ph);
}
LIST_INSERT_HEAD(pq, ph, ph_pagelist);
pp->pr_npages--;
pp->pr_npagefree++;
pool_update_curpage(pp);
}
/*
* Initialize all the pools listed in the "pools" link set.
*/
void
pool_subsystem_init(void)
{
size_t size;
int idx;
mutex_init(&pool_head_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&pool_allocator_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&pool_busy, "poolbusy");
/*
* Initialize private page header pool and cache magazine pool if we
* haven't done so yet.
*/
for (idx = 0; idx < PHPOOL_MAX; idx++) {
static char phpool_names[PHPOOL_MAX][6+1+6+1];
int nelem;
size_t sz;
nelem = PHPOOL_FREELIST_NELEM(idx);
KASSERT(nelem != 0);
snprintf(phpool_names[idx], sizeof(phpool_names[idx]),
"phpool-%d", nelem);
sz = offsetof(struct pool_item_header,
ph_bitmap[howmany(nelem, BITMAP_SIZE)]);
pool_init(&phpool[idx], sz, 0, 0, 0,
phpool_names[idx], &pool_allocator_meta, IPL_VM);
}
size = sizeof(pcg_t) +
(PCG_NOBJECTS_NORMAL - 1) * sizeof(pcgpair_t);
pool_init(&pcg_normal_pool, size, coherency_unit, 0, 0,
"pcgnormal", &pool_allocator_meta, IPL_VM);
size = sizeof(pcg_t) +
(PCG_NOBJECTS_LARGE - 1) * sizeof(pcgpair_t);
pool_init(&pcg_large_pool, size, coherency_unit, 0, 0,
"pcglarge", &pool_allocator_meta, IPL_VM);
pool_init(&cache_pool, sizeof(struct pool_cache), coherency_unit,
0, 0, "pcache", &pool_allocator_meta, IPL_NONE);
pool_init(&cache_cpu_pool, sizeof(pool_cache_cpu_t), coherency_unit,
0, 0, "pcachecpu", &pool_allocator_meta, IPL_NONE);
}
static inline bool
pool_init_is_phinpage(const struct pool *pp)
{
size_t pagesize;
if (pp->pr_roflags & PR_PHINPAGE) {
return true;
}
if (pp->pr_roflags & (PR_NOTOUCH | PR_NOALIGN)) {
return false;
}
pagesize = pp->pr_alloc->pa_pagesz;
/*
* Threshold: the item size is below 1/16 of a page size, and below
* 8 times the page header size. The latter ensures we go off-page
* if the page header would make us waste a rather big item.
*/
if (pp->pr_size < MIN(pagesize / 16, PHSIZE * 8)) {
return true;
}
/* Put the header into the page if it doesn't waste any items. */
if (pagesize / pp->pr_size == (pagesize - PHSIZE) / pp->pr_size) {
return true;
}
return false;
}
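/*
 * Worked example for the threshold above (illustrative, assuming a 4 kB
 * page and an LP64 pool_item_header of roughly 56 bytes, so PHSIZE * 8 is
 * about 448): MIN(4096 / 16, 448) == 256, so items smaller than 256 bytes
 * keep their page header on-page, while larger items fall through to the
 * exact "does the header cost us an item?" check at the end of the
 * function.
 */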
static inline bool
pool_init_is_usebmap(const struct pool *pp)
{
size_t bmapsize;
if (pp->pr_roflags & PR_NOTOUCH) {
return true;
}
/*
* If we're off-page, go with a bitmap.
*/
if (!(pp->pr_roflags & PR_PHINPAGE)) {
return true;
}
/*
* If we're on-page, and the page header can already contain a bitmap
* big enough to cover all the items of the page, go with a bitmap.
*/
bmapsize = roundup(PHSIZE, pp->pr_align) -
offsetof(struct pool_item_header, ph_bitmap[0]);
KASSERT(bmapsize % sizeof(pool_item_bitmap_t) == 0);
if (pp->pr_itemsperpage <= bmapsize * CHAR_BIT) {
return true;
}
return false;
}
/*
* Initialize the given pool resource structure.
*
* We export this routine to allow other kernel parts to declare
* static pools that must be initialized before kmem(9) is available.
*/
void
pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
const char *wchan, struct pool_allocator *palloc, int ipl)
{
struct pool *pp1;
size_t prsize;
int itemspace, slack;
/* XXX ioff will be removed. */
KASSERT(ioff == 0);
#ifdef DEBUG
if (__predict_true(!cold))
mutex_enter(&pool_head_lock);
/*
* Check that the pool hasn't already been initialised and
* added to the list of all pools.
*/
TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
if (pp == pp1)
panic("%s: [%s] already initialised", __func__,
wchan);
}
if (__predict_true(!cold))
mutex_exit(&pool_head_lock);
#endif
if (palloc == NULL)
palloc = &pool_allocator_kmem;
if (!cold)
mutex_enter(&pool_allocator_lock);
if (palloc->pa_refcnt++ == 0) {
if (palloc->pa_pagesz == 0)
palloc->pa_pagesz = PAGE_SIZE;
TAILQ_INIT(&palloc->pa_list);
mutex_init(&palloc->pa_lock, MUTEX_DEFAULT, IPL_VM);
palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
}
if (!cold)
mutex_exit(&pool_allocator_lock);
/*
* PR_PSERIALIZE implies PR_NOTOUCH; freed objects must remain
* valid until the backing page is returned to the system.
*/
if (flags & PR_PSERIALIZE) {
flags |= PR_NOTOUCH;
}
if (align == 0)
align = ALIGN(1);
prsize = size;
if ((flags & PR_NOTOUCH) == 0 && prsize < sizeof(struct pool_item))
prsize = sizeof(struct pool_item);
prsize = roundup(prsize, align);
KASSERTMSG((prsize <= palloc->pa_pagesz),
"%s: [%s] pool item size (%zu) larger than page size (%u)",
__func__, wchan, prsize, palloc->pa_pagesz);
/*
* Initialize the pool structure.
*/
LIST_INIT(&pp->pr_emptypages);
LIST_INIT(&pp->pr_fullpages);
LIST_INIT(&pp->pr_partpages);
pp->pr_cache = NULL;
pp->pr_curpage = NULL;
pp->pr_npages = 0;
pp->pr_minitems = 0;
pp->pr_minpages = 0;
pp->pr_maxpages = UINT_MAX;
pp->pr_roflags = flags;
pp->pr_flags = 0;
pp->pr_size = prsize;
pp->pr_reqsize = size;
pp->pr_align = align;
pp->pr_wchan = wchan;
pp->pr_alloc = palloc;
pp->pr_poolid = atomic_inc_uint_nv(&poolid_counter);
pp->pr_nitems = 0;
pp->pr_nout = 0;
pp->pr_hardlimit = UINT_MAX;
pp->pr_hardlimit_warning = NULL;
pp->pr_hardlimit_ratecap.tv_sec = 0;
pp->pr_hardlimit_ratecap.tv_usec = 0;
pp->pr_hardlimit_warning_last.tv_sec = 0;
pp->pr_hardlimit_warning_last.tv_usec = 0;
pp->pr_drain_hook = NULL;
pp->pr_drain_hook_arg = NULL;
pp->pr_freecheck = NULL;
pp->pr_redzone = false;
pool_redzone_init(pp, size);
pool_quarantine_init(pp);
/*
* Decide whether to put the page header off-page to avoid wasting too
* large a part of the page or too big an item. Off-page page headers
* go on a hash table, so we can match a returned item with its header
* based on the page address.
*/
if (pool_init_is_phinpage(pp)) {
/* Use the beginning of the page for the page header */
itemspace = palloc->pa_pagesz - roundup(PHSIZE, align);
pp->pr_itemoffset = roundup(PHSIZE, align);
pp->pr_roflags |= PR_PHINPAGE;
} else {
/* The page header will be taken from our page header pool */
itemspace = palloc->pa_pagesz;
pp->pr_itemoffset = 0;
SPLAY_INIT(&pp->pr_phtree);
}
pp->pr_itemsperpage = itemspace / pp->pr_size;
KASSERT(pp->pr_itemsperpage != 0);
/*
* Decide whether to use a bitmap or a linked list to manage freed
* items.
*/
if (pool_init_is_usebmap(pp)) {
pp->pr_roflags |= PR_USEBMAP;
}
/*
* If we're off-page, then we're using a bitmap; choose the appropriate
* pool to allocate page headers, whose size varies depending on the
* bitmap. If we're on-page, nothing to do.
*/
if (!(pp->pr_roflags & PR_PHINPAGE)) {
int idx;
KASSERT(pp->pr_roflags & PR_USEBMAP);
for (idx = 0; pp->pr_itemsperpage > PHPOOL_FREELIST_NELEM(idx);
idx++) {
/* nothing */
}
if (idx >= PHPOOL_MAX) {
/*
* if you see this panic, consider tweaking
* PHPOOL_MAX and PHPOOL_FREELIST_NELEM.
*/
panic("%s: [%s] too large itemsperpage(%d) for "
"PR_USEBMAP", __func__,
pp->pr_wchan, pp->pr_itemsperpage);
}
pp->pr_phpool = &phpool[idx];
} else {
pp->pr_phpool = NULL;
}
/*
* Use the slack between the chunks and the page header
* for "cache coloring".
*/
slack = itemspace - pp->pr_itemsperpage * pp->pr_size;
pp->pr_maxcolor = rounddown(slack, align);
pp->pr_curcolor = 0;
pp->pr_nget = 0;
pp->pr_nfail = 0;
pp->pr_nput = 0;
pp->pr_npagealloc = 0;
pp->pr_npagefree = 0;
pp->pr_hiwat = 0;
pp->pr_nidle = 0;
pp->pr_refcnt = 0;
mutex_init(&pp->pr_lock, MUTEX_DEFAULT, ipl);
cv_init(&pp->pr_cv, wchan);
pp->pr_ipl = ipl;
/* Insert into the list of all pools. */
if (!cold)
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
if (strcmp(pp1->pr_wchan, pp->pr_wchan) > 0)
break;
}
if (pp1 == NULL)
TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist);
else
TAILQ_INSERT_BEFORE(pp1, pp, pr_poollist);
if (!cold)
mutex_exit(&pool_head_lock);
/* Insert this into the list of pools using this allocator. */
if (!cold)
mutex_enter(&palloc->pa_lock);
TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list);
if (!cold)
mutex_exit(&palloc->pa_lock);
}
/*
* De-commission a pool resource.
*/
void
pool_destroy(struct pool *pp)
{
struct pool_pagelist pq;
struct pool_item_header *ph;
pool_quarantine_flush(pp);
/* Remove from global pool list */
mutex_enter(&pool_head_lock);
while (pp->pr_refcnt != 0)
cv_wait(&pool_busy, &pool_head_lock);
TAILQ_REMOVE(&pool_head, pp, pr_poollist);
if (drainpp == pp)
drainpp = NULL;
mutex_exit(&pool_head_lock);
/* Remove this pool from its allocator's list of pools. */
mutex_enter(&pp->pr_alloc->pa_lock);
TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list);
mutex_exit(&pp->pr_alloc->pa_lock);
mutex_enter(&pool_allocator_lock);
if (--pp->pr_alloc->pa_refcnt == 0)
mutex_destroy(&pp->pr_alloc->pa_lock);
mutex_exit(&pool_allocator_lock);
mutex_enter(&pp->pr_lock);
KASSERT(pp->pr_cache == NULL);
KASSERTMSG((pp->pr_nout == 0),
"%s: [%s] pool busy: still out: %u", __func__, pp->pr_wchan,
pp->pr_nout);
KASSERT(LIST_EMPTY(&pp->pr_fullpages));
KASSERT(LIST_EMPTY(&pp->pr_partpages));
/* Remove all pages */
LIST_INIT(&pq);
while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
pr_rmpage(pp, ph, &pq);
mutex_exit(&pp->pr_lock);
pr_pagelist_free(pp, &pq);
cv_destroy(&pp->pr_cv);
mutex_destroy(&pp->pr_lock);
}
void
pool_set_drain_hook(struct pool *pp, void (*fn)(void *, int), void *arg)
{
/* XXX no locking -- must be used just after pool_init() */
KASSERTMSG((pp->pr_drain_hook == NULL),
"%s: [%s] already set", __func__, pp->pr_wchan);
pp->pr_drain_hook = fn;
pp->pr_drain_hook_arg = arg;
}
static struct pool_item_header *
pool_alloc_item_header(struct pool *pp, void *storage, int flags)
{
struct pool_item_header *ph;
if ((pp->pr_roflags & PR_PHINPAGE) != 0)
ph = storage;
else
ph = pool_get(pp->pr_phpool, flags);
return ph;
}
/*
* Grab an item from the pool.
*/
void *
pool_get(struct pool *pp, int flags)
{
struct pool_item_header *ph;
void *v;
KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));
KASSERTMSG((pp->pr_itemsperpage != 0),
"%s: [%s] pr_itemsperpage is zero, "
"pool not initialized?", __func__, pp->pr_wchan);
KASSERTMSG((!(cpu_intr_p() || cpu_softintr_p())
|| pp->pr_ipl != IPL_NONE || cold || panicstr != NULL),
"%s: [%s] is IPL_NONE, but called from interrupt context",
__func__, pp->pr_wchan);
if (flags & PR_WAITOK) {
ASSERT_SLEEPABLE();
}
if (flags & PR_NOWAIT) {
if (fault_inject())
return NULL;
}
mutex_enter(&pp->pr_lock);
startover:
/*
* Check to see if we've reached the hard limit. If we have,
* and we can wait, then wait until an item has been returned to
* the pool.
*/
KASSERTMSG((pp->pr_nout <= pp->pr_hardlimit),
"%s: %s: crossed hard limit", __func__, pp->pr_wchan);
if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
if (pp->pr_drain_hook != NULL) {
/*
* Since the drain hook is going to free things
* back to the pool, unlock, call the hook, re-lock,
* and check the hardlimit condition again.
*/
mutex_exit(&pp->pr_lock);
(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
mutex_enter(&pp->pr_lock);
if (pp->pr_nout < pp->pr_hardlimit)
goto startover;
}
if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
/*
* XXX: A warning isn't logged in this case. Should
* it be?
*/
pp->pr_flags |= PR_WANTED;
do {
cv_wait(&pp->pr_cv, &pp->pr_lock);
} while (pp->pr_flags & PR_WANTED);
goto startover;
}
/*
* Log a message that the hard limit has been hit.
*/
if (pp->pr_hardlimit_warning != NULL &&
ratecheck(&pp->pr_hardlimit_warning_last,
&pp->pr_hardlimit_ratecap))
log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);
pp->pr_nfail++;
mutex_exit(&pp->pr_lock);
KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
return NULL;
}
/*
* The convention we use is that if `curpage' is not NULL, then
* it points at a non-empty bucket. In particular, `curpage'
* never points at a page header which has PR_PHINPAGE set and
* has no items in its bucket.
*/
if ((ph = pp->pr_curpage) == NULL) {
int error;
KASSERTMSG((pp->pr_nitems == 0),
"%s: [%s] curpage NULL, inconsistent nitems %u",
__func__, pp->pr_wchan, pp->pr_nitems);
/*
* Call the back-end page allocator for more memory.
* Release the pool lock, as the back-end page allocator
* may block.
*/
error = pool_grow(pp, flags);
if (error != 0) {
/*
* pool_grow aborts when another thread
* is allocating a new page; retry if we
* waited for that allocation to finish.
*/
if (error == ERESTART)
goto startover;
/*
* We were unable to allocate a page or item
* header, but we released the lock during
* allocation, so perhaps items were freed
* back to the pool. Check for this case.
*/
if (pp->pr_curpage != NULL)
goto startover;
pp->pr_nfail++;
mutex_exit(&pp->pr_lock);
KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
return NULL;
}
/* Start the allocation process over. */
goto startover;
}
if (pp->pr_roflags & PR_USEBMAP) {
KASSERTMSG((ph->ph_nmissing < pp->pr_itemsperpage),
"%s: [%s] pool page empty", __func__, pp->pr_wchan);
v = pr_item_bitmap_get(pp, ph);
} else {
v = pr_item_linkedlist_get(pp, ph);
}
pp->pr_nitems--;
pp->pr_nout++;
if (ph->ph_nmissing == 0) {
KASSERT(pp->pr_nidle > 0);
pp->pr_nidle--;
/*
* This page was previously empty. Move it to the list of
* partially-full pages. This page is already curpage.
*/
LIST_REMOVE(ph, ph_pagelist);
LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
}
ph->ph_nmissing++;
if (ph->ph_nmissing == pp->pr_itemsperpage) {
KASSERTMSG(((pp->pr_roflags & PR_USEBMAP) ||
LIST_EMPTY(&ph->ph_itemlist)),
"%s: [%s] nmissing (%u) inconsistent", __func__,
pp->pr_wchan, ph->ph_nmissing);
/*
* This page is now full. Move it to the full list
* and select a new current page.
*/
LIST_REMOVE(ph, ph_pagelist);
LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
pool_update_curpage(pp);
}
pp->pr_nget++;
/*
* If we have a low water mark and we are now below that low
* water mark, add more items to the pool.
*/
if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
/*
* XXX: Should we log a warning? Should we set up a timeout
* to try again in a second or so? The latter could break
* a caller's assumptions about interrupt protection, etc.
*/
}
mutex_exit(&pp->pr_lock);
KASSERT((((vaddr_t)v) & (pp->pr_align - 1)) == 0);
FREECHECK_OUT(&pp->pr_freecheck, v);
pool_redzone_fill(pp, v);
pool_get_kmsan(pp, v);
if (flags & PR_ZERO)
memset(v, 0, pp->pr_reqsize);
return v;
}
/*
* Internal version of pool_put(). Pool is already locked/entered.
*/
static void
pool_do_put(struct pool *pp, void *v, struct pool_pagelist *pq)
{
struct pool_item_header *ph;
KASSERT(mutex_owned(&pp->pr_lock));
pool_redzone_check(pp, v);
pool_put_kmsan(pp, v);
FREECHECK_IN(&pp->pr_freecheck, v);
LOCKDEBUG_MEM_CHECK(v, pp->pr_size);
KASSERTMSG((pp->pr_nout > 0),
"%s: [%s] putting with none out", __func__, pp->pr_wchan);
if (__predict_false((ph = pr_find_pagehead(pp, v)) == NULL)) {
panic("%s: [%s] page header missing", __func__, pp->pr_wchan);
}
/*
* Return to item list.
*/
if (pp->pr_roflags & PR_USEBMAP) {
pr_item_bitmap_put(pp, ph, v);
} else {
pr_item_linkedlist_put(pp, ph, v);
}
KDASSERT(ph->ph_nmissing != 0);
ph->ph_nmissing--;
pp->pr_nput++;
pp->pr_nitems++;
pp->pr_nout--;
/* Cancel "pool empty" condition if it exists */
if (pp->pr_curpage == NULL)
pp->pr_curpage = ph;
if (pp->pr_flags & PR_WANTED) {
pp->pr_flags &= ~PR_WANTED;
cv_broadcast(&pp->pr_cv);
}
/*
* If this page is now empty, do one of two things:
*
* (1) If we have more pages than the page high water mark,
* free the page back to the system. ONLY CONSIDER
* FREEING BACK A PAGE IF WE HAVE MORE THAN OUR MINIMUM PAGE
* CLAIM.
*
* (2) Otherwise, move the page to the empty page list.
*
* Either way, select a new current page (so we use a partially-full
* page if one is available).
*/
if (ph->ph_nmissing == 0) {
pp->pr_nidle++;
if (pp->pr_nitems - pp->pr_itemsperpage >= pp->pr_minitems && pp->pr_npages > pp->pr_minpages &&
pp->pr_npages > pp->pr_maxpages) {
pr_rmpage(pp, ph, pq);
} else {
LIST_REMOVE(ph, ph_pagelist);
LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
/*
* Update the timestamp on the page. A page must
* be idle for some period of time before it can
* be reclaimed by the pagedaemon. This minimizes
* ping-pong'ing for memory.
*
* note for 64-bit time_t: truncating to 32-bit is not
* a problem for our usage.
*/
ph->ph_time = time_uptime;
}
pool_update_curpage(pp);
}
/*
* If the page was previously completely full, move it to the
* partially-full list and make it the current page. The next
* allocation will get the item from this page, instead of
* further fragmenting the pool.
*/
else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
LIST_REMOVE(ph, ph_pagelist);
LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
pp->pr_curpage = ph;
}
}
void
pool_put(struct pool *pp, void *v)
{
struct pool_pagelist pq;
LIST_INIT(&pq);
mutex_enter(&pp->pr_lock);
if (!pool_put_quarantine(pp, v, &pq)) {
pool_do_put(pp, v, &pq);
}
mutex_exit(&pp->pr_lock);
pr_pagelist_free(pp, &pq);
}
/*
* pool_grow: grow a pool by a page.
*
* => called with pool locked.
* => unlock and relock the pool.
* => return with pool locked.
*/
static int
pool_grow(struct pool *pp, int flags)
{
struct pool_item_header *ph;
char *storage;
/*
* If there's a pool_grow in progress, wait for it to complete
* and try again from the top.
*/
if (pp->pr_flags & PR_GROWING) {
if (flags & PR_WAITOK) {
do {
cv_wait(&pp->pr_cv, &pp->pr_lock);
} while (pp->pr_flags & PR_GROWING);
return ERESTART;
} else {
if (pp->pr_flags & PR_GROWINGNOWAIT) {
/*
* This needs an unlock/relock dance so
* that the other caller has a chance to
* run and actually perform the allocation.
* Note that this is effectively a busy-wait.
*/
mutex_exit(&pp->pr_lock);
mutex_enter(&pp->pr_lock);
return ERESTART;
}
return EWOULDBLOCK;
}
}
pp->pr_flags |= PR_GROWING;
if (flags & PR_WAITOK)
mutex_exit(&pp->pr_lock);
else
pp->pr_flags |= PR_GROWINGNOWAIT;
storage = pool_allocator_alloc(pp, flags);
if (__predict_false(storage == NULL))
goto out;
ph = pool_alloc_item_header(pp, storage, flags);
if (__predict_false(ph == NULL)) {
pool_allocator_free(pp, storage);
goto out;
}
if (flags & PR_WAITOK)
mutex_enter(&pp->pr_lock);
pool_prime_page(pp, storage, ph);
pp->pr_npagealloc++;
KASSERT(pp->pr_flags & PR_GROWING);
pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
/*
* If anyone was waiting for pool_grow, notify them that we
* may have just done it.
*/
cv_broadcast(&pp->pr_cv);
return 0;
out:
if (flags & PR_WAITOK)
mutex_enter(&pp->pr_lock);
KASSERT(pp->pr_flags & PR_GROWING);
pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
return ENOMEM;
}
void
pool_prime(struct pool *pp, int n)
{
mutex_enter(&pp->pr_lock);
pp->pr_minpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
if (pp->pr_maxpages <= pp->pr_minpages)
pp->pr_maxpages = pp->pr_minpages + 1; /* XXX */
while (pp->pr_npages < pp->pr_minpages)
(void) pool_grow(pp, PR_WAITOK);
mutex_exit(&pp->pr_lock);
}
/*
* Add a page worth of items to the pool.
*
* Note, we must be called with the pool descriptor LOCKED.
*/
static void
pool_prime_page(struct pool *pp, void *storage, struct pool_item_header *ph)
{
const unsigned int align = pp->pr_align;
struct pool_item *pi;
void *cp = storage;
int n;
KASSERT(mutex_owned(&pp->pr_lock));
KASSERTMSG(((pp->pr_roflags & PR_NOALIGN) ||
(((uintptr_t)cp & (pp->pr_alloc->pa_pagesz - 1)) == 0)),
"%s: [%s] unaligned page: %p", __func__, pp->pr_wchan, cp);
/*
* Insert page header.
*/
LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
LIST_INIT(&ph->ph_itemlist);
ph->ph_page = storage;
ph->ph_nmissing = 0;
ph->ph_time = time_uptime;
if (pp->pr_roflags & PR_PHINPAGE)
ph->ph_poolid = pp->pr_poolid;
else
SPLAY_INSERT(phtree, &pp->pr_phtree, ph);
pp->pr_nidle++;
/*
* The item space starts after the on-page header, if any.
*/
ph->ph_off = pp->pr_itemoffset;
/*
* Color this page.
*/
ph->ph_off += pp->pr_curcolor;
cp = (char *)cp + ph->ph_off;
if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
pp->pr_curcolor = 0;
KASSERT((((vaddr_t)cp) & (align - 1)) == 0);
/*
* Insert remaining chunks on the bucket list.
*/
n = pp->pr_itemsperpage;
pp->pr_nitems += n;
if (pp->pr_roflags & PR_USEBMAP) {
pr_item_bitmap_init(pp, ph);
} else {
while (n--) {
pi = (struct pool_item *)cp;
KASSERT((((vaddr_t)pi) & (align - 1)) == 0);
/* Insert on page list */
LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
#ifdef POOL_CHECK_MAGIC
pi->pi_magic = PI_MAGIC;
#endif
cp = (char *)cp + pp->pr_size;
KASSERT((((vaddr_t)cp) & (align - 1)) == 0);
}
}
/*
* If the pool was depleted, point at the new page.
*/
if (pp->pr_curpage == NULL)
pp->pr_curpage = ph;
if (++pp->pr_npages > pp->pr_hiwat)
pp->pr_hiwat = pp->pr_npages;
}
/*
* Used by pool_get() when nitems drops below the low water mark. This
* is used to catch up pr_nitems with the low water mark.
*
* Note 1: we never wait for memory here; we let the caller decide what to do.
*
* Note 2: we must be called with the pool already locked, and we return
* with it locked.
*/
static int
pool_catchup(struct pool *pp)
{
int error = 0;
while (POOL_NEEDS_CATCHUP(pp)) {
error = pool_grow(pp, PR_NOWAIT);
if (error) {
if (error == ERESTART)
continue;
break;
}
}
return error;
}
static void
pool_update_curpage(struct pool *pp)
{
pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
if (pp->pr_curpage == NULL) {
pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
}
KASSERTMSG((pp->pr_curpage == NULL) == (pp->pr_nitems == 0),
"pp=%p curpage=%p nitems=%u", pp, pp->pr_curpage, pp->pr_nitems);
}
void
pool_setlowat(struct pool *pp, int n)
{
mutex_enter(&pp->pr_lock);
pp->pr_minitems = n;
/* Make sure we're caught up with the newly-set low water mark. */
if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
/*
* XXX: Should we log a warning? Should we set up a timeout
* to try again in a second or so? The latter could break
* a caller's assumptions about interrupt protection, etc.
*/
}
mutex_exit(&pp->pr_lock);
}
void
pool_sethiwat(struct pool *pp, int n)
{
mutex_enter(&pp->pr_lock);
pp->pr_maxitems = n;
mutex_exit(&pp->pr_lock);
}
void
pool_sethardlimit(struct pool *pp, int n, const char *warnmess, int ratecap)
{
mutex_enter(&pp->pr_lock);
pp->pr_hardlimit = n;
pp->pr_hardlimit_warning = warnmess;
pp->pr_hardlimit_ratecap.tv_sec = ratecap;
pp->pr_hardlimit_warning_last.tv_sec = 0;
pp->pr_hardlimit_warning_last.tv_usec = 0;
pp->pr_maxpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
mutex_exit(&pp->pr_lock);
}
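/*
 * Example (illustrative sketch only): a subsystem that wants its pool
 * pre-populated and bounded might combine the knobs above right after
 * pool_init(); the numbers and the warning string are hypothetical.
 *
 *	pool_init(&example_pool, sizeof(struct example), 0, 0, 0,
 *	    "examplepl", NULL, IPL_NONE);
 *	pool_prime(&example_pool, 32);
 *	pool_setlowat(&example_pool, 16);
 *	pool_sethardlimit(&example_pool, 1024,
 *	    "WARNING: example_pool limit reached", 60);
 */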
unsigned int
pool_nget(struct pool *pp)
{
return pp->pr_nget;
}
unsigned int
pool_nput(struct pool *pp)
{
return pp->pr_nput;
}
/*
* Release all complete pages that have not been used recently.
*
* Must not be called from interrupt context.
*/
int
pool_reclaim(struct pool *pp)
{
struct pool_item_header *ph, *phnext;
struct pool_pagelist pq;
struct pool_cache *pc;
uint32_t curtime;
bool klock;
int rv;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
if (pp->pr_drain_hook != NULL) {
/*
* The drain hook must be called with the pool unlocked.
*/
(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, PR_NOWAIT);
}
/*
* XXXSMP Because we do not want to cause non-MPSAFE code
* to block.
*/
if (pp->pr_ipl == IPL_SOFTNET || pp->pr_ipl == IPL_SOFTCLOCK ||
pp->pr_ipl == IPL_SOFTSERIAL) {
KERNEL_LOCK(1, NULL);
klock = true;
} else
klock = false;
/* Reclaim items from the pool's cache (if any). */
if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL)
pool_cache_invalidate(pc);
if (mutex_tryenter(&pp->pr_lock) == 0) {
if (klock) {
KERNEL_UNLOCK_ONE(NULL);
}
return 0;
}
LIST_INIT(&pq);
curtime = time_uptime;
for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
phnext = LIST_NEXT(ph, ph_pagelist);
/* Check our minimum page claim */
if (pp->pr_npages <= pp->pr_minpages)
break;
KASSERT(ph->ph_nmissing == 0);
if (curtime - ph->ph_time < pool_inactive_time)
continue;
/*
* If freeing this page would put us below the minimum free items
* or the minimum pages, stop now.
*/
if (pp->pr_nitems - pp->pr_itemsperpage < pp->pr_minitems ||
pp->pr_npages - 1 < pp->pr_minpages)
break;
pr_rmpage(pp, ph, &pq);
}
mutex_exit(&pp->pr_lock);
if (LIST_EMPTY(&pq))
rv = 0;
else {
pr_pagelist_free(pp, &pq);
rv = 1;
}
if (klock) {
KERNEL_UNLOCK_ONE(NULL);
}
return rv;
}
/*
* Drain pools, one at a time. The drained pool is returned within ppp.
*
* Note, must never be called from interrupt context.
*/
bool
pool_drain(struct pool **ppp)
{
bool reclaimed;
struct pool *pp;
KASSERT(!TAILQ_EMPTY(&pool_head));
pp = NULL;
/* Find next pool to drain, and add a reference. */
mutex_enter(&pool_head_lock);
do {
if (drainpp == NULL) {
drainpp = TAILQ_FIRST(&pool_head);
}
if (drainpp != NULL) {
pp = drainpp;
drainpp = TAILQ_NEXT(pp, pr_poollist);
}
/*
* Skip completely idle pools. We depend on at least
* one pool in the system being active.
*/
} while (pp == NULL || pp->pr_npages == 0);
pp->pr_refcnt++;
mutex_exit(&pool_head_lock);
/* Drain the cache (if any) and the pool. */
reclaimed = pool_reclaim(pp);
/* Finally, unlock the pool. */
mutex_enter(&pool_head_lock);
pp->pr_refcnt--;
cv_broadcast(&pool_busy);
mutex_exit(&pool_head_lock);
if (ppp != NULL)
*ppp = pp;
return reclaimed;
}
/*
* Calculate the total number of pages consumed by pools.
*/
int
pool_totalpages(void)
{
mutex_enter(&pool_head_lock);
int pages = pool_totalpages_locked();
mutex_exit(&pool_head_lock);
return pages;
}
int
pool_totalpages_locked(void)
{
struct pool *pp;
uint64_t total = 0;
TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
uint64_t bytes =
(uint64_t)pp->pr_npages * pp->pr_alloc->pa_pagesz;
if ((pp->pr_roflags & PR_RECURSIVE) != 0)
bytes -= ((uint64_t)pp->pr_nout * pp->pr_size);
total += bytes;
}
return atop(total);
}
/*
* Diagnostic helpers.
*/
void
pool_printall(const char *modif, void (*pr)(const char *, ...))
{
struct pool *pp;
TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
pool_printit(pp, modif, pr);
}
}
void
pool_printit(struct pool *pp, const char *modif, void (*pr)(const char *, ...))
{
if (pp == NULL) {
(*pr)("Must specify a pool to print.\n");
return;
}
pool_print1(pp, modif, pr);
}
static void
pool_print_pagelist(struct pool *pp, struct pool_pagelist *pl,
void (*pr)(const char *, ...))
{
struct pool_item_header *ph;
LIST_FOREACH(ph, pl, ph_pagelist) {
(*pr)("\t\tpage %p, nmissing %d, time %" PRIu32 "\n",
ph->ph_page, ph->ph_nmissing, ph->ph_time);
#ifdef POOL_CHECK_MAGIC
struct pool_item *pi;
if (!(pp->pr_roflags & PR_USEBMAP)) {
LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) {
if (pi->pi_magic != PI_MAGIC) {
(*pr)("\t\t\titem %p, magic 0x%x\n",
pi, pi->pi_magic);
}
}
}
#endif
}
}
static void
pool_print1(struct pool *pp, const char *modif, void (*pr)(const char *, ...))
{
struct pool_item_header *ph;
pool_cache_t pc;
pcg_t *pcg;
pool_cache_cpu_t *cc;
uint64_t cpuhit, cpumiss, pchit, pcmiss;
uint32_t nfull;
int i;
bool print_log = false, print_pagelist = false, print_cache = false;
bool print_short = false, skip_empty = false;
char c;
while ((c = *modif++) != '\0') {
if (c == 'l')
print_log = true;
if (c == 'p')
print_pagelist = true;
if (c == 'c')
print_cache = true;
if (c == 's')
print_short = true;
if (c == 'S')
skip_empty = true;
}
if (skip_empty && pp->pr_nget == 0)
return;
if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) {
(*pr)("POOLCACHE");
} else {
(*pr)("POOL");
}
/* Single line output. */
if (print_short) {
(*pr)(" %s:%p:%u:%u:%u:%u:%u:%u:%u:%u:%u:%u\n",
pp->pr_wchan, pp, pp->pr_size, pp->pr_align, pp->pr_npages,
pp->pr_nitems, pp->pr_nout, pp->pr_nget, pp->pr_nput,
pp->pr_npagealloc, pp->pr_npagefree, pp->pr_nidle);
return;
}
(*pr)(" %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
pp->pr_roflags);
(*pr)("\tpool %p, alloc %p\n", pp, pp->pr_alloc);
(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
(*pr)("\tnget %lu, nfail %lu, nput %lu\n",
pp->pr_nget, pp->pr_nfail, pp->pr_nput);
(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
if (!print_pagelist)
goto skip_pagelist;
if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
(*pr)("\n\tempty page list:\n");
pool_print_pagelist(pp, &pp->pr_emptypages, pr);
if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
(*pr)("\n\tfull page list:\n");
pool_print_pagelist(pp, &pp->pr_fullpages, pr);
if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
(*pr)("\n\tpartial-page list:\n");
pool_print_pagelist(pp, &pp->pr_partpages, pr);
if (pp->pr_curpage == NULL)
(*pr)("\tno current page\n");
else
(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
skip_pagelist:
if (print_log)
goto skip_log;
(*pr)("\n");
skip_log:
#define PR_GROUPLIST(pcg) \
(*pr)("\t\tgroup %p: avail %d\n", pcg, pcg->pcg_avail); \
for (i = 0; i < pcg->pcg_size; i++) { \
if (pcg->pcg_objects[i].pcgo_pa != \
POOL_PADDR_INVALID) { \
(*pr)("\t\t\t%p, 0x%llx\n", \
pcg->pcg_objects[i].pcgo_va, \
(unsigned long long) \
pcg->pcg_objects[i].pcgo_pa); \
} else { \
(*pr)("\t\t\t%p\n", \
pcg->pcg_objects[i].pcgo_va); \
} \
}
if (pc != NULL) {
cpuhit = 0;
cpumiss = 0;
pcmiss = 0;
nfull = 0;
for (i = 0; i < __arraycount(pc->pc_cpus); i++) {
if ((cc = pc->pc_cpus[i]) == NULL)
continue;
cpuhit += cc->cc_hits;
cpumiss += cc->cc_misses;
pcmiss += cc->cc_pcmisses;
nfull += cc->cc_nfull;
}
pchit = cpumiss - pcmiss;
(*pr)("\tcpu layer hits %llu misses %llu\n", cpuhit, cpumiss);
(*pr)("\tcache layer hits %llu misses %llu\n", pchit, pcmiss);
(*pr)("\tcache layer full groups %u\n", nfull);
if (print_cache) {
(*pr)("\tfull cache groups:\n");
for (pcg = pc->pc_fullgroups; pcg != NULL;
pcg = pcg->pcg_next) {
PR_GROUPLIST(pcg);
}
}
}
#undef PR_GROUPLIST
}
static int
pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph)
{
struct pool_item *pi;
void *page;
int n;
if ((pp->pr_roflags & PR_NOALIGN) == 0) {
page = POOL_OBJ_TO_PAGE(pp, ph);
if (page != ph->ph_page &&
(pp->pr_roflags & PR_PHINPAGE) != 0) {
if (label != NULL)
printf("%s: ", label);
printf("pool(%p:%s): page inconsistency: page %p;"
" at page head addr %p (p %p)\n", pp,
pp->pr_wchan, ph->ph_page,
ph, page);
return 1;
}
}
if ((pp->pr_roflags & PR_USEBMAP) != 0)
return 0;
for (pi = LIST_FIRST(&ph->ph_itemlist), n = 0;
pi != NULL;
pi = LIST_NEXT(pi,pi_list), n++) {
#ifdef POOL_CHECK_MAGIC
if (pi->pi_magic != PI_MAGIC) {
if (label != NULL)
printf("%s: ", label);
printf("pool(%s): free list modified: magic=%x;"
" page %p; item ordinal %d; addr %p\n",
pp->pr_wchan, pi->pi_magic, ph->ph_page,
n, pi);
panic("pool");
}
#endif
if ((pp->pr_roflags & PR_NOALIGN) != 0) {
continue;
}
page = POOL_OBJ_TO_PAGE(pp, pi);
if (page == ph->ph_page)
continue;
if (label != NULL)
printf("%s: ", label);
printf("pool(%p:%s): page inconsistency: page %p;"
" item ordinal %d; addr %p (p %p)\n", pp,
pp->pr_wchan, ph->ph_page,
n, pi, page);
return 1;
}
return 0;
}
int
pool_chk(struct pool *pp, const char *label)
{
struct pool_item_header *ph;
int r = 0;
mutex_enter(&pp->pr_lock);
LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
r = pool_chk_page(pp, label, ph);
if (r) {
goto out;
}
}
LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
r = pool_chk_page(pp, label, ph);
if (r) {
goto out;
}
}
LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
r = pool_chk_page(pp, label, ph);
if (r) {
goto out;
}
}
out:
mutex_exit(&pp->pr_lock);
return r;
}
/*
* pool_cache_init:
*
* Initialize a pool cache.
*/
pool_cache_t
pool_cache_init(size_t size, u_int align, u_int align_offset, u_int flags,
const char *wchan, struct pool_allocator *palloc, int ipl,
int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), void *arg)
{
pool_cache_t pc;
pc = pool_get(&cache_pool, PR_WAITOK);
if (pc == NULL)
return NULL;
pool_cache_bootstrap(pc, size, align, align_offset, flags, wchan,
palloc, ipl, ctor, dtor, arg);
return pc;
}
/*
* pool_cache_bootstrap:
*
* Kernel-private version of pool_cache_init(). The caller
* provides initial storage.
*/
void
pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align,
u_int align_offset, u_int flags, const char *wchan,
struct pool_allocator *palloc, int ipl,
int (*ctor)(void *, void *, int), void (*dtor)(void *, void *),
void *arg)
{
CPU_INFO_ITERATOR cii;
pool_cache_t pc1;
struct cpu_info *ci;
struct pool *pp;
unsigned int ppflags;
pp = &pc->pc_pool;
if (palloc == NULL && ipl == IPL_NONE) {
if (size > PAGE_SIZE) {
int bigidx = pool_bigidx(size);
palloc = &pool_allocator_big[bigidx];
flags |= PR_NOALIGN;
} else
palloc = &pool_allocator_nointr;
}
ppflags = flags;
if (ctor == NULL) {
ctor = NO_CTOR;
}
if (dtor == NULL) {
dtor = NO_DTOR;
} else {
/*
* If we have a destructor, then the pool layer does not
* need to worry about PR_PSERIALIZE.
*/
ppflags &= ~PR_PSERIALIZE;
}
pool_init(pp, size, align, align_offset, ppflags, wchan, palloc, ipl);
pc->pc_fullgroups = NULL;
pc->pc_partgroups = NULL;
pc->pc_ctor = ctor;
pc->pc_dtor = dtor;
pc->pc_arg = arg;
pc->pc_refcnt = 0;
pc->pc_roflags = flags;
pc->pc_freecheck = NULL;
if ((flags & PR_LARGECACHE) != 0) {
pc->pc_pcgsize = PCG_NOBJECTS_LARGE;
pc->pc_pcgpool = &pcg_large_pool;
pc->pc_pcgcache = &pcg_large_cache;
} else {
pc->pc_pcgsize = PCG_NOBJECTS_NORMAL;
pc->pc_pcgpool = &pcg_normal_pool;
pc->pc_pcgcache = &pcg_normal_cache;
}
/* Allocate per-CPU caches. */
memset(pc->pc_cpus, 0, sizeof(pc->pc_cpus));
pc->pc_ncpu = 0;
if (ncpu < 2) {
/* XXX For sparc: boot CPU is not attached yet. */
pool_cache_cpu_init1(curcpu(), pc);
} else {
for (CPU_INFO_FOREACH(cii, ci)) {
pool_cache_cpu_init1(ci, pc);
}
}
/* Add to list of all pools. */
if (__predict_true(!cold))
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pc1, &pool_cache_head, pc_cachelist) {
if (strcmp(pc1->pc_pool.pr_wchan, pc->pc_pool.pr_wchan) > 0)
break;
}
if (pc1 == NULL)
TAILQ_INSERT_TAIL(&pool_cache_head, pc, pc_cachelist);
else
TAILQ_INSERT_BEFORE(pc1, pc, pc_cachelist);
if (__predict_true(!cold))
mutex_exit(&pool_head_lock);
atomic_store_release(&pp->pr_cache, pc);
}
/*
* pool_cache_destroy:
*
* Destroy a pool cache.
*/
void
pool_cache_destroy(pool_cache_t pc)
{
pool_cache_bootstrap_destroy(pc);
pool_put(&cache_pool, pc);
}
/*
* pool_cache_bootstrap_destroy:
*
* Destroy a pool cache.
*/
void
pool_cache_bootstrap_destroy(pool_cache_t pc)
{
struct pool *pp = &pc->pc_pool;
u_int i;
/* Remove it from the global list. */
mutex_enter(&pool_head_lock);
while (pc->pc_refcnt != 0)
cv_wait(&pool_busy, &pool_head_lock);
TAILQ_REMOVE(&pool_cache_head, pc, pc_cachelist);
mutex_exit(&pool_head_lock);
/* First, invalidate the entire cache. */
pool_cache_invalidate(pc);
/* Disassociate it from the pool. */
mutex_enter(&pp->pr_lock);
atomic_store_relaxed(&pp->pr_cache, NULL);
mutex_exit(&pp->pr_lock);
/* Destroy per-CPU data */
for (i = 0; i < __arraycount(pc->pc_cpus); i++)
pool_cache_invalidate_cpu(pc, i);
/* Finally, destroy it. */
pool_destroy(pp);
}
/*
* pool_cache_cpu_init1:
*
* Called for each pool_cache whenever a new CPU is attached.
*/
static void
pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc)
{
pool_cache_cpu_t *cc;
int index;
index = ci->ci_index;
KASSERT(index < __arraycount(pc->pc_cpus));
if ((cc = pc->pc_cpus[index]) != NULL) {
return;
}
/*
* The first CPU is 'free'. This needs to be the case for
* bootstrap - we may not be able to allocate yet.
*/
if (pc->pc_ncpu == 0) {
cc = &pc->pc_cpu0;
pc->pc_ncpu = 1;
} else {
pc->pc_ncpu++;
cc = pool_get(&cache_cpu_pool, PR_WAITOK);
}
cc->cc_current = __UNCONST(&pcg_dummy);
cc->cc_previous = __UNCONST(&pcg_dummy);
cc->cc_pcgcache = pc->pc_pcgcache;
cc->cc_hits = 0;
cc->cc_misses = 0;
cc->cc_pcmisses = 0;
cc->cc_contended = 0;
cc->cc_nfull = 0;
cc->cc_npart = 0;
pc->pc_cpus[index] = cc;
}
/*
* pool_cache_cpu_init:
*
* Called whenever a new CPU is attached.
*/
void
pool_cache_cpu_init(struct cpu_info *ci)
{
pool_cache_t pc;
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pc, &pool_cache_head, pc_cachelist) {
pc->pc_refcnt++;
mutex_exit(&pool_head_lock);
pool_cache_cpu_init1(ci, pc);
mutex_enter(&pool_head_lock);
pc->pc_refcnt--;
cv_broadcast(&pool_busy);
}
mutex_exit(&pool_head_lock);
}
/*
* pool_cache_reclaim:
*
* Reclaim memory from a pool cache.
*/
bool
pool_cache_reclaim(pool_cache_t pc)
{
return pool_reclaim(&pc->pc_pool);
}
static inline void
pool_cache_pre_destruct(pool_cache_t pc)
{
/*
* Perform a passive serialization barrier before destructing
* a batch of one or more objects.
*/
if (__predict_false(pc_has_pser(pc))) {
pool_barrier();
}
}
static void
pool_cache_destruct_object1(pool_cache_t pc, void *object)
{
(*pc->pc_dtor)(pc->pc_arg, object);
pool_put(&pc->pc_pool, object);
}
/*
* pool_cache_destruct_object:
*
* Force destruction of an object and its release back into
* the pool.
*/
void
pool_cache_destruct_object(pool_cache_t pc, void *object)
{
FREECHECK_IN(&pc->pc_freecheck, object);
pool_cache_pre_destruct(pc);
pool_cache_destruct_object1(pc, object);
}
/*
* pool_cache_invalidate_groups:
*
* Invalidate a chain of groups and destruct all objects. Return the
* number of groups that were invalidated.
*/
static int
pool_cache_invalidate_groups(pool_cache_t pc, pcg_t *pcg)
{
void *object;
pcg_t *next;
int i, n;
if (pcg == NULL) {
return 0;
}
pool_cache_pre_destruct(pc);
for (n = 0; pcg != NULL; pcg = next, n++) {
next = pcg->pcg_next;
for (i = 0; i < pcg->pcg_avail; i++) {
object = pcg->pcg_objects[i].pcgo_va;
pool_cache_destruct_object1(pc, object);
}
if (pcg->pcg_size == PCG_NOBJECTS_LARGE) {
pool_put(&pcg_large_pool, pcg);
} else {
KASSERT(pcg->pcg_size == PCG_NOBJECTS_NORMAL);
pool_put(&pcg_normal_pool, pcg);
}
}
return n;
}
/*
* pool_cache_invalidate:
*
* Invalidate a pool cache (destruct and release all of the
* cached objects). Does not reclaim objects from the pool.
*
* Note: For pool caches that provide constructed objects, there
* is an assumption that another level of synchronization is occurring
* between the input to the constructor and the cache invalidation.
*
* Invalidation is a costly process and should not be called from
* interrupt context.
*/
void
pool_cache_invalidate(pool_cache_t pc)
{
uint64_t where;
pcg_t *pcg;
int n, s;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
if (ncpu < 2 || !mp_online) {
/*
* We might be called early enough in the boot process
* for the CPU data structures to not be fully initialized.
* In this case, transfer the content of the local CPU's
* cache back into global cache as only this CPU is currently
* running.
*/
pool_cache_transfer(pc);
} else {
/*
* Signal all CPUs that they must transfer their local
* cache back to the global pool then wait for the xcall to
* complete.
*/
where = xc_broadcast(0,
__FPTRCAST(xcfunc_t, pool_cache_transfer), pc, NULL);
xc_wait(where);
}
/* Now dequeue and invalidate everything. */
pcg = pool_pcg_trunc(&pcg_normal_cache);
(void)pool_cache_invalidate_groups(pc, pcg);
pcg = pool_pcg_trunc(&pcg_large_cache);
(void)pool_cache_invalidate_groups(pc, pcg);
pcg = pool_pcg_trunc(&pc->pc_fullgroups);
n = pool_cache_invalidate_groups(pc, pcg);
s = splvm();
((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_nfull -= n;
splx(s);
pcg = pool_pcg_trunc(&pc->pc_partgroups);
n = pool_cache_invalidate_groups(pc, pcg);
s = splvm();
((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_npart -= n;
splx(s);
}
/*
* pool_cache_invalidate_cpu:
*
* Invalidate all CPU-bound cached objects in pool cache, the CPU being
* identified by its associated index.
* It is the caller's responsibility to ensure that no operation is
* taking place on this pool cache while doing this invalidation.
* WARNING: as no inter-CPU locking is enforced, trying to invalidate
* pool cached objects from a CPU different from the one currently running
* may result in undefined behaviour.
*/
static void
pool_cache_invalidate_cpu(pool_cache_t pc, u_int index)
{
pool_cache_cpu_t *cc;
pcg_t *pcg;
if ((cc = pc->pc_cpus[index]) == NULL)
return;
if ((pcg = cc->cc_current) != &pcg_dummy) {
pcg->pcg_next = NULL;
pool_cache_invalidate_groups(pc, pcg);
}
if ((pcg = cc->cc_previous) != &pcg_dummy) {
pcg->pcg_next = NULL;
pool_cache_invalidate_groups(pc, pcg);
}
if (cc != &pc->pc_cpu0)
pool_put(&cache_cpu_pool, cc);
}
void
pool_cache_set_drain_hook(pool_cache_t pc, void (*fn)(void *, int), void *arg)
{
pool_set_drain_hook(&pc->pc_pool, fn, arg);
}
void
pool_cache_setlowat(pool_cache_t pc, int n)
{
pool_setlowat(&pc->pc_pool, n);
}
void
pool_cache_sethiwat(pool_cache_t pc, int n)
{
pool_sethiwat(&pc->pc_pool, n);
}
void
pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap)
{
pool_sethardlimit(&pc->pc_pool, n, warnmess, ratecap);
}
void
pool_cache_prime(pool_cache_t pc, int n)
{
pool_prime(&pc->pc_pool, n);
}
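/*
 * pool_cache_nget, pool_cache_nput:
 *
 *	Return the cumulative get/put counters of the cache's backing pool.
 */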
unsigned int
pool_cache_nget(pool_cache_t pc)
{
return pool_nget(&pc->pc_pool);
}
unsigned int
pool_cache_nput(pool_cache_t pc)
{
return pool_nput(&pc->pc_pool);
}
/*
* pool_pcg_get:
*
* Get a cache group from the specified list. Return true if
* contention was encountered. Must be called at IPL_VM because
* of spin wait vs. kernel_lock.
*/
static int
pool_pcg_get(pcg_t *volatile *head, pcg_t **pcgp)
{
int count = SPINLOCK_BACKOFF_MIN;
pcg_t *o, *n;
for (o = atomic_load_relaxed(head);; o = n) {
if (__predict_false(o == &pcg_dummy)) {
/* Wait for concurrent get to complete. */
SPINLOCK_BACKOFF(count);
n = atomic_load_relaxed(head);
continue;
}
if (__predict_false(o == NULL)) {
break;
}
/* Lock out concurrent get/put. */
n = atomic_cas_ptr(head, o, __UNCONST(&pcg_dummy));
if (o == n) {
/* Fetch pointer to next item and then unlock. */
membar_datadep_consumer(); /* alpha */
n = atomic_load_relaxed(&o->pcg_next);
atomic_store_release(head, n);
break;
}
}
*pcgp = o;
return count != SPINLOCK_BACKOFF_MIN;
}
/*
* pool_pcg_trunc:
*
* Chop out entire list of pool cache groups.
*/
static pcg_t *
pool_pcg_trunc(pcg_t *volatile *head)
{
int count = SPINLOCK_BACKOFF_MIN, s;
pcg_t *o, *n;
s = splvm();
for (o = atomic_load_relaxed(head);; o = n) {
if (__predict_false(o == &pcg_dummy)) {
/* Wait for concurrent get to complete. */
SPINLOCK_BACKOFF(count);
n = atomic_load_relaxed(head);
continue;
}
n = atomic_cas_ptr(head, o, NULL);
if (o == n) {
splx(s);
membar_datadep_consumer(); /* alpha */
return o;
}
}
}
/*
* pool_pcg_put:
*
* Put a pool cache group to the specified list. Return true if
* contention was encountered. Must be called at IPL_VM because of
* spin wait vs. kernel_lock.
*/
static int
pool_pcg_put(pcg_t *volatile *head, pcg_t *pcg)
{
int count = SPINLOCK_BACKOFF_MIN;
pcg_t *o, *n;
for (o = atomic_load_relaxed(head);; o = n) {
if (__predict_false(o == &pcg_dummy)) {
/* Wait for concurrent get to complete. */
SPINLOCK_BACKOFF(count);
n = atomic_load_relaxed(head);
continue;
}
pcg->pcg_next = o;
membar_release();
n = atomic_cas_ptr(head, o, pcg);
if (o == n) {
return count != SPINLOCK_BACKOFF_MIN;
}
}
}
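/*
 * pool_cache_get_slow:
 *
 *	Slow path for pool_cache_get_paddr(): both per-CPU groups are
 *	empty.  Try to install a full group from the global cache;
 *	failing that, fall back to pool_get() and run the constructor.
 *	Returns true if the caller should retry the per-CPU fast path,
 *	false once *objectp has been settled (object or NULL).
 */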
static bool __noinline
pool_cache_get_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s,
void **objectp, paddr_t *pap, int flags)
{
pcg_t *pcg, *cur;
void *object;
KASSERT(cc->cc_current->pcg_avail == 0);
KASSERT(cc->cc_previous->pcg_avail == 0);
cc->cc_misses++;
/*
* If there's a full group, release our empty group back to the
* cache. Install the full group as cc_current and return.
*/
cc->cc_contended += pool_pcg_get(&pc->pc_fullgroups, &pcg);
if (__predict_true(pcg != NULL)) {
KASSERT(pcg->pcg_avail == pcg->pcg_size);
if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) {
KASSERT(cur->pcg_avail == 0);
(void)pool_pcg_put(cc->cc_pcgcache, cur);
}
cc->cc_nfull--;
cc->cc_current = pcg;
return true;
}
/*
* Nothing available locally or in cache. Take the slow
* path: fetch a new object from the pool and construct
* it.
*/
cc->cc_pcmisses++;
splx(s);
object = pool_get(&pc->pc_pool, flags);
*objectp = object;
if (__predict_false(object == NULL)) {
KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
return false;
}
if (__predict_false((*pc->pc_ctor)(pc->pc_arg, object, flags) != 0)) {
pool_put(&pc->pc_pool, object);
*objectp = NULL;
return false;
}
KASSERT((((vaddr_t)object) & (pc->pc_pool.pr_align - 1)) == 0);
if (pap != NULL) {
#ifdef POOL_VTOPHYS
*pap = POOL_VTOPHYS(object);
#else
*pap = POOL_PADDR_INVALID;
#endif
}
FREECHECK_OUT(&pc->pc_freecheck, object);
return false;
}
/*
* pool_cache_get{,_paddr}:
*
* Get an object from a pool cache (optionally returning
* the physical address of the object).
*/
void *
pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap)
{
pool_cache_cpu_t *cc;
pcg_t *pcg;
void *object;
int s;
KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));
if (pc->pc_pool.pr_ipl == IPL_NONE &&
__predict_true(!cold) &&
__predict_true(panicstr == NULL)) {
KASSERTMSG(!cpu_intr_p(),
"%s: [%s] is IPL_NONE, but called from interrupt context",
__func__, pc->pc_pool.pr_wchan);
KASSERTMSG(!cpu_softintr_p(),
"%s: [%s] is IPL_NONE,"
" but called from soft interrupt context",
__func__, pc->pc_pool.pr_wchan);
}
if (flags & PR_WAITOK) {
ASSERT_SLEEPABLE();
}
if (flags & PR_NOWAIT) {
if (fault_inject())
return NULL;
}
/* Lock out interrupts and disable preemption. */
s = splvm();
while (/* CONSTCOND */ true) {
/* Try and allocate an object from the current group. */
cc = pc->pc_cpus[curcpu()->ci_index];
pcg = cc->cc_current;
if (__predict_true(pcg->pcg_avail > 0)) {
object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va;
if (__predict_false(pap != NULL))
*pap = pcg->pcg_objects[pcg->pcg_avail].pcgo_pa;
#if defined(DIAGNOSTIC)
pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL;
KASSERT(pcg->pcg_avail < pcg->pcg_size);
KASSERT(object != NULL);
#endif
cc->cc_hits++;
splx(s);
FREECHECK_OUT(&pc->pc_freecheck, object);
pool_redzone_fill(&pc->pc_pool, object);
pool_cache_get_kmsan(pc, object);
return object;
}
/*
* That failed. If the previous group isn't empty, swap
* it with the current group and allocate from there.
*/
pcg = cc->cc_previous;
if (__predict_true(pcg->pcg_avail > 0)) {
cc->cc_previous = cc->cc_current;
cc->cc_current = pcg;
continue;
}
/*
* Can't allocate from either group: try the slow path.
* If get_slow() allocated an object for us, or if
* no more objects are available, it will return false.
* Otherwise, we need to retry.
*/
if (!pool_cache_get_slow(pc, cc, s, &object, pap, flags)) {
if (object != NULL) {
kmsan_orig(object, pc->pc_pool.pr_size,
KMSAN_TYPE_POOL, __RET_ADDR);
}
break;
}
}
/*
* We would like to KASSERT(object || (flags & PR_NOWAIT)), but
* pool_cache_get can fail even in the PR_WAITOK case, if the
* constructor fails.
*/
return object;
}
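/*
 * pool_cache_put_slow:
 *
 *	Slow path for pool_cache_put_paddr(): both per-CPU groups are
 *	full.  Try to install an empty group (from the group cache, or
 *	freshly allocated); failing that, destruct the object right away.
 *	Returns true if the caller should retry the per-CPU fast path.
 */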
static bool __noinline
pool_cache_put_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, void *object)
{
pcg_t *pcg, *cur;
KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size);
KASSERT(cc->cc_previous->pcg_avail == cc->cc_previous->pcg_size);
cc->cc_misses++;
/*
* Try to get an empty group from the cache. If there are no empty
* groups in the cache then allocate one.
*/
(void)pool_pcg_get(cc->cc_pcgcache, &pcg);
if (__predict_false(pcg == NULL)) {
if (__predict_true(!pool_cache_disable)) {
pcg = pool_get(pc->pc_pcgpool, PR_NOWAIT);
}
if (__predict_true(pcg != NULL)) {
pcg->pcg_avail = 0;
pcg->pcg_size = pc->pc_pcgsize;
}
}
/*
* If there's an empty group, release our full group back to the
* cache. Install the empty group to the local CPU and return.
*/
if (pcg != NULL) {
KASSERT(pcg->pcg_avail == 0);
if (__predict_false(cc->cc_previous == &pcg_dummy)) {
cc->cc_previous = pcg;
} else {
cur = cc->cc_current;
if (__predict_true(cur != &pcg_dummy)) {
KASSERT(cur->pcg_avail == cur->pcg_size);
cc->cc_contended +=
pool_pcg_put(&pc->pc_fullgroups, cur);
cc->cc_nfull++;
}
cc->cc_current = pcg;
}
return true;
}
/*
* Nothing available locally or in cache, and we didn't
* allocate an empty group. Take the slow path and destroy
* the object here and now.
*/
cc->cc_pcmisses++;
splx(s);
pool_cache_destruct_object(pc, object);
return false;
}
/*
* pool_cache_put{,_paddr}:
*
* Put an object back to the pool cache (optionally caching the
* physical address of the object).
*/
void
pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa)
{
pool_cache_cpu_t *cc;
pcg_t *pcg;
int s;
KASSERT(object != NULL);
pool_cache_put_kmsan(pc, object);
pool_cache_redzone_check(pc, object);
FREECHECK_IN(&pc->pc_freecheck, object);
if (pc->pc_pool.pr_roflags & PR_PHINPAGE) {
pc_phinpage_check(pc, object);
}
if (pool_cache_put_nocache(pc, object)) {
return;
}
/* Lock out interrupts and disable preemption. */
s = splvm();
while (/* CONSTCOND */ true) {
/* If the current group isn't full, release it there. */
cc = pc->pc_cpus[curcpu()->ci_index];
pcg = cc->cc_current;
if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) {
pcg->pcg_objects[pcg->pcg_avail].pcgo_va = object;
pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa;
pcg->pcg_avail++;
cc->cc_hits++;
splx(s);
return;
}
/*
* That failed. If the previous group isn't full, swap
* it with the current group and try again.
*/
pcg = cc->cc_previous;
if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) {
cc->cc_previous = cc->cc_current;
cc->cc_current = pcg;
continue;
}
/*
* Can't free to either group: try the slow path.
* If put_slow() releases the object for us, it
* will return false. Otherwise we need to retry.
*/
if (!pool_cache_put_slow(pc, cc, s, object))
break;
}
}
/*
* pool_cache_transfer:
*
* Transfer objects from the per-CPU cache to the global cache.
* Run within a cross-call thread.
*/
static void
pool_cache_transfer(pool_cache_t pc)
{
pool_cache_cpu_t *cc;
pcg_t *prev, *cur;
int s;
s = splvm();
cc = pc->pc_cpus[curcpu()->ci_index];
cur = cc->cc_current;
cc->cc_current = __UNCONST(&pcg_dummy);
prev = cc->cc_previous;
cc->cc_previous = __UNCONST(&pcg_dummy);
if (cur != &pcg_dummy) {
if (cur->pcg_avail == cur->pcg_size) {
(void)pool_pcg_put(&pc->pc_fullgroups, cur);
cc->cc_nfull++;
} else if (cur->pcg_avail == 0) {
(void)pool_pcg_put(pc->pc_pcgcache, cur);
} else {
(void)pool_pcg_put(&pc->pc_partgroups, cur);
cc->cc_npart++;
}
}
if (prev != &pcg_dummy) {
if (prev->pcg_avail == prev->pcg_size) {
(void)pool_pcg_put(&pc->pc_fullgroups, prev);
cc->cc_nfull++;
} else if (prev->pcg_avail == 0) {
(void)pool_pcg_put(pc->pc_pcgcache, prev);
} else {
(void)pool_pcg_put(&pc->pc_partgroups, prev);
cc->cc_npart++;
}
}
splx(s);
}
static int
pool_bigidx(size_t size)
{
int i;
for (i = 0; i < __arraycount(pool_allocator_big); i++) {
if (1 << (i + POOL_ALLOCATOR_BIG_BASE) >= size)
return i;
}
panic("pool item size %zu too large, use a custom allocator", size);
}
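/*
 * pool_allocator_alloc:
 *
 *	Allocate a page from the pool's backing allocator.  In the
 *	PR_NOWAIT case a failed allocation runs the drain hook once and
 *	retries; in the PR_WAITOK case draining is left to pool_reclaim().
 */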
static void *
pool_allocator_alloc(struct pool *pp, int flags)
{
struct pool_allocator *pa = pp->pr_alloc;
void *res;
res = (*pa->pa_alloc)(pp, flags);
if (res == NULL && (flags & PR_WAITOK) == 0) {
/*
* We only run the drain hook here if PR_NOWAIT.
* In other cases, the hook will be run in
* pool_reclaim().
*/
if (pp->pr_drain_hook != NULL) {
(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
res = (*pa->pa_alloc)(pp, flags);
}
}
return res;
}
static void
pool_allocator_free(struct pool *pp, void *v)
{
struct pool_allocator *pa = pp->pr_alloc;
if (pp->pr_redzone) {
KASSERT(!pp_has_pser(pp));
kasan_mark(v, pa->pa_pagesz, pa->pa_pagesz, 0);
} else if (__predict_false(pp_has_pser(pp))) {
/*
* Perform a passive serialization barrier before freeing
* the pool page back to the system.
*/
pool_barrier();
}
(*pa->pa_free)(pp, v);
}
void *
pool_page_alloc(struct pool *pp, int flags)
{
const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
vmem_addr_t va;
int ret;
ret = uvm_km_kmem_alloc(kmem_va_arena, pp->pr_alloc->pa_pagesz,
vflags | VM_INSTANTFIT, &va);
return ret ? NULL : (void *)va;
}
void
pool_page_free(struct pool *pp, void *v)
{
uvm_km_kmem_free(kmem_va_arena, (vaddr_t)v, pp->pr_alloc->pa_pagesz);
}
static void *
pool_page_alloc_meta(struct pool *pp, int flags)
{
const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
vmem_addr_t va;
int ret;
ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz,
vflags | VM_INSTANTFIT, &va);
return ret ? NULL : (void *)va;
}
static void
pool_page_free_meta(struct pool *pp, void *v)
{
vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz);
}
#ifdef KMSAN
static inline void
pool_get_kmsan(struct pool *pp, void *p)
{
kmsan_orig(p, pp->pr_size, KMSAN_TYPE_POOL, __RET_ADDR);
kmsan_mark(p, pp->pr_size, KMSAN_STATE_UNINIT);
}
static inline void
pool_put_kmsan(struct pool *pp, void *p)
{
kmsan_mark(p, pp->pr_size, KMSAN_STATE_INITED);
}
static inline void
pool_cache_get_kmsan(pool_cache_t pc, void *p)
{
if (__predict_false(pc_has_ctor(pc))) {
return;
}
pool_get_kmsan(&pc->pc_pool, p);
}
static inline void
pool_cache_put_kmsan(pool_cache_t pc, void *p)
{
pool_put_kmsan(&pc->pc_pool, p);
}
#endif
#ifdef POOL_QUARANTINE
static void
pool_quarantine_init(struct pool *pp)
{
pp->pr_quar.rotor = 0;
memset(&pp->pr_quar, 0, sizeof(pp->pr_quar));
}
static void
pool_quarantine_flush(struct pool *pp)
{
pool_quar_t *quar = &pp->pr_quar;
struct pool_pagelist pq;
size_t i;
LIST_INIT(&pq);
mutex_enter(&pp->pr_lock);
for (i = 0; i < POOL_QUARANTINE_DEPTH; i++) {
if (quar->list[i] == 0)
continue;
pool_do_put(pp, (void *)quar->list[i], &pq);
}
mutex_exit(&pp->pr_lock);
pr_pagelist_free(pp, &pq);
}
static bool
pool_put_quarantine(struct pool *pp, void *v, struct pool_pagelist *pq)
{
pool_quar_t *quar = &pp->pr_quar;
uintptr_t old;
if (pp->pr_roflags & PR_NOTOUCH) {
return false;
}
pool_redzone_check(pp, v);
old = quar->list[quar->rotor];
quar->list[quar->rotor] = (uintptr_t)v;
quar->rotor = (quar->rotor + 1) % POOL_QUARANTINE_DEPTH;
if (old != 0) {
pool_do_put(pp, (void *)old, pq);
}
return true;
}
#endif
#ifdef POOL_NOCACHE
static bool
pool_cache_put_nocache(pool_cache_t pc, void *p)
{
pool_cache_destruct_object(pc, p);
return true;
}
#endif
#ifdef POOL_REDZONE
#if defined(_LP64)
# define PRIME 0x9e37fffffffc0000UL
#else /* defined(_LP64) */
# define PRIME 0x9e3779b1
#endif /* defined(_LP64) */
#define STATIC_BYTE 0xFE
CTASSERT(POOL_REDZONE_SIZE > 1);
#ifndef KASAN
static inline uint8_t
pool_pattern_generate(const void *p)
{
return (uint8_t)(((uintptr_t)p) * PRIME
>> ((sizeof(uintptr_t) - sizeof(uint8_t))) * CHAR_BIT);
}
#endif
static void
pool_redzone_init(struct pool *pp, size_t requested_size)
{
size_t redzsz;
size_t nsz;
#ifdef KASAN
redzsz = requested_size;
kasan_add_redzone(&redzsz);
redzsz -= requested_size;
#else
redzsz = POOL_REDZONE_SIZE;
#endif
if (pp->pr_roflags & PR_NOTOUCH) {
pp->pr_redzone = false;
return;
}
/*
* We may have extended the requested size earlier; check if
* there's naturally space in the padding for a red zone.
*/
if (pp->pr_size - requested_size >= redzsz) {
pp->pr_reqsize_with_redzone = requested_size + redzsz;
pp->pr_redzone = true;
return;
}
/*
* No space in the natural padding; check if we can extend the
* pool's item size a bit.
* bit the size of the pool.
*
* Avoid using redzone for allocations half of a page or larger.
* For pagesize items, we'd waste a whole new page (could be
* unmapped?), and for half pagesize items, approximately half
* the space is lost (eg, 4K pages, you get one 2K allocation.)
*/
nsz = roundup(pp->pr_size + redzsz, pp->pr_align);
if (nsz <= (pp->pr_alloc->pa_pagesz / 2)) {
/* Ok, we can */
pp->pr_size = nsz;
pp->pr_reqsize_with_redzone = requested_size + redzsz;
pp->pr_redzone = true;
} else {
/* No space for a red zone... snif :'( */
pp->pr_redzone = false;
aprint_debug("pool redzone disabled for '%s'\n", pp->pr_wchan);
}
}
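/*
 * Worked example (figures illustrative only, non-KASAN case): with 4 KiB
 * pages and a red zone of a few bytes, an item requested at 120 bytes but
 * already rounded up to pr_size == 128 has 8 bytes of natural padding, so
 * the red zone fits without growing the item.  A 128-byte request with
 * pr_size == 128 has no padding, so the item grows by redzsz and is
 * rounded to the pool's alignment -- still well under half a page, so the
 * red zone is kept.  An item of half a page or more would roughly double
 * its per-item footprint, so the red zone is disabled for it instead.
 */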
static void
pool_redzone_fill(struct pool *pp, void *p)
{
if (!pp->pr_redzone)
return;
KASSERT(!pp_has_pser(pp));
#ifdef KASAN
kasan_mark(p, pp->pr_reqsize, pp->pr_reqsize_with_redzone,
KASAN_POOL_REDZONE);
#else
uint8_t *cp, pat;
const uint8_t *ep;
cp = (uint8_t *)p + pp->pr_reqsize;
ep = cp + POOL_REDZONE_SIZE;
/*
* We really don't want the first byte of the red zone to be '\0';
* an off-by-one in a string may not be properly detected.
*/
pat = pool_pattern_generate(cp);
*cp = (pat == '\0') ? STATIC_BYTE: pat;
cp++;
while (cp < ep) {
*cp = pool_pattern_generate(cp);
cp++;
}
#endif
}
static void
pool_redzone_check(struct pool *pp, void *p)
{
if (!pp->pr_redzone)
return;
KASSERT(!pp_has_pser(pp));
#ifdef KASAN
kasan_mark(p, 0, pp->pr_reqsize_with_redzone, KASAN_POOL_FREED);
#else
uint8_t *cp, pat, expected;
const uint8_t *ep;
cp = (uint8_t *)p + pp->pr_reqsize;
ep = cp + POOL_REDZONE_SIZE;
pat = pool_pattern_generate(cp);
expected = (pat == '\0') ? STATIC_BYTE: pat;
if (__predict_false(*cp != expected)) {
panic("%s: [%s] 0x%02x != 0x%02x", __func__,
pp->pr_wchan, *cp, expected);
}
cp++;
while (cp < ep) {
expected = pool_pattern_generate(cp);
if (__predict_false(*cp != expected)) {
panic("%s: [%s] 0x%02x != 0x%02x", __func__,
pp->pr_wchan, *cp, expected);
}
cp++;
}
#endif
}
static void
pool_cache_redzone_check(pool_cache_t pc, void *p)
{
#ifdef KASAN
/*
* If there is a ctor/dtor, or if the cache objects use
* passive serialization, leave the data as valid.
*/
if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc) ||
pc_has_pser(pc))) {
return;
}
#endif
pool_redzone_check(&pc->pc_pool, p);
}
#endif /* POOL_REDZONE */
#if defined(DDB)
static bool
pool_in_page(struct pool *pp, struct pool_item_header *ph, uintptr_t addr)
{
return (uintptr_t)ph->ph_page <= addr &&
addr < (uintptr_t)ph->ph_page + pp->pr_alloc->pa_pagesz;
}
static bool
pool_in_item(struct pool *pp, void *item, uintptr_t addr)
{
return (uintptr_t)item <= addr && addr < (uintptr_t)item + pp->pr_size;
}
static bool
pool_in_cg(struct pool *pp, struct pool_cache_group *pcg, uintptr_t addr)
{
int i;
if (pcg == NULL) {
return false;
}
for (i = 0; i < pcg->pcg_avail; i++) {
if (pool_in_item(pp, pcg->pcg_objects[i].pcgo_va, addr)) {
return true;
}
}
return false;
}
static bool
pool_allocated(struct pool *pp, struct pool_item_header *ph, uintptr_t addr)
{
if ((pp->pr_roflags & PR_USEBMAP) != 0) {
unsigned int idx = pr_item_bitmap_index(pp, ph, (void *)addr);
pool_item_bitmap_t *bitmap =
ph->ph_bitmap + (idx / BITMAP_SIZE);
pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK);
return (*bitmap & mask) == 0;
} else {
struct pool_item *pi;
LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) {
if (pool_in_item(pp, pi, addr)) {
return false;
}
}
return true;
}
}
void
pool_whatis(uintptr_t addr, void (*pr)(const char *, ...))
{
struct pool *pp;
TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
struct pool_item_header *ph;
struct pool_cache *pc;
uintptr_t item;
bool allocated = true;
bool incache = false;
bool incpucache = false;
char cpucachestr[32];
if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
if (pool_in_page(pp, ph, addr)) {
goto found;
}
}
LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
if (pool_in_page(pp, ph, addr)) {
allocated =
pool_allocated(pp, ph, addr);
goto found;
}
}
LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
if (pool_in_page(pp, ph, addr)) {
allocated = false;
goto found;
}
}
continue;
} else {
ph = pr_find_pagehead_noalign(pp, (void *)addr);
if (ph == NULL || !pool_in_page(pp, ph, addr)) {
continue;
}
allocated = pool_allocated(pp, ph, addr);
}
found:
if (allocated &&
(pc = atomic_load_consume(&pp->pr_cache)) != NULL) {
struct pool_cache_group *pcg;
int i;
for (pcg = pc->pc_fullgroups; pcg != NULL;
pcg = pcg->pcg_next) {
if (pool_in_cg(pp, pcg, addr)) {
incache = true;
goto print;
}
}
for (i = 0; i < __arraycount(pc->pc_cpus); i++) {
pool_cache_cpu_t *cc;
if ((cc = pc->pc_cpus[i]) == NULL) {
continue;
}
if (pool_in_cg(pp, cc->cc_current, addr) ||
pool_in_cg(pp, cc->cc_previous, addr)) {
struct cpu_info *ci =
cpu_lookup(i);
incpucache = true;
snprintf(cpucachestr,
sizeof(cpucachestr),
"cached by CPU %u",
ci->ci_index);
goto print;
}
}
}
print:
item = (uintptr_t)ph->ph_page + ph->ph_off;
item = item + rounddown(addr - item, pp->pr_size);
(*pr)("%p is %p+%zu in POOL '%s' (%s)\n",
(void *)addr, item, (size_t)(addr - item),
pp->pr_wchan,
incpucache ? cpucachestr :
incache ? "cached" : allocated ? "allocated" : "free");
}
}
#endif /* defined(DDB) */
static int
pool_sysctl(SYSCTLFN_ARGS)
{
struct pool_sysctl data;
struct pool *pp;
struct pool_cache *pc;
pool_cache_cpu_t *cc;
int error;
size_t i, written;
if (oldp == NULL) {
*oldlenp = 0;
TAILQ_FOREACH(pp, &pool_head, pr_poollist)
*oldlenp += sizeof(data);
return 0;
}
memset(&data, 0, sizeof(data));
error = 0;
written = 0;
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
if (written + sizeof(data) > *oldlenp)
break;
pp->pr_refcnt++;
strlcpy(data.pr_wchan, pp->pr_wchan, sizeof(data.pr_wchan));
data.pr_pagesize = pp->pr_alloc->pa_pagesz;
data.pr_flags = pp->pr_roflags | pp->pr_flags;
#define COPY(field) data.field = pp->field
COPY(pr_size);
COPY(pr_itemsperpage);
COPY(pr_nitems);
COPY(pr_nout);
COPY(pr_hardlimit);
COPY(pr_npages);
COPY(pr_minpages);
COPY(pr_maxpages);
COPY(pr_nget);
COPY(pr_nfail);
COPY(pr_nput);
COPY(pr_npagealloc);
COPY(pr_npagefree);
COPY(pr_hiwat);
COPY(pr_nidle);
#undef COPY
data.pr_cache_nmiss_pcpu = 0;
data.pr_cache_nhit_pcpu = 0;
data.pr_cache_nmiss_global = 0;
data.pr_cache_nempty = 0;
data.pr_cache_ncontended = 0;
data.pr_cache_npartial = 0;
if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) {
uint32_t nfull = 0;
data.pr_cache_meta_size = pc->pc_pcgsize;
for (i = 0; i < pc->pc_ncpu; ++i) {
cc = pc->pc_cpus[i];
if (cc == NULL)
continue;
data.pr_cache_ncontended += cc->cc_contended;
data.pr_cache_nmiss_pcpu += cc->cc_misses;
data.pr_cache_nhit_pcpu += cc->cc_hits;
data.pr_cache_nmiss_global += cc->cc_pcmisses;
nfull += cc->cc_nfull; /* 32-bit rollover! */
data.pr_cache_npartial += cc->cc_npart;
}
data.pr_cache_nfull = nfull;
} else {
data.pr_cache_meta_size = 0;
data.pr_cache_nfull = 0;
}
data.pr_cache_nhit_global = data.pr_cache_nmiss_pcpu -
data.pr_cache_nmiss_global;
if (pp->pr_refcnt == UINT_MAX) /* XXX possible? */
continue;
mutex_exit(&pool_head_lock);
error = sysctl_copyout(l, &data, oldp, sizeof(data));
mutex_enter(&pool_head_lock);
if (--pp->pr_refcnt == 0)
cv_broadcast(&pool_busy);
if (error)
break;
written += sizeof(data);
oldp = (char *)oldp + sizeof(data);
}
mutex_exit(&pool_head_lock);
*oldlenp = written;
return error;
}
SYSCTL_SETUP(sysctl_pool_setup, "sysctl kern.pool setup")
{
const struct sysctlnode *rnode = NULL;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "pool",
SYSCTL_DESCR("Get pool statistics"),
pool_sysctl, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
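/*
 * Illustrative sketch only (not compiled; header locations and field use
 * assumed): how a userland consumer could read the kern.pool node created
 * above.  It first probes with a NULL buffer, which makes pool_sysctl()
 * report only the total size, then reads the array of struct pool_sysctl
 * records.
 */
#if 0
#include <sys/sysctl.h>
#include <sys/pool.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static void
dump_pools(void)
{
	struct pool_sysctl *data;
	size_t len, i;

	/* First call: learn how much space the full report needs. */
	if (sysctlbyname("kern.pool", NULL, &len, NULL, 0) == -1)
		return;
	data = malloc(len);
	if (data == NULL)
		return;
	/* Second call: fetch one record per pool. */
	if (sysctlbyname("kern.pool", data, &len, NULL, 0) == 0) {
		for (i = 0; i < len / sizeof(*data); i++)
			printf("%s: %ju gets, %ju puts\n", data[i].pr_wchan,
			    (uintmax_t)data[i].pr_nget,
			    (uintmax_t)data[i].pr_nput);
	}
	free(data);
}
#endif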
/* $NetBSD: kern_lwp.c,v 1.269 2023/12/20 21:03:50 andvar Exp $ */
/*-
* Copyright (c) 2001, 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Nathan J. Williams, and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Overview
*
* Lightweight processes (LWPs) are the basic unit or thread of
* execution within the kernel. The core state of an LWP is described
* by "struct lwp", also known as lwp_t.
*
* Each LWP is contained within a process (described by "struct proc").
* Every process contains at least one LWP, but may contain more. The
* process describes attributes shared among all of its LWPs such as a
* private address space, global execution state (stopped, active,
* zombie, ...), signal disposition and so on. On a multiprocessor
* machine, multiple LWPs may be executing concurrently in the kernel.
*
* Execution states
*
* At any given time, an LWP has overall state that is described by
* lwp::l_stat. The states are broken into two sets below. The first
* set is guaranteed to represent the absolute, current state of the
* LWP:
*
* LSONPROC
*
* On processor: the LWP is executing on a CPU, either in the
* kernel or in user space.
*
* LSRUN
*
* Runnable: the LWP is parked on a run queue, and may soon be
* chosen to run by an idle processor, or by a processor that
* has been asked to preempt a currently running but lower
* priority LWP.
*
* LSIDL
*
* Idle: the LWP has been created but has not yet executed, or
* it has ceased executing a unit of work and is waiting to be
* started again. This state exists so that the LWP can occupy
* a slot in the process & PID table, but without having to
* worry about being touched; lookups of the LWP by ID will
* fail while in this state. The LWP will become visible for
* lookup once its state transitions further. Some special
* kernel threads also (ab)use this state to indicate that they
* are idle (soft interrupts and idle LWPs).
*
* LSSUSPENDED:
*
* Suspended: the LWP has had its execution suspended by
* another LWP in the same process using the _lwp_suspend()
* system call. User-level LWPs also enter the suspended
* state when the system is shutting down.
*
* The second set represents a "statement of intent" on behalf of the
* LWP. The LWP may in fact be executing on a processor, or may be
* sleeping or idle. It is expected to take the necessary action to
* stop executing or become "running" again within a short timeframe.
* The LP_RUNNING flag in lwp::l_pflag indicates that an LWP is running.
* Importantly, it indicates that its state is tied to a CPU.
*
* LSZOMB:
*
* Dead or dying: the LWP has released most of its resources
* and is about to switch away into oblivion, or has already
* switched away. When it switches away, its few remaining
* resources can be collected.
*
* LSSLEEP:
*
* Sleeping: the LWP has entered itself onto a sleep queue, and
* has switched away or will switch away shortly to allow other
* LWPs to run on the CPU.
*
* LSSTOP:
*
* Stopped: the LWP has been stopped as a result of a job
* control signal, or as a result of the ptrace() interface.
*
* Stopped LWPs may run briefly within the kernel to handle
* signals that they receive, but will not return to user space
* until their process' state is changed away from stopped.
*
* Single LWPs within a process can not be set stopped
* selectively: all actions that can stop or continue LWPs
* occur at the process level.
*
* State transitions
*
* Note that the LSSTOP state may only be set when returning to
* user space in userret(), or when sleeping interruptibly. The
* LSSUSPENDED state may only be set in userret(). Before setting
* those states, we try to ensure that the LWPs will release all
* locks that they hold, and at a minimum try to ensure that the
* LWP can be set runnable again by a signal.
*
* LWPs may transition states in the following ways:
*
* RUN -------> ONPROC
*
* ONPROC ----> RUN
* > SLEEP
* > STOPPED
* > SUSPENDED
* > ZOMB
* > IDL (special cases)
*
* STOPPED ---> RUN
* > SLEEP
*
* SUSPENDED -> RUN
*
* SLEEP -----> ONPROC
* > RUN
* > STOPPED
*
* IDL -------> RUN
* > SUSPENDED
* > STOPPED
* > ONPROC (special cases)
*
* Some state transitions are only possible with kernel threads (eg
* ONPROC -> IDL) and happen under tightly controlled circumstances
* free of unwanted side effects.
*
* Migration
*
* Migration of threads from one CPU to another could be performed
* internally by the scheduler via sched_takecpu() or sched_catchlwp()
* functions. The universal lwp_migrate() function should be used for
* any other cases. Subsystems in the kernel must be aware that the
* CPU of an LWP may change while it is not locked.
*
* Locking
*
* The majority of fields in 'struct lwp' are covered by a single,
* general spin lock pointed to by lwp::l_mutex. The locks covering
* each field are documented in sys/lwp.h.
*
* State transitions must be made with the LWP's general lock held,
* and may cause the LWP's lock pointer to change. Manipulation of
* the general lock is not performed directly, but through calls to
* lwp_lock(), lwp_unlock() and others. It should be noted that the
* adaptive locks are not allowed to be released while the LWP's lock
* is being held (unlike for other spin-locks).
*
* States and their associated locks:
*
* LSIDL, LSONPROC, LSZOMB, LSSUSPENDED:
*
* Always covered by spc_lwplock, which protects LWPs not
* associated with any other sync object. This is a per-CPU
* lock and matches lwp::l_cpu.
*
* LSRUN:
*
* Always covered by spc_mutex, which protects the run queues.
* This is a per-CPU lock and matches lwp::l_cpu.
*
* LSSLEEP:
*
* Covered by a lock associated with the sleep queue (sometimes
* a turnstile sleep queue) that the LWP resides on. This can
* be spc_lwplock for SOBJ_SLEEPQ_NULL (an "untracked" sleep).
*
* LSSTOP:
*
* If the LWP was previously sleeping (l_wchan != NULL), then
* l_mutex references the sleep queue lock. If the LWP was
* runnable or on the CPU when halted, or has been removed from
* the sleep queue since halted, then the lock is spc_lwplock.
*
* The lock order is as follows:
*
* sleepq -> turnstile -> spc_lwplock -> spc_mutex
*
* Each process has a scheduler state lock (proc::p_lock), and a
* number of counters on LWPs and their states: p_nzlwps, p_nrlwps, and
* so on. When an LWP is to be entered into or removed from one of the
* following states, p_lock must be held and the process wide counters
* adjusted:
*
* LSIDL, LSZOMB, LSSTOP, LSSUSPENDED
*
* (But not always for kernel threads. There are some special cases
* as mentioned above: soft interrupts, and the idle loops.)
*
* Note that an LWP is considered running or likely to run soon if in
* one of the following states. This affects the value of p_nrlwps:
*
* LSRUN, LSONPROC, LSSLEEP
*
* p_lock does not need to be held when transitioning among these
* three states, hence p_lock is rarely taken for state transitions.
*/
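/*
 * A minimal sketch (illustrative only) of the usual pattern for examining
 * or changing another LWP's state under the rules above; see lwp_suspend()
 * and lwp_unstop() below for real uses:
 *
 *	mutex_enter(p->p_lock);
 *	lwp_lock(l);
 *	... inspect or adjust l->l_stat, l->l_flag ...
 *	lwp_unlock(l);
 *	mutex_exit(p->p_lock);
 *
 * lwp_lock() takes whichever lock l_mutex currently points at; because
 * the pointer can change across state transitions, the general lock is
 * always manipulated through these wrappers.
 */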
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_lwp.c,v 1.269 2023/12/20 21:03:50 andvar Exp $");
#include "opt_ddb.h"
#include "opt_lockdebug.h"
#include "opt_dtrace.h"
#define _LWP_API_PRIVATE
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cprng.h>
#include <sys/cpu.h>
#include <sys/dtrace_bsd.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/futex.h>
#include <sys/intr.h>
#include <sys/kauth.h>
#include <sys/kcov.h>
#include <sys/kmem.h>
#include <sys/lockdebug.h>
#include <sys/lwpctl.h>
#include <sys/msan.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/psref.h>
#include <sys/ptrace.h>
#include <sys/sdt.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/syscall_stats.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uidinfo.h>
#include <sys/xcall.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
static pool_cache_t lwp_cache __read_mostly;
struct lwplist alllwp __cacheline_aligned;
static int lwp_ctor(void *, void *, int);
static void lwp_dtor(void *, void *);
/* DTrace proc provider probes */
SDT_PROVIDER_DEFINE(proc);
SDT_PROBE_DEFINE1(proc, kernel, , lwp__create, "struct lwp *");
SDT_PROBE_DEFINE1(proc, kernel, , lwp__start, "struct lwp *");
SDT_PROBE_DEFINE1(proc, kernel, , lwp__exit, "struct lwp *");
struct turnstile turnstile0 __cacheline_aligned;
struct lwp lwp0 __aligned(MIN_LWP_ALIGNMENT) = {
#ifdef LWP0_CPU_INFO
.l_cpu = LWP0_CPU_INFO,
#endif
#ifdef LWP0_MD_INITIALIZER
.l_md = LWP0_MD_INITIALIZER,
#endif
.l_proc = &proc0,
.l_lid = 0, /* we own proc0's slot in the pid table */
.l_flag = LW_SYSTEM,
.l_stat = LSONPROC,
.l_ts = &turnstile0,
.l_syncobj = &sched_syncobj,
.l_refcnt = 0,
.l_priority = PRI_USER + NPRI_USER - 1,
.l_inheritedprio = -1,
.l_class = SCHED_OTHER,
.l_psid = PS_NONE,
.l_pi_lenders = SLIST_HEAD_INITIALIZER(&lwp0.l_pi_lenders),
.l_name = __UNCONST("swapper"),
.l_fd = &filedesc0,
};
static int
lwp_maxlwp(void)
{
/* Assume 1 LWP per 1MiB. */
uint64_t lwps_per = ctob(physmem) / (1024 * 1024);
return MAX(MIN(MAXMAXLWP, lwps_per), MAXLWP);
}
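/*
 * For example (figures illustrative only): with 4 GiB of physical memory,
 * lwps_per is 4096, so the default becomes 4096 provided that value lies
 * within the [MAXLWP, MAXMAXLWP] clamp applied above; machines with very
 * little or very much memory get the respective bound instead.
 */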
static int sysctl_kern_maxlwp(SYSCTLFN_PROTO);
/*
* sysctl helper routine for kern.maxlwp. Ensures that the new
* values are not too low or too high.
*/
static int
sysctl_kern_maxlwp(SYSCTLFN_ARGS)
{
int error, nmaxlwp;
struct sysctlnode node;
nmaxlwp = maxlwp;
node = *rnode;
node.sysctl_data = &nmaxlwp;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (nmaxlwp < 0 || nmaxlwp >= MAXMAXLWP)
return EINVAL;
if (nmaxlwp > lwp_maxlwp())
return EINVAL;
maxlwp = nmaxlwp;
return 0;
}
static void
sysctl_kern_lwp_setup(void)
{
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxlwp",
SYSCTL_DESCR("Maximum number of simultaneous threads"),
sysctl_kern_maxlwp, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
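/*
 * The resulting node can be inspected and, within the bounds enforced by
 * sysctl_kern_maxlwp() above, adjusted at run time with sysctl(8), e.g.:
 *
 *	$ sysctl kern.maxlwp
 *	# sysctl -w kern.maxlwp=4096
 */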
void
lwpinit(void)
{
LIST_INIT(&alllwp);
lwpinit_specificdata();
/*
* Provide a barrier to ensure that all mutex_oncpu() and rw_oncpu()
* calls will exit before memory of LWPs is returned to the pool, where
* KVA of LWP structure might be freed and re-used for other purposes.
* Kernel preemption is disabled around mutex_oncpu() and rw_oncpu()
* callers, therefore a regular passive serialization barrier will
* do the job.
*/
lwp_cache = pool_cache_init(sizeof(lwp_t), MIN_LWP_ALIGNMENT, 0,
PR_PSERIALIZE, "lwppl", NULL, IPL_NONE, lwp_ctor, lwp_dtor, NULL);
maxlwp = lwp_maxlwp();
sysctl_kern_lwp_setup();
}
void
lwp0_init(void)
{
struct lwp *l = &lwp0;
KASSERT((void *)uvm_lwp_getuarea(l) != NULL);
LIST_INSERT_HEAD(&alllwp, l, l_list);
callout_init(&l->l_timeout_ch, CALLOUT_MPSAFE);
callout_setfunc(&l->l_timeout_ch, sleepq_timeout, l);
cv_init(&l->l_sigcv, "sigwait");
cv_init(&l->l_waitcv, "vfork");
l->l_cred = kauth_cred_hold(proc0.p_cred);
kdtrace_thread_ctor(NULL, l);
lwp_initspecific(l);
SYSCALL_TIME_LWP_INIT(l);
}
/*
* Initialize the non-zeroed portion of an lwp_t.
*/
static int
lwp_ctor(void *arg, void *obj, int flags)
{
lwp_t *l = obj;
l->l_stat = LSIDL;
l->l_cpu = curcpu();
l->l_mutex = l->l_cpu->ci_schedstate.spc_lwplock;
l->l_ts = kmem_alloc(sizeof(*l->l_ts), flags == PR_WAITOK ?
KM_SLEEP : KM_NOSLEEP);
if (l->l_ts == NULL) {
return ENOMEM;
} else {
turnstile_ctor(l->l_ts);
return 0;
}
}
static void
lwp_dtor(void *arg, void *obj)
{
lwp_t *l = obj;
/*
* The value of l->l_cpu must still be valid at this point.
*/
KASSERT(l->l_cpu != NULL);
/*
* We can't return turnstile0 to the pool (it didn't come from it),
* so if it comes up just drop it quietly and move on.
*/
if (l->l_ts != &turnstile0)
kmem_free(l->l_ts, sizeof(*l->l_ts));
}
/*
* Set an LWP suspended.
*
* Must be called with p_lock held, and the LWP locked. Will unlock the
* LWP before return.
*/
int
lwp_suspend(struct lwp *curl, struct lwp *t)
{
int error;
KASSERT(mutex_owned(t->l_proc->p_lock));
KASSERT(lwp_locked(t, NULL));
KASSERT(curl != t || curl->l_stat == LSONPROC);
/*
* If the current LWP has been told to exit, we must not suspend anyone
* else or deadlock could occur. We won't return to userspace.
*/
if ((curl->l_flag & (LW_WEXIT | LW_WCORE)) != 0) {
lwp_unlock(t);
return (EDEADLK);
}
if ((t->l_flag & LW_DBGSUSPEND) != 0) {
lwp_unlock(t);
return 0;
}
error = 0;
switch (t->l_stat) {
case LSRUN:
case LSONPROC:
t->l_flag |= LW_WSUSPEND;
lwp_need_userret(t);
lwp_unlock(t);
break;
case LSSLEEP:
t->l_flag |= LW_WSUSPEND;
lwp_need_userret(t);
/*
* Kick the LWP and try to get it to the kernel boundary
* so that it will release any locks that it holds.
* setrunnable() will release the lock.
*/
if ((t->l_flag & LW_SINTR) != 0)
setrunnable(t);
else
lwp_unlock(t);
break;
case LSSUSPENDED:
lwp_unlock(t);
break;
case LSSTOP:
t->l_flag |= LW_WSUSPEND;
lwp_need_userret(t);
setrunnable(t);
break;
case LSIDL:
case LSZOMB:
error = EINTR; /* It's what Solaris does..... */
lwp_unlock(t);
break;
}
return (error);
}
/*
* Restart a suspended LWP.
*
* Must be called with p_lock held, and the LWP locked. Will unlock the
* LWP before return.
*/
void
lwp_continue(struct lwp *l)
{
KASSERT(mutex_owned(l->l_proc->p_lock));
KASSERT(lwp_locked(l, NULL));
/* If rebooting or not suspended, then just bail out. */
if ((l->l_flag & LW_WREBOOT) != 0) {
lwp_unlock(l);
return;
}
l->l_flag &= ~LW_WSUSPEND;
if (l->l_stat != LSSUSPENDED || (l->l_flag & LW_DBGSUSPEND) != 0) {
lwp_unlock(l);
return;
}
/* setrunnable() will release the lock. */
setrunnable(l);
}
/*
* Restart a stopped LWP.
*
* Must be called with p_lock held, and the LWP NOT locked. Will unlock the
* LWP before return.
*/
void
lwp_unstop(struct lwp *l)
{
struct proc *p = l->l_proc;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
lwp_lock(l);
KASSERT((l->l_flag & LW_DBGSUSPEND) == 0);
/* If not stopped, then just bail out. */
if (l->l_stat != LSSTOP) {
lwp_unlock(l);
return;
}
p->p_stat = SACTIVE;
p->p_sflag &= ~PS_STOPPING;
if (!p->p_waited)
p->p_pptr->p_nstopchild--;
if (l->l_wchan == NULL) {
/* setrunnable() will release the lock. */
setrunnable(l);
} else if (p->p_xsig && (l->l_flag & LW_SINTR) != 0) {
/* setrunnable() so we can receive the signal */
setrunnable(l);
} else {
l->l_stat = LSSLEEP;
p->p_nrlwps++;
lwp_unlock(l);
}
}
/*
* Wait for an LWP within the current process to exit. If 'lid' is
* non-zero, we are waiting for a specific LWP.
*
* Must be called with p->p_lock held.
*/
int
lwp_wait(struct lwp *l, lwpid_t lid, lwpid_t *departed, bool exiting)
{
const lwpid_t curlid = l->l_lid;
proc_t *p = l->l_proc;
lwp_t *l2, *next;
int error;
KASSERT(mutex_owned(p->p_lock));
p->p_nlwpwait++;
l->l_waitingfor = lid;
for (;;) {
int nfound;
/*
* Avoid a race between exit1() and sigexit(): if the
* process is dumping core, then we need to bail out: call
* into lwp_userret() where we will be suspended until the
* deed is done.
*/
if ((p->p_sflag & PS_WCORE) != 0) {
mutex_exit(p->p_lock);
lwp_userret(l);
KASSERT(false);
}
/*
* First off, drain any detached LWP that is waiting to be
* reaped.
*/
if ((l2 = p->p_zomblwp) != NULL) {
p->p_zomblwp = NULL;
lwp_free(l2, false, false); /* releases proc mutex */
mutex_enter(p->p_lock);
continue;
}
/*
* Now look for an LWP to collect. If the whole process is
* exiting, count detached LWPs as eligible to be collected,
* but don't drain them here.
*/
nfound = 0;
error = 0;
/*
* If given a specific LID, go via pid_table and make sure
* it's not detached.
*/
if (lid != 0) {
l2 = proc_find_lwp(p, lid);
if (l2 == NULL) {
error = ESRCH;
break;
}
KASSERT(l2->l_lid == lid);
if ((l2->l_prflag & LPR_DETACHED) != 0) {
error = EINVAL;
break;
}
} else {
l2 = LIST_FIRST(&p->p_lwps);
}
for (; l2 != NULL; l2 = next) {
next = (lid != 0 ? NULL : LIST_NEXT(l2, l_sibling));
/*
* If a specific wait and the target is waiting on
* us, then avoid deadlock. This also traps LWPs
* that try to wait on themselves.
*
* Note that this does not handle more complicated
* cycles, like: t1 -> t2 -> t3 -> t1. The process
* can still be killed so it is not a major problem.
*/
if (l2->l_lid == lid && l2->l_waitingfor == curlid) {
error = EDEADLK;
break;
}
if (l2 == l)
continue;
if ((l2->l_prflag & LPR_DETACHED) != 0) {
nfound += exiting;
continue;
}
if (lid != 0) {
/*
* Mark this LWP as the first waiter, if there
* is no other.
*/
if (l2->l_waiter == 0)
l2->l_waiter = curlid;
} else if (l2->l_waiter != 0) {
/*
* It already has a waiter - so don't
* collect it. If the waiter doesn't
* grab it we'll get another chance
* later.
*/
nfound++;
continue;
}
nfound++;
/* No need to lock the LWP in order to see LSZOMB. */
if (l2->l_stat != LSZOMB)
continue;
/*
* We're no longer waiting. Reset the "first waiter"
* pointer on the target, in case it was us.
*/
l->l_waitingfor = 0;
l2->l_waiter = 0;
p->p_nlwpwait--;
if (departed)
*departed = l2->l_lid;
sched_lwp_collect(l2);
/* lwp_free() releases the proc lock. */
lwp_free(l2, false, false);
mutex_enter(p->p_lock);
return 0;
}
if (error != 0)
break;
if (nfound == 0) {
error = ESRCH;
break;
}
/*
* Note: since the lock will be dropped, need to restart on
* wakeup to run all LWPs again, e.g. there may be new LWPs.
*/
if (exiting) {
KASSERT(p->p_nlwps > 1);
error = cv_timedwait(&p->p_lwpcv, p->p_lock, 1);
break;
}
/*
* Break out if all LWPs are in _lwp_wait(). There are
* other ways to hang the process with _lwp_wait(), but the
* sleep is interruptible so there is little point in checking for them.
*/
if (p->p_nlwpwait == p->p_nlwps) {
error = EDEADLK;
break;
}
/*
* Sit around and wait for something to happen. We'll be
* awoken if any of the conditions examined change: if an
* LWP exits, is collected, or is detached.
*/
if ((error = cv_wait_sig(&p->p_lwpcv, p->p_lock)) != 0)
break;
}
/*
* We didn't find any LWPs to collect, we may have received a
* signal, or some other condition has caused us to bail out.
*
* If waiting on a specific LWP, clear the waiters marker: some
* other LWP may want it. Then, kick all the remaining waiters
* so that they can re-check for zombies and for deadlock.
*/
if (lid != 0) {
l2 = proc_find_lwp(p, lid);
KASSERT(l2 == NULL || l2->l_lid == lid);
if (l2 != NULL && l2->l_waiter == curlid)
l2->l_waiter = 0;
}
p->p_nlwpwait--;
l->l_waitingfor = 0;
cv_broadcast(&p->p_lwpcv);
return error;
}
/*
* Create a new LWP within process 'p2', using LWP 'l1' as a template.
* The new LWP is created in state LSIDL and must be set running,
* suspended, or stopped by the caller.
*/
int
lwp_create(lwp_t *l1, proc_t *p2, vaddr_t uaddr, int flags,
void *stack, size_t stacksize, void (*func)(void *), void *arg,
lwp_t **rnewlwpp, int sclass, const sigset_t *sigmask,
const stack_t *sigstk)
{
struct lwp *l2;
KASSERT(l1 == curlwp || l1->l_proc == &proc0);
/*
* Enforce limits, excluding the first lwp and kthreads. We must
* use the process credentials here when adjusting the limit, as
* they are what's tied to the accounting entity. However for
* authorizing the action, we'll use the LWP's credentials.
*/
mutex_enter(p2->p_lock);
if (p2->p_nlwps != 0 && p2 != &proc0) {
uid_t uid = kauth_cred_getuid(p2->p_cred);
int count = chglwpcnt(uid, 1);
if (__predict_false(count >
p2->p_rlimit[RLIMIT_NTHR].rlim_cur)) {
if (kauth_authorize_process(l1->l_cred,
KAUTH_PROCESS_RLIMIT, p2,
KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
&p2->p_rlimit[RLIMIT_NTHR], KAUTH_ARG(RLIMIT_NTHR))
!= 0) {
(void)chglwpcnt(uid, -1);
mutex_exit(p2->p_lock);
return EAGAIN;
}
}
}
/*
* First off, reap any detached LWP waiting to be collected.
* We can re-use its LWP structure and turnstile.
*/
if ((l2 = p2->p_zomblwp) != NULL) {
p2->p_zomblwp = NULL;
lwp_free(l2, true, false);
/* p2 now unlocked by lwp_free() */
KASSERT(l2->l_ts != NULL);
KASSERT(l2->l_inheritedprio == -1);
KASSERT(SLIST_EMPTY(&l2->l_pi_lenders));
memset(&l2->l_startzero, 0, sizeof(*l2) -
offsetof(lwp_t, l_startzero));
} else {
mutex_exit(p2->p_lock);
l2 = pool_cache_get(lwp_cache, PR_WAITOK);
memset(&l2->l_startzero, 0, sizeof(*l2) -
offsetof(lwp_t, l_startzero));
SLIST_INIT(&l2->l_pi_lenders);
}
/*
* Because of lockless lookup via pid_table, the LWP can be locked
* and inspected briefly even after it's freed, so a few fields are
* kept stable.
*/
KASSERT(l2->l_stat == LSIDL);
KASSERT(l2->l_cpu != NULL);
KASSERT(l2->l_ts != NULL);
KASSERT(l2->l_mutex == l2->l_cpu->ci_schedstate.spc_lwplock);
l2->l_proc = p2;
l2->l_refcnt = 0;
l2->l_class = sclass;
/*
* Allocate a process ID for this LWP. We need to do this now
* while we can still unwind if it fails. Because we're marked
* as LSIDL, no lookups by the ID will succeed.
*
* N.B. this will always succeed for the first LWP in a process,
* because proc_alloc_lwpid() will usurp the slot. Also note
* that l2->l_proc MUST be valid so that lookups of the proc
* will succeed, even if the LWP itself is not visible.
*/
if (__predict_false(proc_alloc_lwpid(p2, l2) == -1)) {
pool_cache_put(lwp_cache, l2);
return EAGAIN;
}
/*
* If vfork(), we want the LWP to run fast and on the same CPU
* as its parent, so that it can reuse the VM context and cache
* footprint on the local CPU.
*/
l2->l_boostpri = ((flags & LWP_VFORK) ? PRI_KERNEL : PRI_USER);
l2->l_priority = l1->l_priority;
l2->l_inheritedprio = -1;
l2->l_protectprio = -1;
l2->l_auxprio = -1;
l2->l_flag = 0;
l2->l_pflag = LP_MPSAFE;
TAILQ_INIT(&l2->l_ld_locks);
l2->l_psrefs = 0;
kmsan_lwp_alloc(l2);
/*
* For vfork, borrow parent's lwpctl context if it exists.
* This also causes us to return via lwp_userret.
*/
if (flags & LWP_VFORK && l1->l_lwpctl) {
l2->l_lwpctl = l1->l_lwpctl;
l2->l_flag |= LW_LWPCTL;
}
/*
* If not the first LWP in the process, grab a reference to the
* descriptor table.
*/
l2->l_fd = p2->p_fd;
if (p2->p_nlwps != 0) {
KASSERT(l1->l_proc == p2);
fd_hold(l2);
} else {
KASSERT(l1->l_proc != p2);
}
if (p2->p_flag & PK_SYSTEM) {
/* Mark it as a system LWP. */
l2->l_flag |= LW_SYSTEM;
}
kdtrace_thread_ctor(NULL, l2);
lwp_initspecific(l2);
sched_lwp_fork(l1, l2);
callout_init(&l2->l_timeout_ch, CALLOUT_MPSAFE);
callout_setfunc(&l2->l_timeout_ch, sleepq_timeout, l2);
cv_init(&l2->l_sigcv, "sigwait");
cv_init(&l2->l_waitcv, "vfork");
l2->l_syncobj = &sched_syncobj;
PSREF_DEBUG_INIT_LWP(l2);
if (rnewlwpp != NULL)
*rnewlwpp = l2;
/*
* PCU state needs to be saved before calling uvm_lwp_fork() so that
* the MD cpu_lwp_fork() can copy the saved state to the new LWP.
*/
pcu_save_all(l1);
#if PCU_UNIT_COUNT > 0
l2->l_pcu_valid = l1->l_pcu_valid;
#endif
uvm_lwp_setuarea(l2, uaddr);
uvm_lwp_fork(l1, l2, stack, stacksize, func, (arg != NULL) ? arg : l2);
mutex_enter(p2->p_lock);
l2->l_cred = kauth_cred_hold(p2->p_cred);
if ((flags & LWP_DETACHED) != 0) {
l2->l_prflag = LPR_DETACHED;
p2->p_ndlwps++;
} else
l2->l_prflag = 0;
if (l1->l_proc == p2) {
/*
* These flags are set while p_lock is held. Copy with
* p_lock held too, so the LWP doesn't sneak into the
* process without them being set.
*/
l2->l_flag |= (l1->l_flag & (LW_WEXIT | LW_WREBOOT | LW_WCORE));
} else {
/* fork(): pending core/exit doesn't apply to child. */
l2->l_flag |= (l1->l_flag & LW_WREBOOT);
}
l2->l_sigstk = *sigstk;
l2->l_sigmask = *sigmask;
TAILQ_INIT(&l2->l_sigpend.sp_info);
sigemptyset(&l2->l_sigpend.sp_set);
LIST_INSERT_HEAD(&p2->p_lwps, l2, l_sibling);
p2->p_nlwps++;
p2->p_nrlwps++;
KASSERT(l2->l_affinity == NULL);
/* Inherit the affinity mask. */
if (l1->l_affinity) {
/*
* Note that we hold the state lock while inheriting
* the affinity to avoid race with sched_setaffinity().
*/
lwp_lock(l1);
if (l1->l_affinity) {
kcpuset_use(l1->l_affinity);
l2->l_affinity = l1->l_affinity;
}
lwp_unlock(l1);
}
/* Ensure a trip through lwp_userret() if needed. */
if ((l2->l_flag & LW_USERRET) != 0) {
lwp_need_userret(l2);
}
/* This marks the end of the "must be atomic" section. */
mutex_exit(p2->p_lock);
SDT_PROBE(proc, kernel, , lwp__create, l2, 0, 0, 0, 0);
mutex_enter(&proc_lock);
LIST_INSERT_HEAD(&alllwp, l2, l_list);
/* Inherit a processor-set */
l2->l_psid = l1->l_psid;
mutex_exit(&proc_lock);
SYSCALL_TIME_LWP_INIT(l2);
if (p2->p_emul->e_lwp_fork)
(*p2->p_emul->e_lwp_fork)(l1, l2);
return (0);
}
/*
* Set a new LWP running. If the process is stopping, then the LWP is
* created stopped.
*/
void
lwp_start(lwp_t *l, int flags)
{
proc_t *p = l->l_proc;
mutex_enter(p->p_lock);
lwp_lock(l);
KASSERT(l->l_stat == LSIDL);
if ((flags & LWP_SUSPENDED) != 0) {
/* It'll suspend itself in lwp_userret(). */
l->l_flag |= LW_WSUSPEND;
lwp_need_userret(l);
}
if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
KASSERT(l->l_wchan == NULL);
l->l_stat = LSSTOP;
p->p_nrlwps--;
lwp_unlock(l);
} else {
setrunnable(l);
/* LWP now unlocked */
}
mutex_exit(p->p_lock);
}
/*
* Called by MD code when a new LWP begins execution. Must be called
* with the previous LWP locked (so at splsched), or if there is no
* previous LWP, at splsched.
*/
void
lwp_startup(struct lwp *prev, struct lwp *new_lwp)
{
kmutex_t *lock;
KASSERTMSG(new_lwp == curlwp, "l %p curlwp %p prevlwp %p", new_lwp, curlwp, prev);
KASSERT(kpreempt_disabled());
KASSERT(prev != NULL);
KASSERT((prev->l_pflag & LP_RUNNING) != 0);
KASSERT(curcpu()->ci_mtx_count == -2);
/*
* Immediately mark the previous LWP as no longer running and
* unlock (to keep lock wait times as short as possible). If a
* zombie, don't touch after clearing LP_RUNNING as it could be
* reaped by another CPU. Use atomic_store_release to ensure
* this -- matches atomic_load_acquire in lwp_free.
*/
lock = prev->l_mutex;
if (__predict_false(prev->l_stat == LSZOMB)) {
atomic_store_release(&prev->l_pflag,
prev->l_pflag & ~LP_RUNNING);
} else {
prev->l_pflag &= ~LP_RUNNING;
}
mutex_spin_exit(lock);
/* Correct spin mutex count after mi_switch(). */
curcpu()->ci_mtx_count = 0;
/* Install new VM context. */
if (__predict_true(new_lwp->l_proc->p_vmspace)) {
pmap_activate(new_lwp);
}
/* We remain at IPL_SCHED from mi_switch() - reset it. */
spl0();
LOCKDEBUG_BARRIER(NULL, 0);
SDT_PROBE(proc, kernel, , lwp__start, new_lwp, 0, 0, 0, 0);
/* For kthreads, acquire kernel lock if not MPSAFE. */
if (__predict_false((new_lwp->l_pflag & LP_MPSAFE) == 0)) {
KERNEL_LOCK(1, new_lwp);
}
}
/*
* Exit an LWP.
*
* *** WARNING *** This can be called with (l != curlwp) in error paths.
*/
void
lwp_exit(struct lwp *l)
{
struct proc *p = l->l_proc;
struct lwp *l2;
bool current;
current = (l == curlwp);
KASSERT(current || l->l_stat == LSIDL);
KASSERT(current || l->l_target_cpu == NULL);
KASSERT(p == curproc);
SDT_PROBE(proc, kernel, , lwp__exit, l, 0, 0, 0, 0);
/* Verify that we hold no locks; for DIAGNOSTIC check kernel_lock. */
LOCKDEBUG_BARRIER(NULL, 0);
KASSERTMSG(curcpu()->ci_biglock_count == 0, "kernel_lock leaked");
/*
* If we are the last live LWP in a process, we need to exit the
* entire process. We do so with an exit status of zero, because
* it's a "controlled" exit, and because that's what Solaris does.
*
* We are not quite a zombie yet, but for accounting purposes we
* must increment the count of zombies here.
*
* Note: the last LWP's specificdata will be deleted here.
*/
mutex_enter(p->p_lock);
if (p->p_nlwps - p->p_nzlwps == 1) {
KASSERT(current == true);
KASSERT(p != &proc0);
exit1(l, 0, 0);
/* NOTREACHED */
}
p->p_nzlwps++;
/*
* Perform any required thread cleanup. Do this early so
* anyone wanting to look us up with lwp_getref_lwpid() will
* fail to find us before we become a zombie.
*
* N.B. this will unlock p->p_lock on our behalf.
*/
lwp_thread_cleanup(l);
if (p->p_emul->e_lwp_exit)
(*p->p_emul->e_lwp_exit)(l);
/* Drop filedesc reference. */
fd_free();
/* Release fstrans private data. */
fstrans_lwp_dtor(l);
/* Delete the specificdata while it's still safe to sleep. */
lwp_finispecific(l);
/*
* Release our cached credentials.
*/
kauth_cred_free(l->l_cred);
callout_destroy(&l->l_timeout_ch);
/*
* If traced, report LWP exit event to the debugger.
*
* Remove the LWP from the global list.
* Free its LID from the PID namespace if needed.
*/
mutex_enter(&proc_lock);
if ((p->p_slflag & (PSL_TRACED|PSL_TRACELWP_EXIT)) ==
(PSL_TRACED|PSL_TRACELWP_EXIT)) {
mutex_enter(p->p_lock);
if (ISSET(p->p_sflag, PS_WEXIT)) {
mutex_exit(p->p_lock);
/*
* We are exiting, bail out without informing parent
* about a terminating LWP as it would deadlock.
*/
} else {
eventswitch(TRAP_LWP, PTRACE_LWP_EXIT, l->l_lid);
mutex_enter(&proc_lock);
}
}
LIST_REMOVE(l, l_list);
mutex_exit(&proc_lock);
/*
* Get rid of all references to the LWP that others (e.g. procfs)
* may have, and mark the LWP as a zombie. If the LWP is detached,
* mark it waiting for collection in the proc structure. Note that
* before we can do that, we need to free any other dead, detached
* LWP waiting to meet its maker.
*
* All conditions need to be observed under the same hold of
* p_lock, because if the lock is dropped any of them can change.
*/
mutex_enter(p->p_lock);
for (;;) {
if (lwp_drainrefs(l))
continue;
if ((l->l_prflag & LPR_DETACHED) != 0) {
if ((l2 = p->p_zomblwp) != NULL) {
p->p_zomblwp = NULL;
lwp_free(l2, false, false);
/* proc now unlocked */
mutex_enter(p->p_lock);
continue;
}
p->p_zomblwp = l;
}
break;
}
/*
* If we find a pending signal for the process and we have been
* asked to check for signals, then we lose: arrange to have
* all other LWPs in the process check for signals.
*/
if ((l->l_flag & LW_PENDSIG) != 0 &&
firstsig(&p->p_sigpend.sp_set) != 0) {
LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
lwp_lock(l2);
signotify(l2);
lwp_unlock(l2);
}
}
/*
* Release any PCU resources before becoming a zombie.
*/
pcu_discard_all(l);
lwp_lock(l);
l->l_stat = LSZOMB;
if (l->l_name != NULL) {
strcpy(l->l_name, "(zombie)");
}
lwp_unlock(l);
p->p_nrlwps--;
if (l->l_lwpctl != NULL)
l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;
mutex_exit(p->p_lock);
cv_broadcast(&p->p_lwpcv);
/*
* We can no longer block. At this point, lwp_free() may already
* be gunning for us. On a multi-CPU system, we may be off p_lwps.
*
* Free MD LWP resources.
*/
cpu_lwp_free(l, 0);
if (current) {
/* Switch away into oblivion. */
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
panic("lwp_exit");
}
}
/*
* Free a dead LWP's remaining resources.
*
* XXXLWP limits.
*/
void
lwp_free(struct lwp *l, bool recycle, bool last)
{
struct proc *p = l->l_proc;
struct rusage *ru;
ksiginfoq_t kq;
KASSERT(l != curlwp);
KASSERT(last || mutex_owned(p->p_lock));
/*
* We use the process credentials instead of the lwp credentials here
* because the lwp credentials may be cached (just after a setuid call)
* and we don't want to pay for syncing, since the lwp is going away
* anyway.
*/
if (p != &proc0 && p->p_nlwps != 1)
(void)chglwpcnt(kauth_cred_getuid(p->p_cred), -1);
/*
* In the unlikely event that the LWP is still on the CPU,
* then spin until it has switched away.
*
* atomic_load_acquire matches atomic_store_release in
* lwp_startup and mi_switch.
*/
while (__predict_false((atomic_load_acquire(&l->l_pflag) & LP_RUNNING)
!= 0)) {
SPINLOCK_BACKOFF_HOOK;
}
/*
* Now that the LWP's known off the CPU, reset its state back to
* LSIDL, which defeats anything that might have gotten a hold on
* the LWP via pid_table before the ID was freed. It's important
* to do this with both the LWP locked and p_lock held.
*
* Also reset the CPU and lock pointer back to curcpu(), since the
* LWP will in all likelihood be cached with the current CPU in
* lwp_cache when we free it and later allocated from there again
* (avoid incidental lock contention).
*/
lwp_lock(l);
l->l_stat = LSIDL;
l->l_cpu = curcpu();
lwp_unlock_to(l, l->l_cpu->ci_schedstate.spc_lwplock);
/*
* If this was not the last LWP in the process, then adjust counters
* and unlock. This is done differently for the last LWP in exit1().
*/
if (!last) {
/*
* Add the LWP's run time to the process' base value.
* This needs to coincide with coming off p_lwps.
*/
bintime_add(&p->p_rtime, &l->l_rtime);
p->p_pctcpu += l->l_pctcpu;
ru = &p->p_stats->p_ru;
ruadd(ru, &l->l_ru);
LIST_REMOVE(l, l_sibling);
p->p_nlwps--;
p->p_nzlwps--;
if ((l->l_prflag & LPR_DETACHED) != 0)
p->p_ndlwps--;
mutex_exit(p->p_lock);
/*
* Have any LWPs sleeping in lwp_wait() recheck for
* deadlock.
*/
cv_broadcast(&p->p_lwpcv);
/* Free the LWP ID. */
mutex_enter(&proc_lock);
proc_free_lwpid(p, l->l_lid);
mutex_exit(&proc_lock);
}
/*
* Destroy the LWP's remaining signal information.
*/
ksiginfo_queue_init(&kq);
sigclear(&l->l_sigpend, NULL, &kq);
ksiginfo_queue_drain(&kq);
cv_destroy(&l->l_sigcv);
cv_destroy(&l->l_waitcv);
/*
* Free lwpctl structure and affinity.
*/
if (l->l_lwpctl) {
lwp_ctl_free(l);
}
if (l->l_affinity) {
kcpuset_unuse(l->l_affinity, NULL);
l->l_affinity = NULL;
}
/*
* Free remaining data structures and the LWP itself unless the
* caller wants to recycle.
*/
if (l->l_name != NULL)
kmem_free(l->l_name, MAXCOMLEN);
kmsan_lwp_free(l);
kcov_lwp_free(l);
cpu_lwp_free2(l);
uvm_lwp_exit(l);
KASSERT(SLIST_EMPTY(&l->l_pi_lenders));
KASSERT(l->l_inheritedprio == -1);
KASSERT(l->l_blcnt == 0);
kdtrace_thread_dtor(NULL, l);
if (!recycle)
pool_cache_put(lwp_cache, l);
}
/*
* Migrate the LWP to another CPU. Unlocks the LWP.
*/
void
lwp_migrate(lwp_t *l, struct cpu_info *tci)
{
struct schedstate_percpu *tspc;
int lstat = l->l_stat;
KASSERT(lwp_locked(l, NULL));
KASSERT(tci != NULL);
/* If LWP is still on the CPU, it must be handled like LSONPROC */
if ((l->l_pflag & LP_RUNNING) != 0) {
lstat = LSONPROC;
}
/*
* The destination CPU could be changed while previous migration
* was not finished.
*/
if (l->l_target_cpu != NULL) {
l->l_target_cpu = tci;
lwp_unlock(l);
return;
}
/* Nothing to do if trying to migrate to the same CPU */
if (l->l_cpu == tci) {
lwp_unlock(l);
return;
}
KASSERT(l->l_target_cpu == NULL);
tspc = &tci->ci_schedstate;
switch (lstat) {
case LSRUN:
l->l_target_cpu = tci;
break;
case LSSLEEP:
l->l_cpu = tci;
break;
case LSIDL:
case LSSTOP:
case LSSUSPENDED:
l->l_cpu = tci;
if (l->l_wchan == NULL) {
lwp_unlock_to(l, tspc->spc_lwplock);
return;
}
break;
case LSONPROC:
l->l_target_cpu = tci;
spc_lock(l->l_cpu);
sched_resched_cpu(l->l_cpu, PRI_USER_RT, true);
/* spc now unlocked */
break;
}
lwp_unlock(l);
}
#define lwp_find_exclude(l) \
((l)->l_stat == LSIDL || (l)->l_stat == LSZOMB)
/*
* Find the LWP in the process. Arguments may be zero, in which case
* the calling process and the first LWP in the list will be used.
* On success - returns proc locked.
*
* => pid == 0 -> look in curproc.
* => pid == -1 -> match any proc.
* => otherwise look up the proc.
*
* => lid == 0 -> first LWP in the proc
* => otherwise specific LWP
*/
struct lwp *
lwp_find2(pid_t pid, lwpid_t lid)
{
proc_t *p;
lwp_t *l;
/* First LWP of specified proc. */
if (lid == 0) {
switch (pid) {
case -1:
/* No lookup keys. */
return NULL;
case 0:
p = curproc;
mutex_enter(p->p_lock);
break;
default:
mutex_enter(&proc_lock);
p = proc_find(pid);
if (__predict_false(p == NULL)) {
mutex_exit(&proc_lock);
return NULL;
}
mutex_enter(p->p_lock);
mutex_exit(&proc_lock);
break;
}
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (__predict_true(!lwp_find_exclude(l)))
break;
}
goto out;
}
l = proc_find_lwp_acquire_proc(lid, &p);
if (l == NULL)
return NULL;
KASSERT(p != NULL);
KASSERT(mutex_owned(p->p_lock));
if (__predict_false(lwp_find_exclude(l))) {
l = NULL;
goto out;
}
/* Apply proc filter, if applicable. */
switch (pid) {
case -1:
/* Match anything. */
break;
case 0:
if (p != curproc)
l = NULL;
break;
default:
if (p->p_pid != pid)
l = NULL;
break;
}
out:
if (__predict_false(l == NULL)) {
mutex_exit(p->p_lock);
}
return l;
}
/*
* Look up a live LWP within the specified process.
*
* Must be called with p->p_lock held (as it looks at the radix tree,
* and also wants to exclude idle and zombie LWPs).
*/
struct lwp *
lwp_find(struct proc *p, lwpid_t id)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
l = proc_find_lwp(p, id);
KASSERT(l == NULL || l->l_lid == id);
/*
* No need to lock - all of these conditions will
* be visible with the process level mutex held.
*/
if (__predict_false(l != NULL && lwp_find_exclude(l)))
l = NULL;
return l;
}
/*
* Verify that an LWP is locked, and optionally verify that the lock matches
* one we specify.
*/
int
lwp_locked(struct lwp *l, kmutex_t *mtx)
{
kmutex_t *cur = l->l_mutex;
return mutex_owned(cur) && (mtx == cur || mtx == NULL);
}
/*
* Lend a new mutex to an LWP. The old mutex must be held.
*/
kmutex_t *
lwp_setlock(struct lwp *l, kmutex_t *mtx)
{
kmutex_t *oldmtx = l->l_mutex;
KASSERT(mutex_owned(oldmtx));
atomic_store_release(&l->l_mutex, mtx);
return oldmtx;
}
/*
* Lend a new mutex to an LWP, and release the old mutex. The old mutex
* must be held.
*/
void
lwp_unlock_to(struct lwp *l, kmutex_t *mtx)
{
kmutex_t *old;
KASSERT(lwp_locked(l, NULL));
old = l->l_mutex;
atomic_store_release(&l->l_mutex, mtx);
mutex_spin_exit(old);
}
int
lwp_trylock(struct lwp *l)
{
kmutex_t *old;
for (;;) {
if (!mutex_tryenter(old = atomic_load_consume(&l->l_mutex)))
return 0;
if (__predict_true(atomic_load_relaxed(&l->l_mutex) == old))
return 1;
mutex_spin_exit(old);
}
}
void
lwp_unsleep(lwp_t *l, bool unlock)
{
KASSERT(mutex_owned(l->l_mutex));
(*l->l_syncobj->sobj_unsleep)(l, unlock);
}
/*
* Lock an LWP.
*/
void
lwp_lock(lwp_t *l)
{
kmutex_t *old = atomic_load_consume(&l->l_mutex);
/*
* Note: mutex_spin_enter() will have posted a read barrier.
* Re-test l->l_mutex. If it has changed, we need to try again.
*/
mutex_spin_enter(old);
while (__predict_false(atomic_load_relaxed(&l->l_mutex) != old)) {
mutex_spin_exit(old);
old = atomic_load_consume(&l->l_mutex);
mutex_spin_enter(old);
}
}
/*
* Unlock an LWP.
*/
void
lwp_unlock(lwp_t *l)
{
mutex_spin_exit(l->l_mutex);
}
void
lwp_changepri(lwp_t *l, pri_t pri)
{
KASSERT(mutex_owned(l->l_mutex));
if (l->l_priority == pri)
return;
(*l->l_syncobj->sobj_changepri)(l, pri);
KASSERT(l->l_priority == pri);
}
void
lwp_lendpri(lwp_t *l, pri_t pri)
{
KASSERT(mutex_owned(l->l_mutex));
(*l->l_syncobj->sobj_lendpri)(l, pri);
KASSERT(l->l_inheritedprio == pri);
}
pri_t
lwp_eprio(lwp_t *l)
{
pri_t pri = l->l_priority;
KASSERT(mutex_owned(l->l_mutex));
/*
* Timeshared/user LWPs get a temporary priority boost for blocking
* in kernel. This is key to good interactive response on a loaded
* system: without it, things will seem very sluggish to the user.
*
* The function of the boost is to get the LWP onto a CPU and
* running quickly. Once that happens the LWP loses the priority
* boost and could be preempted very quickly by another LWP but that
* won't happen often enough to be an annoyance.
*/
if (pri <= MAXPRI_USER && l->l_boostpri > MAXPRI_USER)
pri = (pri >> 1) + l->l_boostpri;
return MAX(l->l_auxprio, pri);
}
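/*
 * Worked example of the boost calculation above (illustrative only;
 * the concrete numbers assume a priority layout where MAXPRI_USER is
 * 63): a timeshared LWP at priority 40 that blocked in the kernel with
 * l_boostpri == 72 yields (40 >> 1) + 72 == 92, and with no priority
 * inheritance in effect (l_auxprio == -1) lwp_eprio() returns 92.
 */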
/*
* Handle exceptions for mi_userret(). Called if a member of LW_USERRET is
* set or a preemption is required.
*/
void
lwp_userret(struct lwp *l)
{
struct proc *p;
int sig, f;
KASSERT(l == curlwp);
KASSERT(l->l_stat == LSONPROC);
p = l->l_proc;
for (;;) {
/*
* This is the main location that user preemptions are
* processed.
*/
preempt_point();
/*
* It is safe to do this unlocked and without raised SPL,
* since whenever a flag of interest is added to l_flag the
* LWP will take an AST and come down this path again. If a
* remote CPU posts the AST, it will be done with an IPI
* (strongly synchronising).
*/
if ((f = atomic_load_relaxed(&l->l_flag) & LW_USERRET) == 0) {
return;
}
/*
* Start out with the correct credentials.
*/
if ((f & LW_CACHECRED) != 0) {
kauth_cred_t oc = l->l_cred;
mutex_enter(p->p_lock);
l->l_cred = kauth_cred_hold(p->p_cred);
lwp_lock(l);
l->l_flag &= ~LW_CACHECRED;
lwp_unlock(l);
mutex_exit(p->p_lock);
kauth_cred_free(oc);
}
/*
* Process pending signals first, unless the process
* is dumping core or exiting, where we will instead
* enter the LW_WSUSPEND case below.
*/
if ((f & (LW_PENDSIG | LW_WCORE | LW_WEXIT)) == LW_PENDSIG) {
mutex_enter(p->p_lock);
while ((sig = issignal(l)) != 0)
postsig(sig);
mutex_exit(p->p_lock);
continue;
}
/*
* Core-dump or suspend pending.
*
* In case of core dump, suspend ourselves, so that the kernel
* stack and therefore the userland registers saved in the
* trapframe are around for coredump() to write them out.
* We also need to save any PCU resources that we have so that
* they are accessible to coredump(). We issue a wakeup on
* p->p_lwpcv so that sigexit() will write the core file out
* once all other LWPs are suspended.
*/
if ((f & LW_WSUSPEND) != 0) {
pcu_save_all(l);
mutex_enter(p->p_lock);
p->p_nrlwps--;
lwp_lock(l);
l->l_stat = LSSUSPENDED;
lwp_unlock(l);
mutex_exit(p->p_lock);
cv_broadcast(&p->p_lwpcv);
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
continue;
}
/*
* Process is exiting. The core dump and signal cases must
* be handled first.
*/
if ((f & LW_WEXIT) != 0) {
lwp_exit(l);
KASSERT(0);
/* NOTREACHED */
}
/*
* Update lwpctl processor (for vfork child_return).
*/
if ((f & LW_LWPCTL) != 0) {
lwp_lock(l);
KASSERT(kpreempt_disabled());
l->l_lwpctl->lc_curcpu = (int)cpu_index(l->l_cpu);
l->l_lwpctl->lc_pctr++;
l->l_flag &= ~LW_LWPCTL;
lwp_unlock(l);
continue;
}
}
}
/*
* Force an LWP to enter the kernel, to take a trip through lwp_userret().
*/
void
lwp_need_userret(struct lwp *l)
{
KASSERT(!cpu_intr_p());
KASSERT(lwp_locked(l, NULL) || l->l_stat == LSIDL);
/*
* If the LWP is in any state other than LSONPROC, we know that it
* is executing in-kernel and will hit userret() on the way out.
*
* If the LWP is curlwp, then we know we'll be back out to userspace
* soon (can't be called from a hardware interrupt here).
*
* Otherwise, we can't be sure what the LWP is doing, so first make
* sure the update to l_flag will be globally visible, and then
* force the LWP to take a trip through trap() where it will do
* userret().
*/
if (l->l_stat == LSONPROC && l != curlwp) {
membar_producer();
cpu_signotify(l);
}
}
/*
* Add one reference to an LWP. This will prevent the LWP from
* exiting, thus keeping the lwp structure and PCB around to inspect.
*/
void
lwp_addref(struct lwp *l)
{
KASSERT(mutex_owned(l->l_proc->p_lock));
KASSERT(l->l_stat != LSZOMB);
l->l_refcnt++;
}
/*
* Remove one reference to an LWP. If this is the last reference,
* then we must finalize the LWP's death.
*/
void
lwp_delref(struct lwp *l)
{
struct proc *p = l->l_proc;
mutex_enter(p->p_lock);
lwp_delref2(l);
mutex_exit(p->p_lock);
}
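/*
 * Illustrative sketch (not part of the kernel proper) of the reference
 * pattern used by e.g. procfs when inspecting an LWP: take the
 * reference under p_lock, drop the lock while inspecting, then release
 * the reference. inspect_lwp() is a hypothetical placeholder.
 *
 *	mutex_enter(p->p_lock);
 *	lwp_addref(l);
 *	mutex_exit(p->p_lock);
 *
 *	inspect_lwp(l);
 *
 *	lwp_delref(l);
 */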
/*
* Remove one reference to an LWP. If this is the last reference,
* then we must finalize the LWP's death. The proc mutex is held
* on entry.
*/
void
lwp_delref2(struct lwp *l)
{
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
KASSERT(l->l_stat != LSZOMB);
KASSERT(l->l_refcnt > 0);
if (--l->l_refcnt == 0)
cv_broadcast(&p->p_lwpcv);
}
/*
* Drain all references to the current LWP. Returns true if
* we blocked.
*/
bool
lwp_drainrefs(struct lwp *l)
{
struct proc *p = l->l_proc;
bool rv = false;
KASSERT(mutex_owned(p->p_lock));
l->l_prflag |= LPR_DRAINING;
while (l->l_refcnt > 0) {
rv = true;
cv_wait(&p->p_lwpcv, p->p_lock);
}
return rv;
}
/*
* Return true if the specified LWP is 'alive'. Only p->p_lock need
* be held.
*/
bool
lwp_alive(lwp_t *l)
{
KASSERT(mutex_owned(l->l_proc->p_lock));
switch (l->l_stat) {
case LSSLEEP:
case LSRUN:
case LSONPROC:
case LSSTOP:
case LSSUSPENDED:
return true;
default:
return false;
}
}
/*
* Return first live LWP in the process.
*/
lwp_t *
lwp_find_first(proc_t *p)
{
lwp_t *l;
KASSERT(mutex_owned(p->p_lock));
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (lwp_alive(l)) {
return l;
}
}
return NULL;
}
/*
* Allocate a new lwpctl structure for a user LWP.
*/
int
lwp_ctl_alloc(vaddr_t *uaddr)
{
lcproc_t *lp;
u_int bit, i, offset;
struct uvm_object *uao;
int error;
lcpage_t *lcp;
proc_t *p;
lwp_t *l;
l = curlwp;
p = l->l_proc;
/* don't allow a vforked process to create lwp ctls */
if (p->p_lflag & PL_PPWAIT)
return EBUSY;
if (l->l_lcpage != NULL) {
lcp = l->l_lcpage;
*uaddr = lcp->lcp_uaddr + (vaddr_t)l->l_lwpctl - lcp->lcp_kaddr;
return 0;
}
/* First time around, allocate header structure for the process. */
if ((lp = p->p_lwpctl) == NULL) {
lp = kmem_alloc(sizeof(*lp), KM_SLEEP);
mutex_init(&lp->lp_lock, MUTEX_DEFAULT, IPL_NONE);
lp->lp_uao = NULL;
TAILQ_INIT(&lp->lp_pages);
mutex_enter(p->p_lock);
if (p->p_lwpctl == NULL) {
p->p_lwpctl = lp;
mutex_exit(p->p_lock);
} else {
mutex_exit(p->p_lock);
mutex_destroy(&lp->lp_lock);
kmem_free(lp, sizeof(*lp));
lp = p->p_lwpctl;
}
}
/*
* Set up an anonymous memory region to hold the shared pages.
* Map them into the process' address space. The user vmspace
* gets the first reference on the UAO.
*/
mutex_enter(&lp->lp_lock);
if (lp->lp_uao == NULL) {
lp->lp_uao = uao_create(LWPCTL_UAREA_SZ, 0);
lp->lp_cur = 0;
lp->lp_max = LWPCTL_UAREA_SZ;
lp->lp_uva = p->p_emul->e_vm_default_addr(p,
(vaddr_t)p->p_vmspace->vm_daddr, LWPCTL_UAREA_SZ,
p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
error = uvm_map(&p->p_vmspace->vm_map, &lp->lp_uva,
LWPCTL_UAREA_SZ, lp->lp_uao, 0, 0, UVM_MAPFLAG(UVM_PROT_RW,
UVM_PROT_RW, UVM_INH_NONE, UVM_ADV_NORMAL, 0));
if (error != 0) {
uao_detach(lp->lp_uao);
lp->lp_uao = NULL;
mutex_exit(&lp->lp_lock);
return error;
}
}
/* Get a free block and allocate for this LWP. */
TAILQ_FOREACH(lcp, &lp->lp_pages, lcp_chain) {
if (lcp->lcp_nfree != 0)
break;
}
if (lcp == NULL) {
/* Nothing available - try to set up a free page. */
if (lp->lp_cur == lp->lp_max) {
mutex_exit(&lp->lp_lock);
return ENOMEM;
}
lcp = kmem_alloc(LWPCTL_LCPAGE_SZ, KM_SLEEP);
/*
* Wire the next page down in kernel space. Since this
* is a new mapping, we must add a reference.
*/
uao = lp->lp_uao;
(*uao->pgops->pgo_reference)(uao);
lcp->lcp_kaddr = vm_map_min(kernel_map);
error = uvm_map(kernel_map, &lcp->lcp_kaddr, PAGE_SIZE,
uao, lp->lp_cur, PAGE_SIZE,
UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
UVM_INH_NONE, UVM_ADV_RANDOM, 0));
if (error != 0) {
mutex_exit(&lp->lp_lock);
kmem_free(lcp, LWPCTL_LCPAGE_SZ);
(*uao->pgops->pgo_detach)(uao);
return error;
}
error = uvm_map_pageable(kernel_map, lcp->lcp_kaddr,
lcp->lcp_kaddr + PAGE_SIZE, FALSE, 0);
if (error != 0) {
mutex_exit(&lp->lp_lock);
uvm_unmap(kernel_map, lcp->lcp_kaddr,
lcp->lcp_kaddr + PAGE_SIZE);
kmem_free(lcp, LWPCTL_LCPAGE_SZ);
return error;
}
/* Prepare the page descriptor and link into the list. */
lcp->lcp_uaddr = lp->lp_uva + lp->lp_cur;
lp->lp_cur += PAGE_SIZE;
lcp->lcp_nfree = LWPCTL_PER_PAGE;
lcp->lcp_rotor = 0;
memset(lcp->lcp_bitmap, 0xff, LWPCTL_BITMAP_SZ);
TAILQ_INSERT_HEAD(&lp->lp_pages, lcp, lcp_chain);
}
for (i = lcp->lcp_rotor; lcp->lcp_bitmap[i] == 0;) {
if (++i >= LWPCTL_BITMAP_ENTRIES)
i = 0;
}
bit = ffs(lcp->lcp_bitmap[i]) - 1;
lcp->lcp_bitmap[i] ^= (1U << bit);
lcp->lcp_rotor = i;
lcp->lcp_nfree--;
l->l_lcpage = lcp;
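/*
 * Each bitmap word covers 32 slots, hence the shift by 5 when forming
 * the slot index within the page.
 */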
offset = (i << 5) + bit;
l->l_lwpctl = (lwpctl_t *)lcp->lcp_kaddr + offset;
*uaddr = lcp->lcp_uaddr + offset * sizeof(lwpctl_t);
mutex_exit(&lp->lp_lock);
KPREEMPT_DISABLE(l);
l->l_lwpctl->lc_curcpu = (int)cpu_index(curcpu());
KPREEMPT_ENABLE(l);
return 0;
}
/*
* Free an lwpctl structure back to the per-process list.
*/
void
lwp_ctl_free(lwp_t *l)
{
struct proc *p = l->l_proc;
lcproc_t *lp;
lcpage_t *lcp;
u_int map, offset;
/* don't free a lwp context we borrowed for vfork */
if (p->p_lflag & PL_PPWAIT) {
l->l_lwpctl = NULL;
return;
}
lp = p->p_lwpctl;
KASSERT(lp != NULL);
lcp = l->l_lcpage;
offset = (u_int)((lwpctl_t *)l->l_lwpctl - (lwpctl_t *)lcp->lcp_kaddr);
KASSERT(offset < LWPCTL_PER_PAGE);
mutex_enter(&lp->lp_lock);
lcp->lcp_nfree++;
map = offset >> 5;
lcp->lcp_bitmap[map] |= (1U << (offset & 31));
if (lcp->lcp_bitmap[lcp->lcp_rotor] == 0)
lcp->lcp_rotor = map;
if (TAILQ_FIRST(&lp->lp_pages)->lcp_nfree == 0) {
TAILQ_REMOVE(&lp->lp_pages, lcp, lcp_chain);
TAILQ_INSERT_HEAD(&lp->lp_pages, lcp, lcp_chain);
}
mutex_exit(&lp->lp_lock);
}
/*
* Process is exiting; tear down lwpctl state. This can only be safely
* called by the last LWP in the process.
*/
void
lwp_ctl_exit(void)
{
lcpage_t *lcp, *next;
lcproc_t *lp;
proc_t *p;
lwp_t *l;
l = curlwp;
l->l_lwpctl = NULL;
l->l_lcpage = NULL;
p = l->l_proc;
lp = p->p_lwpctl;
KASSERT(lp != NULL);
KASSERT(p->p_nlwps == 1);
for (lcp = TAILQ_FIRST(&lp->lp_pages); lcp != NULL; lcp = next) {
next = TAILQ_NEXT(lcp, lcp_chain);
uvm_unmap(kernel_map, lcp->lcp_kaddr,
lcp->lcp_kaddr + PAGE_SIZE);
kmem_free(lcp, LWPCTL_LCPAGE_SZ);
}
if (lp->lp_uao != NULL) {
uvm_unmap(&p->p_vmspace->vm_map, lp->lp_uva,
lp->lp_uva + LWPCTL_UAREA_SZ);
}
mutex_destroy(&lp->lp_lock);
kmem_free(lp, sizeof(*lp));
p->p_lwpctl = NULL;
}
/*
* Return the current LWP's "preemption counter". Used to detect
* preemption across operations that can tolerate preemption without
* crashing, but which may generate incorrect results if preempted.
*
* We do arithmetic in unsigned long to avoid undefined behaviour in
* the event of arithmetic overflow on LP32, and issue __insn_barrier()
* on both sides so this can safely be used to detect changes to the
* preemption counter in loops around other memory accesses even in the
* event of whole-program optimization (e.g., gcc -flto).
*/
long
lwp_pctr(void)
{
unsigned long pctr;
__insn_barrier();
pctr = curlwp->l_ru.ru_nvcsw;
pctr += curlwp->l_ru.ru_nivcsw;
__insn_barrier();
return pctr;
}
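/*
 * Illustrative sketch (not part of the kernel proper): the usual
 * consumer pattern is to sample the counter, do work that tolerates
 * but must detect preemption, and retry if the counter moved.
 * compute_from_percpu_state() is a hypothetical placeholder.
 *
 *	long pctr;
 *
 *	do {
 *		pctr = lwp_pctr();
 *		result = compute_from_percpu_state();
 *	} while (pctr != lwp_pctr());
 */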
/*
* Set an LWP's private data pointer.
*/
int
lwp_setprivate(struct lwp *l, void *ptr)
{
int error = 0;
l->l_private = ptr;
#ifdef __HAVE_CPU_LWP_SETPRIVATE
error = cpu_lwp_setprivate(l, ptr);
#endif
return error;
}
/*
* Perform any thread-related cleanup on LWP exit.
* N.B. l->l_proc->p_lock must be HELD on entry but will
* be released before returning!
*/
void
lwp_thread_cleanup(struct lwp *l)
{
KASSERT(mutex_owned(l->l_proc->p_lock));
mutex_exit(l->l_proc->p_lock);
/*
* If the LWP has robust futexes, release them all
* now.
*/
if (__predict_false(l->l_robust_head != 0)) {
futex_release_all_lwp(l);
}
}
#if defined(DDB)
#include <machine/pcb.h>
void
lwp_whatis(uintptr_t addr, void (*pr)(const char *, ...))
{
lwp_t *l;
LIST_FOREACH(l, &alllwp, l_list) {
uintptr_t stack = (uintptr_t)KSTACK_LOWEST_ADDR(l);
if (addr < stack || stack + KSTACK_SIZE <= addr) {
continue;
}
(*pr)("%p is %p+%zu, LWP %p's stack\n",
(void *)addr, (void *)stack,
(size_t)(addr - stack), l);
}
}
#endif /* defined(DDB) */
/* $NetBSD: uvm_object.c,v 1.25 2020/08/15 07:24:09 chs Exp $ */
/*
* Copyright (c) 2006, 2010, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_object.c: operate with memory objects
*
* TODO:
* 1. Support PG_RELEASED-using objects
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_object.c,v 1.25 2020/08/15 07:24:09 chs Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#endif
#include <sys/param.h>
#include <sys/rwlock.h>
#include <sys/queue.h>
#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_page_array.h>
/* Page count to fetch per single step. */
#define FETCH_PAGECOUNT 16
/*
* uvm_obj_init: initialize UVM memory object.
*/
void
uvm_obj_init(struct uvm_object *uo, const struct uvm_pagerops *ops,
bool alock, u_int refs)
{
#if 0 /* notyet */
KASSERT(ops);
#endif
if (alock) {
/* Allocate and assign a lock. */
uo->vmobjlock = rw_obj_alloc();
} else {
/* The lock will need to be set via uvm_obj_setlock(). */
uo->vmobjlock = NULL;
}
uo->pgops = ops;
LIST_INIT(&uo->uo_ubc);
uo->uo_npages = 0;
uo->uo_refs = refs;
radix_tree_init_tree(&uo->uo_pages);
}
/*
* uvm_obj_destroy: destroy UVM memory object.
*/
void
uvm_obj_destroy(struct uvm_object *uo, bool dlock)
{
KASSERT(radix_tree_empty_tree_p(&uo->uo_pages));
/* Purge any UBC entries associated with this object. */
ubc_purge(uo);
/* Destroy the lock, if requested. */
if (dlock) {
rw_obj_free(uo->vmobjlock);
}
radix_tree_fini_tree(&uo->uo_pages);
}
/*
* uvm_obj_setlock: assign a vmobjlock to the UVM object.
*
* => Caller is responsible for ensuring that the UVM object is not in use.
* => Only a dynamic lock may have been previously set; its reference is dropped.
*/
void
uvm_obj_setlock(struct uvm_object *uo, krwlock_t *lockptr)
{
krwlock_t *olockptr = uo->vmobjlock;
if (olockptr) {
/* Drop the reference on the old lock. */
rw_obj_free(olockptr);
}
if (lockptr == NULL) {
/* If new lock is not passed - allocate default one. */
lockptr = rw_obj_alloc();
}
uo->vmobjlock = lockptr;
}
/*
* uvm_obj_wirepages: wire the pages of an entire UVM object.
*
* => NOTE: this function should only be used for types of objects
* where PG_RELEASED flag is never set (aobj objects)
* => caller must pass page-aligned start and end values
*/
int
uvm_obj_wirepages(struct uvm_object *uobj, off_t start, off_t end,
struct pglist *list)
{
int i, npages, error;
struct vm_page *pgs[FETCH_PAGECOUNT], *pg = NULL;
off_t offset = start, left;
left = (end - start) >> PAGE_SHIFT;
rw_enter(uobj->vmobjlock, RW_WRITER);
while (left) {
npages = MIN(FETCH_PAGECOUNT, left);
/* Get the pages */
memset(pgs, 0, sizeof(pgs));
error = (*uobj->pgops->pgo_get)(uobj, offset, pgs, &npages, 0,
VM_PROT_READ | VM_PROT_WRITE, UVM_ADV_SEQUENTIAL,
PGO_SYNCIO);
if (error)
goto error;
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < npages; i++) {
KASSERT(pgs[i] != NULL);
KASSERT(!(pgs[i]->flags & PG_RELEASED));
/*
* Loan break
*/
if (pgs[i]->loan_count) {
while (pgs[i]->loan_count) {
pg = uvm_loanbreak(pgs[i]);
if (!pg) {
rw_exit(uobj->vmobjlock);
uvm_wait("uobjwirepg");
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
}
pgs[i] = pg;
}
if (pgs[i]->flags & PG_AOBJ) {
uvm_pagemarkdirty(pgs[i],
UVM_PAGE_STATUS_DIRTY);
uao_dropswap(uobj, i);
}
}
/* Wire the pages */
for (i = 0; i < npages; i++) {
uvm_pagelock(pgs[i]);
uvm_pagewire(pgs[i]);
uvm_pageunlock(pgs[i]);
if (list != NULL)
TAILQ_INSERT_TAIL(list, pgs[i], pageq.queue);
}
/* Unbusy the pages */
uvm_page_unbusy(pgs, npages);
left -= npages;
offset += npages << PAGE_SHIFT;
}
rw_exit(uobj->vmobjlock);
return 0;
error:
/* Unwire the pages which have been wired */
uvm_obj_unwirepages(uobj, start, offset);
return error;
}
/*
* uvm_obj_unwirepages: unwire the pages of an entire UVM object.
*
* => NOTE: this function should only be used for types of objects
* where PG_RELEASED flag is never set
* => caller must pass page-aligned start and end values
*/
void
uvm_obj_unwirepages(struct uvm_object *uobj, off_t start, off_t end)
{
struct vm_page *pg;
off_t offset;
rw_enter(uobj->vmobjlock, RW_WRITER);
for (offset = start; offset < end; offset += PAGE_SIZE) {
pg = uvm_pagelookup(uobj, offset);
KASSERT(pg != NULL);
KASSERT(!(pg->flags & PG_RELEASED));
uvm_pagelock(pg);
uvm_pageunwire(pg);
uvm_pageunlock(pg);
}
rw_exit(uobj->vmobjlock);
}
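/*
 * Illustrative sketch (not part of the kernel proper): wiring a
 * page-aligned range of an anonymous object and undoing it later.
 * "size" is a hypothetical page-aligned length.
 *
 *	struct uvm_object *uao = uao_create(size, 0);
 *	int error;
 *
 *	error = uvm_obj_wirepages(uao, 0, size, NULL);
 *	if (error == 0) {
 *		... the pages are resident and wired ...
 *		uvm_obj_unwirepages(uao, 0, size);
 *	}
 *	uao_detach(uao);
 */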
static inline bool
uvm_obj_notag_p(struct uvm_object *uobj, int tag)
{
KASSERT(rw_lock_held(uobj->vmobjlock));
return radix_tree_empty_tagged_tree_p(&uobj->uo_pages, tag);
}
bool
uvm_obj_clean_p(struct uvm_object *uobj)
{
return uvm_obj_notag_p(uobj, UVM_PAGE_DIRTY_TAG);
}
bool
uvm_obj_nowriteback_p(struct uvm_object *uobj)
{
return uvm_obj_notag_p(uobj, UVM_PAGE_WRITEBACK_TAG);
}
static inline bool
uvm_obj_page_tag_p(struct vm_page *pg, int tag)
{
struct uvm_object *uobj = pg->uobject;
uint64_t pgidx = pg->offset >> PAGE_SHIFT;
KASSERT(uobj != NULL);
KASSERT(rw_lock_held(uobj->vmobjlock));
return radix_tree_get_tag(&uobj->uo_pages, pgidx, tag) != 0;
}
static inline void
uvm_obj_page_set_tag(struct vm_page *pg, int tag)
{
struct uvm_object *uobj = pg->uobject;
uint64_t pgidx = pg->offset >> PAGE_SHIFT;
KASSERT(uobj != NULL);
KASSERT(rw_write_held(uobj->vmobjlock));
radix_tree_set_tag(&uobj->uo_pages, pgidx, tag);
}
static inline void
uvm_obj_page_clear_tag(struct vm_page *pg, int tag)
{
struct uvm_object *uobj = pg->uobject;
uint64_t pgidx = pg->offset >> PAGE_SHIFT;
KASSERT(uobj != NULL);
KASSERT(rw_write_held(uobj->vmobjlock));
radix_tree_clear_tag(&uobj->uo_pages, pgidx, tag);
}
bool
uvm_obj_page_dirty_p(struct vm_page *pg)
{
return uvm_obj_page_tag_p(pg, UVM_PAGE_DIRTY_TAG);
}
void
uvm_obj_page_set_dirty(struct vm_page *pg)
{
uvm_obj_page_set_tag(pg, UVM_PAGE_DIRTY_TAG);
}
void
uvm_obj_page_clear_dirty(struct vm_page *pg)
{
uvm_obj_page_clear_tag(pg, UVM_PAGE_DIRTY_TAG);
}
bool
uvm_obj_page_writeback_p(struct vm_page *pg)
{
return uvm_obj_page_tag_p(pg, UVM_PAGE_WRITEBACK_TAG);
}
void
uvm_obj_page_set_writeback(struct vm_page *pg)
{
uvm_obj_page_set_tag(pg, UVM_PAGE_WRITEBACK_TAG);
}
void
uvm_obj_page_clear_writeback(struct vm_page *pg)
{
uvm_obj_page_clear_tag(pg, UVM_PAGE_WRITEBACK_TAG);
}
#if defined(DDB) || defined(DEBUGPRINT)
/*
* uvm_object_printit: actually prints the object
*/
void
uvm_object_printit(struct uvm_object *uobj, bool full,
void (*pr)(const char *, ...))
{
struct uvm_page_array a;
struct vm_page *pg;
int cnt = 0;
voff_t off;
(*pr)("OBJECT %p: locked=%d, pgops=%p, npages=%d, ",
uobj, rw_write_held(uobj->vmobjlock), uobj->pgops, uobj->uo_npages);
if (UVM_OBJ_IS_KERN_OBJECT(uobj))
(*pr)("refs=<SYSTEM>\n");
else
(*pr)("refs=%d\n", uobj->uo_refs);
if (!full) {
return;
}
(*pr)(" PAGES <pg,offset>:\n ");
uvm_page_array_init(&a, uobj, 0);
off = 0;
while ((pg = uvm_page_array_fill_and_peek(&a, off, 0)) != NULL) {
cnt++;
(*pr)("<%p,0x%llx> ", pg, (long long)pg->offset);
if ((cnt % 3) == 0) {
(*pr)("\n ");
}
off = pg->offset + PAGE_SIZE;
uvm_page_array_advance(&a);
}
if ((cnt % 3) != 0) {
(*pr)("\n");
}
uvm_page_array_fini(&a);
}
#endif /* DDB || DEBUGPRINT */
/* $NetBSD: subr_ipi.c,v 1.11 2023/02/24 11:02:27 riastradh Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Inter-processor interrupt (IPI) interface: asynchronous IPIs to
* invoke functions with a constant argument and synchronous IPIs
* with the cross-call support.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_ipi.c,v 1.11 2023/02/24 11:02:27 riastradh Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/evcnt.h>
#include <sys/cpu.h>
#include <sys/ipi.h>
#include <sys/intr.h>
#include <sys/kcpuset.h>
#include <sys/kmem.h>
#include <sys/lock.h>
#include <sys/mutex.h>
/*
* An array of the IPI handlers used for asynchronous invocation.
* The lock protects the slot allocation.
*/
typedef struct {
ipi_func_t func;
void * arg;
} ipi_intr_t;
static kmutex_t ipi_mngmt_lock;
static ipi_intr_t ipi_intrs[IPI_MAXREG] __cacheline_aligned;
/*
* Per-CPU mailbox for IPI messages: it is a single cache line storing
* up to IPI_MSG_MAX messages. This interface is built on top of the
* synchronous IPIs.
*/
#define IPI_MSG_SLOTS (CACHE_LINE_SIZE / sizeof(ipi_msg_t *))
#define IPI_MSG_MAX IPI_MSG_SLOTS
typedef struct {
ipi_msg_t * msg[IPI_MSG_SLOTS];
} ipi_mbox_t;
/* Mailboxes for the synchronous IPIs. */
static ipi_mbox_t * ipi_mboxes __read_mostly;
static struct evcnt ipi_mboxfull_ev __cacheline_aligned;
static void ipi_msg_cpu_handler(void *);
/* Handler for the synchronous IPIs - it must be zero. */
#define IPI_SYNCH_ID 0
#ifndef MULTIPROCESSOR
#define cpu_ipi(ci) KASSERT(ci == NULL)
#endif
void
ipi_sysinit(void)
{
mutex_init(&ipi_mngmt_lock, MUTEX_DEFAULT, IPL_NONE);
memset(ipi_intrs, 0, sizeof(ipi_intrs));
/*
* Register the handler for synchronous IPIs. This mechanism
* is built on top of the asynchronous interface. Slot zero is
* reserved permanently; it is also handy to use zero as a failure
* value for other registrations (as that is potentially less error-prone).
*/
ipi_intrs[IPI_SYNCH_ID].func = ipi_msg_cpu_handler;
evcnt_attach_dynamic(&ipi_mboxfull_ev, EVCNT_TYPE_MISC, NULL,
"ipi", "full");
}
void
ipi_percpu_init(void)
{
const size_t len = ncpu * sizeof(ipi_mbox_t);
/* Initialise the per-CPU bit fields. */
for (u_int i = 0; i < ncpu; i++) {
struct cpu_info *ci = cpu_lookup(i);
memset(&ci->ci_ipipend, 0, sizeof(ci->ci_ipipend));
}
/* Allocate per-CPU IPI mailboxes. */
ipi_mboxes = kmem_zalloc(len, KM_SLEEP);
KASSERT(ipi_mboxes != NULL);
}
/*
* ipi_register: register an asynchronous IPI handler.
*
* => Returns IPI ID which is greater than zero; on failure - zero.
*/
u_int
ipi_register(ipi_func_t func, void *arg)
{
mutex_enter(&ipi_mngmt_lock);
for (u_int i = 0; i < IPI_MAXREG; i++) {
if (ipi_intrs[i].func == NULL) {
/* Register the function. */
ipi_intrs[i].func = func;
ipi_intrs[i].arg = arg;
mutex_exit(&ipi_mngmt_lock);
KASSERT(i != IPI_SYNCH_ID);
return i;
}
}
mutex_exit(&ipi_mngmt_lock);
printf("WARNING: ipi_register: table full, increase IPI_MAXREG\n");
return 0;
}
/*
* ipi_unregister: release the IPI handler given the ID.
*/
void
ipi_unregister(u_int ipi_id)
{
ipi_msg_t ipimsg = { .func = __FPTRCAST(ipi_func_t, nullop) };
KASSERT(ipi_id != IPI_SYNCH_ID);
KASSERT(ipi_id < IPI_MAXREG);
/* Release the slot. */
mutex_enter(&ipi_mngmt_lock);
KASSERT(ipi_intrs[ipi_id].func != NULL);
ipi_intrs[ipi_id].func = NULL;
/* Ensure that there are no IPIs in flight. */
kpreempt_disable();
ipi_broadcast(&ipimsg, false);
ipi_wait(&ipimsg);
kpreempt_enable();
mutex_exit(&ipi_mngmt_lock);
}
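/*
 * Illustrative sketch (not part of the kernel proper) of the
 * asynchronous interface: register a handler once, trigger it on a
 * remote CPU with preemption disabled, and release the slot when done.
 * my_ipi_handler and its argument are hypothetical placeholders.
 *
 *	static void
 *	my_ipi_handler(void *arg)
 *	{
 *		... runs on the target CPU from ipi_cpu_handler() ...
 *	}
 *
 *	u_int id = ipi_register(my_ipi_handler, arg);
 *
 *	kpreempt_disable();
 *	ipi_trigger(id, ci);	... ci must not be curcpu() ...
 *	kpreempt_enable();
 *
 *	ipi_unregister(id);
 */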
/*
* ipi_mark_pending: internal routine to mark an IPI pending on the
* specified CPU (which might be curcpu()).
*/
static bool
ipi_mark_pending(u_int ipi_id, struct cpu_info *ci)
{
const u_int i = ipi_id >> IPI_BITW_SHIFT;
const uint32_t bitm = 1U << (ipi_id & IPI_BITW_MASK);
KASSERT(ipi_id < IPI_MAXREG);
KASSERT(kpreempt_disabled());
/* Mark as pending and return true if not previously marked. */
if ((atomic_load_acquire(&ci->ci_ipipend[i]) & bitm) == 0) {
membar_release();
atomic_or_32(&ci->ci_ipipend[i], bitm);
return true;
}
return false;
}
/*
* ipi_trigger: asynchronously send an IPI to the specified CPU.
*/
void
ipi_trigger(u_int ipi_id, struct cpu_info *ci)
{
KASSERT(curcpu() != ci);
if (ipi_mark_pending(ipi_id, ci)) {
cpu_ipi(ci);
}
}
/*
* ipi_trigger_multi_internal: the guts of ipi_trigger_multi() and
* ipi_trigger_broadcast().
*/
static void
ipi_trigger_multi_internal(u_int ipi_id, const kcpuset_t *target,
bool skip_self)
{
const cpuid_t selfid = cpu_index(curcpu());
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
KASSERT(kpreempt_disabled());
KASSERT(target != NULL);
for (CPU_INFO_FOREACH(cii, ci)) {
const cpuid_t cpuid = cpu_index(ci);
if (!kcpuset_isset(target, cpuid) || cpuid == selfid) {
continue;
}
ipi_trigger(ipi_id, ci);
}
if (!skip_self && kcpuset_isset(target, selfid)) {
ipi_mark_pending(ipi_id, curcpu());
int s = splhigh();
ipi_cpu_handler();
splx(s);
}
}
/*
* ipi_trigger_multi: same as ipi_trigger() but sends to the multiple
* CPUs given the target CPU set.
*/
void
ipi_trigger_multi(u_int ipi_id, const kcpuset_t *target)
{
ipi_trigger_multi_internal(ipi_id, target, false);
}
/*
* ipi_trigger_broadcast: same as ipi_trigger_multi() to kcpuset_attached,
* optionally skipping the sending CPU.
*/
void
ipi_trigger_broadcast(u_int ipi_id, bool skip_self)
{
ipi_trigger_multi_internal(ipi_id, kcpuset_attached, skip_self);
}
/*
* put_msg: insert message into the mailbox.
*
* Caller is responsible for issuing membar_release first.
*/
static inline void
put_msg(ipi_mbox_t *mbox, ipi_msg_t *msg)
{
int count = SPINLOCK_BACKOFF_MIN;
again:
for (u_int i = 0; i < IPI_MSG_MAX; i++) {
if (atomic_cas_ptr(&mbox->msg[i], NULL, msg) == NULL) {
return;
}
}
/* All slots are full: we have to spin-wait. */
ipi_mboxfull_ev.ev_count++;
SPINLOCK_BACKOFF(count);
goto again;
}
/*
* ipi_cpu_handler: the IPI handler.
*/
void
ipi_cpu_handler(void)
{
struct cpu_info * const ci = curcpu();
/*
* Handle asynchronous IPIs: inspect per-CPU bit field, extract
* IPI ID numbers and execute functions in those slots.
*/
for (u_int i = 0; i < IPI_BITWORDS; i++) {
uint32_t pending, bit;
if (atomic_load_relaxed(&ci->ci_ipipend[i]) == 0) {
continue;
}
pending = atomic_swap_32(&ci->ci_ipipend[i], 0);
membar_acquire();
while ((bit = ffs(pending)) != 0) {
const u_int ipi_id = (i << IPI_BITW_SHIFT) | --bit;
ipi_intr_t *ipi_hdl = &ipi_intrs[ipi_id];
pending &= ~(1U << bit);
KASSERT(ipi_hdl->func != NULL);
ipi_hdl->func(ipi_hdl->arg);
}
}
}
/*
* ipi_msg_cpu_handler: handle synchronous IPIs - iterate mailbox,
* execute the passed functions and acknowledge the messages.
*/
static void
ipi_msg_cpu_handler(void *arg __unused)
{
const struct cpu_info * const ci = curcpu();
ipi_mbox_t *mbox = &ipi_mboxes[cpu_index(ci)];
for (u_int i = 0; i < IPI_MSG_MAX; i++) {
ipi_msg_t *msg;
/* Get the message. */
if ((msg = atomic_load_acquire(&mbox->msg[i])) == NULL) {
continue;
}
atomic_store_relaxed(&mbox->msg[i], NULL);
/* Execute the handler. */
KASSERT(msg->func);
msg->func(msg->arg);
/* Ack the request. */
membar_release();
atomic_dec_uint(&msg->_pending);
}
}
/*
* ipi_unicast: send an IPI to a single CPU.
*
* => The CPU must be remote; must not be local.
* => The caller must ipi_wait() on the message for completion.
*/
void
ipi_unicast(ipi_msg_t *msg, struct cpu_info *ci)
{
const cpuid_t id = cpu_index(ci);
KASSERT(msg->func != NULL);
KASSERT(kpreempt_disabled());
KASSERT(curcpu() != ci);
msg->_pending = 1;
membar_release();
put_msg(&ipi_mboxes[id], msg);
ipi_trigger(IPI_SYNCH_ID, ci);
}
/*
* ipi_multicast: send an IPI to each CPU in the specified set.
*
* => The caller must ipi_wait() on the message for completion.
*/
void
ipi_multicast(ipi_msg_t *msg, const kcpuset_t *target)
{
const struct cpu_info * const self = curcpu();
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
u_int local;
KASSERT(msg->func != NULL);
KASSERT(kpreempt_disabled());
local = !!kcpuset_isset(target, cpu_index(self));
msg->_pending = kcpuset_countset(target) - local;
membar_release();
for (CPU_INFO_FOREACH(cii, ci)) {
cpuid_t id;
if (__predict_false(ci == self)) {
continue;
}
id = cpu_index(ci);
if (!kcpuset_isset(target, id)) {
continue;
}
put_msg(&ipi_mboxes[id], msg);
ipi_trigger(IPI_SYNCH_ID, ci);
}
if (local) {
msg->func(msg->arg);
}
}
/*
* ipi_broadcast: send an IPI to all CPUs.
*
* => The caller must ipi_wait() on the message for completion.
*/
void
ipi_broadcast(ipi_msg_t *msg, bool skip_self)
{
const struct cpu_info * const self = curcpu();
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
KASSERT(msg->func != NULL);
KASSERT(kpreempt_disabled());
msg->_pending = ncpu - 1;
membar_release();
/* Broadcast IPIs for remote CPUs. */
for (CPU_INFO_FOREACH(cii, ci)) {
cpuid_t id;
if (__predict_false(ci == self)) {
continue;
}
id = cpu_index(ci);
put_msg(&ipi_mboxes[id], msg);
ipi_trigger(IPI_SYNCH_ID, ci);
}
if (!skip_self) {
/* Finally, execute locally. */
msg->func(msg->arg);
}
}
/*
* ipi_wait: spin-wait until the message is processed.
*/
void
ipi_wait(ipi_msg_t *msg)
{
int count = SPINLOCK_BACKOFF_MIN;
while (atomic_load_acquire(&msg->_pending)) {
KASSERT(atomic_load_relaxed(&msg->_pending) < ncpu);
SPINLOCK_BACKOFF(count);
}
}
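/*
 * Illustrative sketch (not part of the kernel proper) of a synchronous
 * cross-call to one remote CPU using the message interface above.
 * do_on_cpu() and cookie are hypothetical placeholders.
 *
 *	ipi_msg_t msg = { .func = do_on_cpu, .arg = cookie };
 *
 *	kpreempt_disable();
 *	ipi_unicast(&msg, ci);	... ci must not be curcpu() ...
 *	ipi_wait(&msg);
 *	kpreempt_enable();
 */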
/* $NetBSD: kern_runq.c,v 1.70 2023/09/19 22:15:32 ad Exp $ */
/*-
* Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.70 2023/09/19 22:15:32 ad Exp $");
#include "opt_dtrace.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>
#include <sys/atomic.h>
/*
* Bits per map.
*/
#define BITMAP_BITS (32)
#define BITMAP_SHIFT (5)
#define BITMAP_MSB (0x80000000U)
#define BITMAP_MASK (BITMAP_BITS - 1)
const int schedppq = 1;
static void *sched_getrq(struct schedstate_percpu *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t * sched_catchlwp(struct cpu_info *);
#endif
/*
* Preemption control.
*/
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int sched_kpreempt_pri = 0;
# else
int sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
int sched_kpreempt_pri = 1000;
#endif
/*
* Migration and balancing.
*/
static u_int cacheht_time; /* Cache hotness time */
static u_int min_catch; /* Minimal LWP count for catching */
static u_int skim_interval; /* Rate limit for stealing LWPs */
#ifdef KDTRACE_HOOKS
struct lwp *curthread;
#endif
void
runq_init(void)
{
/* Pulling from remote packages, LWP must not have run for 10ms. */
cacheht_time = 10;
/* Minimal count of LWPs for catching */
min_catch = 1;
/* Steal from other CPUs at most every 10ms. */
skim_interval = 10;
}
void
sched_cpuattach(struct cpu_info *ci)
{
struct schedstate_percpu *spc;
size_t size;
void *p;
u_int i;
spc = &ci->ci_schedstate;
spc->spc_nextpkg = ci;
if (spc->spc_lwplock == NULL) {
spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
}
if (ci == lwp0.l_cpu) {
/* Initialize the scheduler structure of the primary LWP */
lwp0.l_mutex = spc->spc_lwplock;
}
if (spc->spc_mutex != NULL) {
/* Already initialized. */
return;
}
/* Allocate the run queue */
size = roundup2(sizeof(spc->spc_queue[0]) * PRI_COUNT, coherency_unit) +
coherency_unit;
p = kmem_alloc(size, KM_SLEEP);
spc->spc_queue = (void *)roundup2((uintptr_t)p, coherency_unit);
/* Initialize run queues */
spc->spc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
for (i = 0; i < PRI_COUNT; i++)
TAILQ_INIT(&spc->spc_queue[i]);
}
/*
* Control of the runqueue.
*/
static inline void *
sched_getrq(struct schedstate_percpu *spc, const pri_t prio)
{
KASSERT(prio < PRI_COUNT);
return &spc->spc_queue[prio];
}
/*
* Put an LWP onto a run queue. The LWP must be locked by spc_mutex for
* l_cpu.
*/
void
sched_enqueue(struct lwp *l)
{
struct schedstate_percpu *spc;
TAILQ_HEAD(, lwp) *q_head;
const pri_t eprio = lwp_eprio(l);
struct cpu_info *ci;
ci = l->l_cpu;
spc = &ci->ci_schedstate;
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
/* Enqueue the thread */
q_head = sched_getrq(spc, eprio);
if (TAILQ_EMPTY(q_head)) {
u_int i;
uint32_t q;
/* Mark bit */
i = eprio >> BITMAP_SHIFT;
q = BITMAP_MSB >> (eprio & BITMAP_MASK);
KASSERT((spc->spc_bitmap[i] & q) == 0);
spc->spc_bitmap[i] |= q;
}
/*
* Determine run queue position according to POSIX. XXX Explicitly
* lowering a thread's priority with pthread_setschedparam() is not
* handled.
*/
if ((l->l_pflag & LP_PREEMPTING) != 0) {
switch (l->l_class) {
case SCHED_OTHER:
TAILQ_INSERT_TAIL(q_head, l, l_runq);
break;
case SCHED_FIFO:
TAILQ_INSERT_HEAD(q_head, l, l_runq);
break;
case SCHED_RR:
if (getticks() - l->l_rticks >= sched_rrticks) {
TAILQ_INSERT_TAIL(q_head, l, l_runq);
} else {
TAILQ_INSERT_HEAD(q_head, l, l_runq);
}
break;
default:
panic("sched_enqueue: LWP %p has class %d\n",
l, l->l_class);
}
} else {
TAILQ_INSERT_TAIL(q_head, l, l_runq);
}
spc->spc_flags &= ~SPCF_IDLE;
spc->spc_count++;
if ((l->l_pflag & LP_BOUND) == 0) {
atomic_store_relaxed(&spc->spc_mcount,
atomic_load_relaxed(&spc->spc_mcount) + 1);
}
/*
* Update the value of highest priority in the runqueue,
* if priority of this thread is higher.
*/
if (eprio > spc->spc_maxpriority)
spc->spc_maxpriority = eprio;
sched_newts(l);
}
/*
* Remove an LWP from the run queue it's on. The LWP must be in state
* LSRUN.
*/
void
sched_dequeue(struct lwp *l)
{
TAILQ_HEAD(, lwp) *q_head;
struct schedstate_percpu *spc;
const pri_t eprio = lwp_eprio(l);
spc = &l->l_cpu->ci_schedstate;
KASSERT(lwp_locked(l, spc->spc_mutex));
KASSERT(eprio <= spc->spc_maxpriority);
KASSERT(spc->spc_bitmap[eprio >> BITMAP_SHIFT] != 0);
KASSERT(spc->spc_count > 0);
if (spc->spc_migrating == l)
spc->spc_migrating = NULL;
spc->spc_count--;
if ((l->l_pflag & LP_BOUND) == 0) {
atomic_store_relaxed(&spc->spc_mcount,
atomic_load_relaxed(&spc->spc_mcount) - 1);
}
q_head = sched_getrq(spc, eprio);
TAILQ_REMOVE(q_head, l, l_runq);
if (TAILQ_EMPTY(q_head)) {
u_int i;
uint32_t q;
/* Unmark bit */
i = eprio >> BITMAP_SHIFT;
q = BITMAP_MSB >> (eprio & BITMAP_MASK);
KASSERT((spc->spc_bitmap[i] & q) != 0);
spc->spc_bitmap[i] &= ~q;
/*
* Update the value of the highest priority in the runqueue, in
* case it was the last thread in the queue of highest priority.
*/
if (eprio != spc->spc_maxpriority)
return;
do {
if (spc->spc_bitmap[i] != 0) {
q = ffs(spc->spc_bitmap[i]);
spc->spc_maxpriority =
(i << BITMAP_SHIFT) + (BITMAP_BITS - q);
return;
}
} while (i--);
/* If not found - set the lowest value */
spc->spc_maxpriority = 0;
}
}
/*
* Cause a preemption on the given CPU, if the priority "pri" is higher
* priority than the running LWP. If "unlock" is specified, and ideally it
* will be for concurrency reasons, spc_mutex will be dropped before return.
*/
void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{
struct schedstate_percpu *spc;
u_int o, n, f;
lwp_t *l;
spc = &ci->ci_schedstate;
KASSERT(mutex_owned(spc->spc_mutex));
/*
* If the priority level we're evaluating wouldn't cause a new LWP
* to be run on the CPU, then we have nothing to do.
*/
if (pri <= spc->spc_curpriority || !mp_online) {
if (__predict_true(unlock)) {
spc_unlock(ci);
}
return;
}
/*
* Figure out what kind of preemption we should do.
*/
l = ci->ci_onproc;
if ((l->l_flag & LW_IDLE) != 0) {
f = RESCHED_IDLE | RESCHED_UPREEMPT;
} else if (pri >= sched_kpreempt_pri && (l->l_pflag & LP_INTR) == 0) {
/* We can't currently preempt softints - should be able to. */
#ifdef __HAVE_PREEMPTION
f = RESCHED_KPREEMPT;
#else
/* Leave door open for test: set kpreempt_pri with sysctl. */
f = RESCHED_UPREEMPT;
#endif
/*
* l_dopreempt must be set with the CPU locked to sync with
* mi_switch(). It must also be set with an atomic to sync
* with kpreempt().
*/
atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
} else {
f = RESCHED_UPREEMPT;
}
if (ci != curcpu()) {
f |= RESCHED_REMOTE;
}
/*
* Things can start as soon as ci_want_resched is touched: x86 has
* an instruction that monitors the memory cell it's in. Drop the
* schedstate lock in advance, otherwise the remote CPU can awaken
* and immediately block on the lock.
*/
if (__predict_true(unlock)) {
spc_unlock(ci);
}
/*
* The caller almost always has a second scheduler lock held: either
* the running LWP lock (spc_lwplock), or a sleep queue lock. That
* keeps preemption disabled, which among other things ensures all
* LWPs involved won't be freed while we're here (see lwp_dtor()).
*/
KASSERT(kpreempt_disabled());
for (o = 0;; o = n) {
n = atomic_cas_uint(&ci->ci_want_resched, o, o | f);
if (__predict_true(o == n)) {
/*
* We're the first to set a resched on the CPU. Try
* to avoid causing a needless trip through trap()
* to handle an AST fault, if it's known the LWP
* will either block or go through userret() soon.
*/
if (l != curlwp || cpu_intr_p()) {
cpu_need_resched(ci, l, f);
}
break;
}
if (__predict_true(
(n & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)) >=
(f & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)))) {
/* Already in progress, nothing to do. */
break;
}
}
}
/*
* Cause a preemption on the given CPU, if the priority of LWP "l" in state
* LSRUN, is higher priority than the running LWP. If "unlock" is
* specified, and ideally it will be for concurrency reasons, spc_mutex will
* be dropped before return.
*/
void
sched_resched_lwp(struct lwp *l, bool unlock)
{
struct cpu_info *ci = l->l_cpu;
KASSERT(lwp_locked(l, ci->ci_schedstate.spc_mutex));
KASSERT(l->l_stat == LSRUN);
sched_resched_cpu(ci, lwp_eprio(l), unlock);
}
/*
* Migration and balancing.
*/
#ifdef MULTIPROCESSOR
/*
* Estimate if LWP is cache-hot.
*/
static inline bool
lwp_cache_hot(const struct lwp *l)
{
/* Leave new LWPs in peace, determination has already been made. */
if (l->l_stat == LSIDL)
return true;
if (__predict_false(l->l_slptime != 0 || l->l_rticks == 0))
return false;
return (getticks() - l->l_rticks < mstohz(cacheht_time));
}
/*
* Check if LWP can migrate to the chosen CPU.
*/
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
const struct schedstate_percpu *spc = &ci->ci_schedstate;
KASSERT(lwp_locked(__UNCONST(l), NULL));
/* Is CPU offline? */
if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
return false;
/* Is affinity set? */
if (__predict_false(l->l_affinity))
return kcpuset_isset(l->l_affinity, cpu_index(ci));
/* Is there a processor-set? */
return (spc->spc_psid == l->l_psid);
}
/*
* A small helper to do round robin through CPU packages.
*/
static struct cpu_info *
sched_nextpkg(void)
{
struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
spc->spc_nextpkg =
spc->spc_nextpkg->ci_sibling[CPUREL_PACKAGE1ST];
return spc->spc_nextpkg;
}
/*
* Find a CPU to run LWP "l". Look for the CPU with the lowest priority
* thread. In case of equal priority, prefer first class CPUs, and amongst
* the remainder choose the CPU with the fewest runqueue entries.
*
* Begin the search in the CPU package which "pivot" is a member of.
*/
static struct cpu_info * __noinline
sched_bestcpu(struct lwp *l, struct cpu_info *pivot)
{
struct cpu_info *bestci, *curci, *outer;
struct schedstate_percpu *bestspc, *curspc;
pri_t bestpri, curpri;
/*
* If this fails (it shouldn't), run on the given CPU. This also
* gives us a weak preference for "pivot" to begin with.
*/
bestci = pivot;
bestspc = &bestci->ci_schedstate;
if (sched_migratable(l, bestci)) {
bestpri = MAX(bestspc->spc_curpriority,
bestspc->spc_maxpriority);
} else {
/* Invalidate the priority. */
bestpri = PRI_COUNT;
}
/* In the outer loop scroll through all CPU packages. */
pivot = pivot->ci_package1st;
outer = pivot;
do {
/* In the inner loop scroll through all CPUs in package. */
curci = outer;
do {
if (!sched_migratable(l, curci)) {
continue;
}
curspc = &curci->ci_schedstate;
/* If this CPU is idle and 1st class, we're done. */
if ((curspc->spc_flags & (SPCF_IDLE | SPCF_1STCLASS)) ==
(SPCF_IDLE | SPCF_1STCLASS)) {
return curci;
}
curpri = MAX(curspc->spc_curpriority,
curspc->spc_maxpriority);
if (curpri > bestpri) {
continue;
}
if (curpri == bestpri) {
/* Prefer first class CPUs over others. */
if ((curspc->spc_flags & SPCF_1STCLASS) == 0 &&
(bestspc->spc_flags & SPCF_1STCLASS) != 0) {
continue;
}
/*
* Pick the least busy CPU. Make sure this is not
* <=, otherwise it defeats the above preference.
*/
if (bestspc->spc_count < curspc->spc_count) {
continue;
}
}
bestpri = curpri;
bestci = curci;
bestspc = curspc;
} while (curci = curci->ci_sibling[CPUREL_PACKAGE],
curci != outer);
} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
outer != pivot);
return bestci;
}
/*
* Estimate the migration of LWP to the other CPU.
* Take and return the CPU, if migration is needed.
*/
struct cpu_info *
sched_takecpu(struct lwp *l)
{
struct schedstate_percpu *spc, *tspc;
struct cpu_info *ci, *curci, *tci;
pri_t eprio;
int flags;
KASSERT(lwp_locked(l, NULL));
/* If thread is strictly bound, do not estimate other CPUs */
ci = l->l_cpu;
if (l->l_pflag & LP_BOUND)
return ci;
spc = &ci->ci_schedstate;
eprio = lwp_eprio(l);
/*
* Handle new LWPs. For vfork() with a timeshared child, make it
* run on the same CPU as the parent if no other LWPs in queue.
* Otherwise scatter far and wide - try for an even distribution
* across all CPU packages and CPUs.
*/
if (l->l_stat == LSIDL) {
if (curlwp->l_vforkwaiting && l->l_class == SCHED_OTHER) {
if (sched_migratable(l, curlwp->l_cpu) && eprio >
curlwp->l_cpu->ci_schedstate.spc_maxpriority) {
return curlwp->l_cpu;
}
} else {
return sched_bestcpu(l, sched_nextpkg());
}
flags = SPCF_IDLE;
} else {
flags = SPCF_IDLE | SPCF_1STCLASS;
}
/*
* Try to send the LWP back to the first CPU in the same core if
* idle. This keeps LWPs clustered in the run queues of 1st class
* CPUs. This implies stickiness. If we didn't find a home for
* a vfork() child above, try to use any SMT sibling to help out.
*/
tci = ci;
do {
tspc = &tci->ci_schedstate;
if ((tspc->spc_flags & flags) == flags && sched_migratable(l, tci)) {
return tci;
}
tci = tci->ci_sibling[CPUREL_CORE];
} while (tci != ci);
/*
* Otherwise the LWP is "sticky", i.e. generally preferring to stay
* on the same CPU.
*/
if (sched_migratable(l, ci) && (eprio > spc->spc_curpriority ||
(lwp_cache_hot(l) && l->l_class == SCHED_OTHER))) {
return ci;
}
/*
* If the current CPU core is idle, run there and avoid the
* expensive scan of CPUs below.
*/
curci = curcpu();
tci = curci;
do {
tspc = &tci->ci_schedstate;
if ((tspc->spc_flags & flags) == flags && sched_migratable(l, tci)) {
return tci;
}
tci = tci->ci_sibling[CPUREL_CORE];
} while (tci != curci);
/*
* Didn't find a new home above - happens infrequently. Start the
* search in last CPU package that the LWP ran in, but expand to
* include the whole system if needed.
*/
return sched_bestcpu(l, l->l_cpu);
}
/*
* Tries to catch an LWP from the runqueue of another CPU.
*/
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
struct cpu_info *curci = curcpu();
struct schedstate_percpu *spc, *curspc;
TAILQ_HEAD(, lwp) *q_head;
struct lwp *l;
bool gentle;
curspc = &curci->ci_schedstate;
spc = &ci->ci_schedstate;
/*
* Be more aggressive if this CPU is first class, and the other
* is not.
*/
gentle = ((curspc->spc_flags & SPCF_1STCLASS) == 0 ||
(spc->spc_flags & SPCF_1STCLASS) != 0);
if (atomic_load_relaxed(&spc->spc_mcount) < (gentle ? min_catch : 1) ||
curspc->spc_psid != spc->spc_psid) {
spc_unlock(ci);
return NULL;
}
/* Take the highest priority thread */
q_head = sched_getrq(spc, spc->spc_maxpriority);
l = TAILQ_FIRST(q_head);
for (;;) {
/* Check the first and next result from the queue */
if (l == NULL) {
break;
}
KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
ci->ci_data.cpu_name,
l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);
/* Look for threads that are allowed to migrate. */
if ((l->l_pflag & LP_BOUND) ||
(gentle && lwp_cache_hot(l)) ||
!sched_migratable(l, curci)) {
l = TAILQ_NEXT(l, l_runq);
/* XXX Gap: could walk down priority list. */
continue;
}
/* Grab the thread, and move to the local run queue */
sched_dequeue(l);
l->l_cpu = curci;
lwp_unlock_to(l, curspc->spc_mutex);
sched_enqueue(l);
return l;
}
spc_unlock(ci);
return l;
}
/*
* Called from sched_idle() to handle migration. Return the CPU that we
* pushed the LWP to (may be NULL).
*/
static struct cpu_info *
sched_idle_migrate(void)
{
struct cpu_info *ci = curcpu(), *tci = NULL;
struct schedstate_percpu *spc, *tspc;
bool dlock = false;
spc = &ci->ci_schedstate;
spc_lock(ci);
for (;;) {
struct lwp *l;
l = spc->spc_migrating;
if (l == NULL)
break;
/*
* If second attempt, and target CPU has changed,
* drop the old lock.
*/
if (dlock == true && tci != l->l_target_cpu) {
KASSERT(tci != NULL);
spc_unlock(tci);
dlock = false;
}
/*
* Nothing to do if destination has changed to the
* local CPU, or migration was done by other CPU.
*/
tci = l->l_target_cpu;
if (tci == NULL || tci == ci) {
spc->spc_migrating = NULL;
l->l_target_cpu = NULL;
break;
}
tspc = &tci->ci_schedstate;
/*
* Double-lock the runqueues.
* We do that only once.
*/
if (dlock == false) {
dlock = true;
if (ci < tci) {
spc_lock(tci);
} else if (!mutex_tryenter(tspc->spc_mutex)) {
spc_unlock(ci);
spc_lock(tci);
spc_lock(ci);
/* Check the situation again.. */
continue;
}
}
/* Migrate the thread */
KASSERT(l->l_stat == LSRUN);
spc->spc_migrating = NULL;
l->l_target_cpu = NULL;
sched_dequeue(l);
l->l_cpu = tci;
lwp_setlock(l, tspc->spc_mutex);
sched_enqueue(l);
sched_resched_lwp(l, true);
/* tci now unlocked */
spc_unlock(ci);
return tci;
}
if (dlock == true) {
KASSERT(tci != NULL);
spc_unlock(tci);
}
spc_unlock(ci);
return NULL;
}
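/*
 * The double-lock above avoids deadlock by ordering the two run queue
 * locks: when the local CPU sorts below the target it can simply block on
 * the second lock, otherwise it must trylock and, on failure, release its
 * own lock and retake both in order before re-checking the migration
 * state. The same discipline in a standalone pthread sketch (hypothetical
 * helper, not the kernel's spc_dlock()):
 */
#if 0	/* illustrative sketch only; not compiled */
#include <pthread.h>

/* Acquire "other" while already holding "mine", without deadlocking. */
static void
lock_second(pthread_mutex_t *mine, pthread_mutex_t *other)
{

	if (mine < other) {
		/* Already holding the lower-ordered lock: safe to block. */
		pthread_mutex_lock(other);
	} else if (pthread_mutex_trylock(other) != 0) {
		/* Wrong order and contended: back off and retake in order. */
		pthread_mutex_unlock(mine);
		pthread_mutex_lock(other);
		pthread_mutex_lock(mine);
		/* Caller must re-validate any state read before backing off. */
	}
}
#endif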
/*
* Try to steal an LWP from "tci".
*/
static bool
sched_steal(struct cpu_info *ci, struct cpu_info *tci)
{
struct schedstate_percpu *spc, *tspc;
lwp_t *l;
spc = &ci->ci_schedstate;
tspc = &tci->ci_schedstate;
if (atomic_load_relaxed(&tspc->spc_mcount) != 0 &&
spc->spc_psid == tspc->spc_psid) {
spc_dlock(ci, tci);
l = sched_catchlwp(tci);
spc_unlock(ci);
if (l != NULL) {
return true;
}
}
return false;
}
/*
* Called from each CPU's idle loop.
*/
void
sched_idle(void)
{
struct cpu_info *ci, *inner, *outer, *first, *tci, *mci;
struct schedstate_percpu *spc, *tspc;
struct lwp *l;
ci = curcpu();
spc = &ci->ci_schedstate;
tci = NULL;
mci = NULL;
/*
* Handle LWP migrations off this CPU to another. If there is a
* migration to do then remember the CPU the LWP was sent to, and
* don't steal the LWP back from that CPU below.
*/
if (spc->spc_migrating != NULL) {
mci = sched_idle_migrate();
}
/* If this CPU is offline, or we have an LWP to run, we're done. */
if ((spc->spc_flags & SPCF_OFFLINE) != 0 || spc->spc_count != 0) {
return;
}
/* Deal with SMT. */
if (ci->ci_nsibling[CPUREL_CORE] > 1) {
/* Try to help our siblings out. */
tci = ci->ci_sibling[CPUREL_CORE];
while (tci != ci) {
if (tci != mci && sched_steal(ci, tci)) {
return;
}
tci = tci->ci_sibling[CPUREL_CORE];
}
/*
* If not the first SMT in the core, and in the default
* processor set, the search ends here.
*/
if ((spc->spc_flags & SPCF_1STCLASS) == 0 &&
spc->spc_psid == PS_NONE) {
return;
}
}
/*
* Find something to run, unless this CPU exceeded the rate limit.
* Start looking on the current package to maximise L2/L3 cache
* locality. Then expand to looking at the rest of the system.
*
* XXX Should probably look at 2nd class CPUs first, but they will
* shed jobs via preempt() anyway.
*/
if (spc->spc_nextskim > getticks()) {
return;
}
spc->spc_nextskim = getticks() + mstohz(skim_interval);
/* In the outer loop scroll through all CPU packages, starting here. */
first = ci->ci_package1st;
outer = first;
do {
/* In the inner loop scroll through all CPUs in package. */
inner = outer;
do {
/* Don't hit the locks unless needed. */
tspc = &inner->ci_schedstate;
if (ci == inner || ci == mci ||
spc->spc_psid != tspc->spc_psid ||
atomic_load_relaxed(&tspc->spc_mcount) < min_catch) {
continue;
}
spc_dlock(ci, inner);
l = sched_catchlwp(inner);
spc_unlock(ci);
if (l != NULL) {
/* Got it! */
return;
}
} while (inner = inner->ci_sibling[CPUREL_PACKAGE],
inner != outer);
} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
outer != first);
}
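/*
 * The skim limiter above is a "next allowed tick" stamp: the scan is
 * skipped while getticks() has not yet reached spc_nextskim, and each scan
 * pushes the stamp skim_interval milliseconds into the future. The same
 * pattern as a standalone sketch, with a plain tick counter standing in
 * for getticks() (names are made up for the example):
 */
#if 0	/* illustrative sketch only; not compiled */
static unsigned next_allowed;		/* stand-in for spc_nextskim */

/* Return nonzero if the expensive scan may run at tick "now". */
static int
may_scan(unsigned now, unsigned interval_ticks)
{

	if (next_allowed > now)
		return 0;		/* still rate limited */
	next_allowed = now + interval_ticks;
	return 1;
}
#endif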
/*
* Called from mi_switch() when an LWP has been preempted / has yielded.
* The LWP is presently in the CPU's run queue. Here we look for a better
* CPU to teleport the LWP to; there may not be one.
*/
void
sched_preempted(struct lwp *l)
{
const int flags = SPCF_IDLE | SPCF_1STCLASS;
struct schedstate_percpu *tspc;
struct cpu_info *ci, *tci;
ci = l->l_cpu;
tspc = &ci->ci_schedstate;
KASSERT(tspc->spc_count >= 1);
/*
* Try to select another CPU if:
*
* - there is no migration pending already
* - and this LWP is running on a 2nd class CPU
* - or this LWP is a child of vfork() that has just done execve()
*/
if (l->l_target_cpu != NULL || ((tspc->spc_flags & SPCF_1STCLASS) != 0 &&
(l->l_pflag & LP_TELEPORT) == 0)) {
return;
}
/*
* Fast path: if the first SMT in the core is idle, send it back
* there, because the cache is shared (cheap) and we want all LWPs
* to be clustered on 1st class CPUs (either running there or on
* their runqueues).
*/
tci = ci->ci_sibling[CPUREL_CORE];
while (tci != ci) {
tspc = &tci->ci_schedstate;
if ((tspc->spc_flags & flags) == flags && sched_migratable(l, tci)) {
l->l_target_cpu = tci;
l->l_pflag &= ~LP_TELEPORT;
return;
}
tci = tci->ci_sibling[CPUREL_CORE];
}
if ((l->l_pflag & LP_TELEPORT) != 0) {
/*
* A child of vfork(): now that the parent is released,
* scatter far and wide, to match the LSIDL distribution
* done in sched_takecpu().
*/
l->l_pflag &= ~LP_TELEPORT;
tci = sched_bestcpu(l, sched_nextpkg());
if (tci != ci) {
l->l_target_cpu = tci;
}
} else {
/*
* Try to find a better CPU to take it, but don't move to
* another 2nd class CPU, and don't move to a non-idle CPU,
* because that would prevent SMT being used to maximise
* throughput.
*
* Search in the current CPU package in order to try and
* keep L2/L3 cache locality, but expand to include the
* whole system if needed.
*/
tci = sched_bestcpu(l, l->l_cpu);
if (tci != ci &&
(tci->ci_schedstate.spc_flags & flags) == flags) {
l->l_target_cpu = tci;
}
}
}
/*
* Called during execve() by a child of vfork(). Does two things:
*
* - If the parent has been awoken and put back on curcpu then give the
* CPU back to the parent.
*
* - If curlwp is not on a 1st class CPU then find somewhere else to run,
* since it dodged the distribution in sched_takecpu() when first set
* runnable.
*/
void
sched_vforkexec(struct lwp *l, bool samecpu)
{
KASSERT(l == curlwp);
if ((samecpu && ncpu > 1) ||
(l->l_cpu->ci_schedstate.spc_flags & SPCF_1STCLASS) == 0) {
l->l_pflag |= LP_TELEPORT;
preempt();
}
}
#else
/*
* stubs for !MULTIPROCESSOR
*/
struct cpu_info *
sched_takecpu(struct lwp *l)
{
return l->l_cpu;
}
void
sched_idle(void)
{
}
void
sched_preempted(struct lwp *l)
{
}
void
sched_vforkexec(struct lwp *l, bool samecpu)
{
KASSERT(l == curlwp);
}
#endif /* MULTIPROCESSOR */
/*
* Scheduling statistics and balancing.
*/
void
sched_lwp_stats(struct lwp *l)
{
int batch;
KASSERT(lwp_locked(l, NULL));
/* Update sleep time */
if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
l->l_stat == LSSUSPENDED)
l->l_slptime++;
/*
* Consider the thread CPU-bound if its accumulated run time exceeds its
* accumulated sleep time. The first such sample only latches LW_BATCH;
* the hook below is told the thread is CPU-bound from the second
* consecutive sample onwards.
*/
batch = (l->l_rticksum > l->l_slpticksum);
if (batch != 0) {
if ((l->l_flag & LW_BATCH) == 0)
batch = 0;
l->l_flag |= LW_BATCH;
} else
l->l_flag &= ~LW_BATCH;
/* Reset the time sums */
l->l_slpticksum = 0;
l->l_rticksum = 0;
/* Scheduler-specific hook */
sched_pstats_hook(l, batch);
#ifdef KDTRACE_HOOKS
curthread = l;
#endif
}
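/*
 * The LW_BATCH logic above is a two-sample filter: the first interval in
 * which run time exceeds sleep time only latches the flag, and the hook is
 * told the LWP is CPU-bound from the second consecutive such interval. A
 * standalone sketch of the same filter with a made-up state struct:
 */
#if 0	/* illustrative sketch only; not compiled */
#include <stdbool.h>

struct batch_state {
	bool latched;			/* stand-in for LW_BATCH */
};

/* Returns what would be passed as "batch" to the scheduler hook. */
static bool
batch_sample(struct batch_state *st, unsigned runticks, unsigned slpticks)
{
	bool cpubound = runticks > slpticks;
	bool report = cpubound && st->latched;

	st->latched = cpubound;
	return report;
}
#endif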
/*
* Scheduler mill.
*/
struct lwp *
sched_nextlwp(void)
{
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc;
TAILQ_HEAD(, lwp) *q_head;
struct lwp *l;
/* Update the last run time on switch */
l = curlwp;
l->l_rticksum += (getticks() - l->l_rticks);
/* Return to idle LWP if there is a migrating thread */
spc = &ci->ci_schedstate;
if (__predict_false(spc->spc_migrating != NULL))
return NULL;
/* Return to idle LWP if there is no runnable job */
if (__predict_false(spc->spc_count == 0))
return NULL;
/* Take the highest priority thread */
KASSERT(spc->spc_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
q_head = sched_getrq(spc, spc->spc_maxpriority);
l = TAILQ_FIRST(q_head);
KASSERT(l != NULL);
sched_oncpu(l);
l->l_rticks = getticks();
return l;
}
/*
* sched_curcpu_runnable_p: return if curcpu() should exit the idle loop.
*/
bool
sched_curcpu_runnable_p(void)
{
const struct cpu_info *ci;
const struct schedstate_percpu *spc;
bool rv;
kpreempt_disable();
ci = curcpu();
spc = &ci->ci_schedstate;
rv = (spc->spc_count != 0);
#ifndef __HAVE_FAST_SOFTINTS
rv |= (ci->ci_data.cpu_softints != 0);
#endif
kpreempt_enable();
return rv;
}
/*
* Sysctl nodes and initialization.
*/
SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
const struct sysctlnode *node = NULL;
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sched",
SYSCTL_DESCR("Scheduler options"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
if (node == NULL)
return;
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "cacheht_time",
SYSCTL_DESCR("Cache hotness time (in ms)"),
NULL, 0, &cacheht_time, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "skim_interval",
SYSCTL_DESCR("Rate limit for stealing from other CPUs (in ms)"),
NULL, 0, &skim_interval, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "min_catch",
SYSCTL_DESCR("Minimal count of threads for catching"),
NULL, 0, &min_catch, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "timesoftints",
SYSCTL_DESCR("Track CPU time for soft interrupts"),
NULL, 0, &softint_timing, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "kpreempt_pri",
SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
NULL, 0, &sched_kpreempt_pri, 0,
CTL_CREATE, CTL_EOL);
}
/*
* Debugging.
*/
#ifdef DDB
void
sched_print_runqueue(void (*pr)(const char *, ...))
{
struct cpu_info *ci, *tci;
struct schedstate_percpu *spc;
struct lwp *l;
struct proc *p;
CPU_INFO_ITERATOR cii;
for (CPU_INFO_FOREACH(cii, ci)) {
int i;
spc = &ci->ci_schedstate;
(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
(*pr)(" pid.lid = %d.%d, r_count = %u, "
"maxpri = %d, mlwp = %p\n",
#ifdef MULTIPROCESSOR
ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
#else
curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
spc->spc_count, spc->spc_maxpriority,
spc->spc_migrating);
i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
do {
uint32_t q;
q = spc->spc_bitmap[i];
(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
} while (i--);
}
(*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
"LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");
PROCLIST_FOREACH(p, &allproc) {
(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
ci = l->l_cpu;
tci = l->l_target_cpu;
(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
(int)l->l_lid, l->l_priority, lwp_eprio(l),
l->l_flag, l->l_stat == LSRUN ? "RQ" :
(l->l_stat == LSSLEEP ? "SQ" : "-"),
l, ci->ci_index, (tci ? tci->ci_index : -1),
(u_int)(getticks() - l->l_rticks));
}
}
}
#endif
/* $NetBSD: clockctl_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $ */
/*-
* Copyright (c) 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Emmanuel Dreyfus.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: clockctl_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/device.h>
#include <sys/time.h>
#include <sys/conf.h>
#include <sys/timex.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/compat_stub.h>
#include <sys/clockctl.h>
#include <compat/sys/clockctl.h>
#include <compat/sys/time_types.h>
int
compat50_clockctlioctl(dev_t dev, u_long cmd, void *data, int flags,
struct lwp *l)
{
int error = 0;
const struct cdevsw *cd = cdevsw_lookup(dev);
if (cd == NULL || cd->d_ioctl == NULL)
return ENXIO;
switch (cmd) {
case CLOCKCTL_OSETTIMEOFDAY: {
struct timeval50 tv50;
struct timeval tv;
struct clockctl50_settimeofday *args = data;
error = copyin(args->tv, &tv50, sizeof(tv50));
if (error)
return (error);
timeval50_to_timeval(&tv50, &tv);
error = settimeofday1(&tv, false, args->tzp, l, false);
break;
}
case CLOCKCTL_OADJTIME: {
struct timeval atv, oldatv;
struct timeval50 atv50;
struct clockctl50_adjtime *args = data;
if (args->delta) {
error = copyin(args->delta, &atv50, sizeof(atv50));
if (error)
return (error);
timeval50_to_timeval(&atv50, &atv);
}
adjtime1(args->delta ? &atv : NULL,
args->olddelta ? &oldatv : NULL, l->l_proc);
if (args->olddelta) {
timeval_to_timeval50(&oldatv, &atv50);
error = copyout(&atv50, args->olddelta, sizeof(atv50));
}
break;
}
case CLOCKCTL_OCLOCK_SETTIME: {
struct timespec50 tp50;
struct timespec tp;
struct clockctl50_clock_settime *args = data;
error = copyin(args->tp, &tp50, sizeof(tp50));
if (error)
return (error);
timespec50_to_timespec(&tp50, &tp);
error = clock_settime1(l->l_proc, args->clock_id, &tp, true);
break;
}
case CLOCKCTL_ONTP_ADJTIME: {
if (vec_ntp_timestatus == NULL) {
error = ENOTTY;
break;
}
/* The ioctl number changed but the data did not change. */
error = (cd->d_ioctl)(dev, CLOCKCTL_NTP_ADJTIME,
data, flags, l);
break;
}
default:
error = ENOTTY;
}
return (error);
}
void
clockctl_50_init(void)
{
MODULE_HOOK_SET(clockctl_ioctl_50_hook, compat50_clockctlioctl);
}
void
clockctl_50_fini(void)
{
MODULE_HOOK_UNSET(clockctl_ioctl_50_hook);
}
/* $NetBSD: kern_synch.c,v 1.366 2023/11/22 13:18:48 riastradh Exp $ */
/*-
* Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
* Daniel Sieger.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.366 2023/11/22 13:18:48 riastradh Exp $");
#include "opt_kstack.h"
#include "opt_ddb.h"
#include "opt_dtrace.h"
#define __MUTEX_PRIVATE
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/dtrace_bsd.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/lwpctl.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/syscall_stats.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <uvm/uvm_extern.h>
#include <dev/lockstat.h>
int dtrace_vtime_active=0;
dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
#ifdef DDB
#include <ddb/ddb.h>
#endif
static void sched_unsleep(struct lwp *, bool);
static void sched_changepri(struct lwp *, pri_t);
static void sched_lendpri(struct lwp *, pri_t);
syncobj_t sleep_syncobj = {
.sobj_name = "sleep",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = sleepq_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
syncobj_t sched_syncobj = {
.sobj_name = "sched",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_USER,
.sobj_unsleep = sched_unsleep,
.sobj_changepri = sched_changepri,
.sobj_lendpri = sched_lendpri,
.sobj_owner = syncobj_noowner,
};
syncobj_t kpause_syncobj = {
.sobj_name = "kpause",
.sobj_flag = SOBJ_SLEEPQ_NULL,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = sleepq_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
/* "Lightning bolt": once a second sleep address. */
kcondvar_t lbolt __cacheline_aligned;
u_int sched_pstats_ticks __cacheline_aligned;
/* Preemption event counters. */
static struct evcnt kpreempt_ev_crit __cacheline_aligned;
static struct evcnt kpreempt_ev_klock __cacheline_aligned;
static struct evcnt kpreempt_ev_immed __cacheline_aligned;
void
synch_init(void)
{
cv_init(&lbolt, "lbolt");
evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
"kpreempt", "defer: critical section");
evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
"kpreempt", "defer: kernel_lock");
evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
"kpreempt", "immediate");
}
/*
* OBSOLETE INTERFACE
*
* General sleep call. Suspends the current LWP until a wakeup is
* performed on the specified identifier. The LWP will then be made
* runnable with the specified priority. Sleeps at most timo/hz seconds (0
* means no timeout). If pri includes PCATCH flag, signals are checked
* before and after sleeping, else signals are not checked. Returns 0 if
* awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
* signal needs to be delivered, ERESTART is returned if the current system
* call should be restarted if possible, and EINTR is returned if the system
* call should be interrupted by the signal.
*/
int
tsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo)
{
struct lwp *l = curlwp;
sleepq_t *sq;
kmutex_t *mp;
bool catch_p;
int nlocks;
KASSERT((l->l_pflag & LP_INTR) == 0);
KASSERT(ident != &lbolt);
//KASSERT(KERNEL_LOCKED_P());
if (sleepq_dontsleep(l)) {
(void)sleepq_abort(NULL, 0);
return 0;
}
catch_p = priority & PCATCH;
sq = sleeptab_lookup(&sleeptab, ident, &mp);
nlocks = sleepq_enter(sq, l, mp);
sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
return sleepq_block(timo, catch_p, &sleep_syncobj, nlocks);
}
int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
kmutex_t *mtx)
{
struct lwp *l = curlwp;
sleepq_t *sq;
kmutex_t *mp;
bool catch_p;
int error, nlocks;
KASSERT((l->l_pflag & LP_INTR) == 0);
KASSERT(ident != &lbolt);
if (sleepq_dontsleep(l)) {
(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
return 0;
}
catch_p = priority & PCATCH;
sq = sleeptab_lookup(&sleeptab, ident, &mp);
nlocks = sleepq_enter(sq, l, mp);
sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
mutex_exit(mtx);
error = sleepq_block(timo, catch_p, &sleep_syncobj, nlocks);
if ((priority & PNORELOCK) == 0)
mutex_enter(mtx);
return error;
}
/*
* General sleep call for situations where a wake-up is not expected.
*/
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
struct lwp *l = curlwp;
int error, nlocks;
KASSERTMSG(timo != 0 || intr, "wmesg=%s intr=%s timo=%d mtx=%p",
wmesg, intr ? "true" : "false", timo, mtx);
if (sleepq_dontsleep(l))
return sleepq_abort(NULL, 0);
if (mtx != NULL)
mutex_exit(mtx);
nlocks = sleepq_enter(NULL, l, NULL);
sleepq_enqueue(NULL, l, wmesg, &kpause_syncobj, intr);
error = sleepq_block(timo, intr, &kpause_syncobj, nlocks);
if (mtx != NULL)
mutex_enter(mtx);
return error;
}
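/*
 * kpause() is the usual way for kernel code to pause when no wakeup is
 * expected. A hedged usage sketch: a hypothetical polling loop that probes
 * some device roughly every 100ms (hw_ready() and the "hwpoll" wmesg are
 * invented for the example, and no interlock is passed):
 */
#if 0	/* illustrative sketch only; not compiled */
static int
wait_for_hardware(void)
{
	int tries;

	for (tries = 0; tries < 50; tries++) {
		if (hw_ready())		/* hypothetical readiness check */
			return 0;
		/* Uninterruptible sleep for about 100ms. */
		(void)kpause("hwpoll", false, mstohz(100), NULL);
	}
	return ETIMEDOUT;
}
#endif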
/*
* OBSOLETE INTERFACE
*
* Make all LWPs sleeping on the specified identifier runnable.
*/
void
wakeup(wchan_t ident)
{
sleepq_t *sq;
kmutex_t *mp;
if (__predict_false(cold))
return;
sq = sleeptab_lookup(&sleeptab, ident, &mp);
sleepq_wake(sq, ident, (u_int)-1, mp);
}
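/*
 * Although obsolete, tsleep()/wakeup() still show the basic contract: the
 * sleeper and the waker agree on an arbitrary wait channel address. A
 * hedged sketch with an invented flag and channel; it glosses over the
 * interlocking a real caller needs to avoid a lost wakeup, which is one
 * reason condvars are preferred today:
 */
#if 0	/* illustrative sketch only; not compiled */
static volatile int work_done;		/* hypothetical shared state */

static void
consumer_wait(void)
{

	while (!work_done) {
		/* Sleep on &work_done until a producer calls wakeup(). */
		(void)tsleep(&work_done, PWAIT, "workwt", 0);
	}
}

static void
producer_finish(void)
{

	work_done = 1;
	wakeup(&work_done);
}
#endif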
/*
* General yield call. Puts the current LWP back on its run queue and
* performs a context switch.
*/
void
yield(void)
{
struct lwp *l = curlwp;
int nlocks;
KERNEL_UNLOCK_ALL(l, &nlocks);
lwp_lock(l);
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
KASSERT(l->l_stat == LSONPROC);
spc_lock(l->l_cpu);
mi_switch(l);
KERNEL_LOCK(nlocks, l);
}
/*
* General preemption call. Puts the current LWP back on its run queue
* and performs an involuntary context switch. Different from yield()
* in that:
*
* - It's counted differently (involuntary vs. voluntary).
* - Realtime threads go to the head of their runqueue vs. tail for yield().
*/
void
preempt(void)
{
struct lwp *l = curlwp;
int nlocks;
KERNEL_UNLOCK_ALL(l, &nlocks);
lwp_lock(l);
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
KASSERT(l->l_stat == LSONPROC);
spc_lock(l->l_cpu);
l->l_pflag |= LP_PREEMPTING;
mi_switch(l);
KERNEL_LOCK(nlocks, l);
}
/*
* Return true if the current LWP should yield the processor. Intended to
* be used by long-running code in the kernel.
*/
inline bool
preempt_needed(void)
{
lwp_t *l = curlwp;
int needed;
KPREEMPT_DISABLE(l);
needed = l->l_cpu->ci_want_resched;
KPREEMPT_ENABLE(l);
return (needed != 0);
}
/*
* A breathing point for long-running code in the kernel.
*/
void
preempt_point(void)
{
if (__predict_false(preempt_needed())) {
preempt();
}
}
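/*
 * preempt_point() is meant to be dropped into long-running kernel loops so
 * the CPU can be handed over when someone of higher priority wants it. A
 * hedged sketch of such a loop (struct item and process_one_item() are
 * invented for the example):
 */
#if 0	/* illustrative sketch only; not compiled */
static void
process_many(struct item *items, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++) {
		process_one_item(&items[i]);	/* hypothetical unit of work */
		/* Give up the CPU here if another LWP wants it. */
		preempt_point();
	}
}
#endif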
/*
* Handle a request made by another agent to preempt the current LWP
* in-kernel. Usually called when l_dopreempt may be non-zero.
*
* Character addresses for lockstat only.
*/
static char kpreempt_is_disabled;
static char kernel_lock_held;
static char is_softint_lwp;
static char spl_is_raised;
bool
kpreempt(uintptr_t where)
{
uintptr_t failed;
lwp_t *l;
int s, dop, lsflag;
l = curlwp;
failed = 0;
while ((dop = l->l_dopreempt) != 0) {
if (l->l_stat != LSONPROC) {
/*
* About to block (or die), let it happen.
* Doesn't really count as "preemption has
* been blocked", since we're going to
* context switch.
*/
atomic_swap_uint(&l->l_dopreempt, 0);
return true;
}
KASSERT((l->l_flag & LW_IDLE) == 0);
if (__predict_false(l->l_nopreempt != 0)) {
/* LWP holds preemption disabled, explicitly. */
if ((dop & DOPREEMPT_COUNTED) == 0) {
kpreempt_ev_crit.ev_count++;
}
failed = (uintptr_t)&kpreempt_is_disabled;
break;
}
if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
/* Can't preempt soft interrupts yet. */
atomic_swap_uint(&l->l_dopreempt, 0);
failed = (uintptr_t)&is_softint_lwp;
break;
}
s = splsched();
if (__predict_false(l->l_blcnt != 0 ||
curcpu()->ci_biglock_wanted != NULL)) {
/* Hold or want kernel_lock, code is not MT safe. */
splx(s);
if ((dop & DOPREEMPT_COUNTED) == 0) {
kpreempt_ev_klock.ev_count++;
}
failed = (uintptr_t)&kernel_lock_held;
break;
}
if (__predict_false(!cpu_kpreempt_enter(where, s))) {
/*
* It may be that the IPL is too high.
* cpu_kpreempt_enter() can schedule an
* interrupt to retry later.
*/
splx(s);
failed = (uintptr_t)&spl_is_raised;
break;
}
/* Do it! */
if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
kpreempt_ev_immed.ev_count++;
}
lwp_lock(l);
l->l_pflag |= LP_PREEMPTING;
spc_lock(l->l_cpu);
mi_switch(l);
l->l_nopreempt++;
splx(s);
/* Take care of any MD cleanup. */
cpu_kpreempt_exit(where);
l->l_nopreempt--;
}
if (__predict_true(!failed)) {
return false;
}
/* Record preemption failure for reporting via lockstat. */
atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
lsflag = 0;
LOCKSTAT_ENTER(lsflag);
if (__predict_false(lsflag)) {
if (where == 0) {
where = (uintptr_t)__builtin_return_address(0);
}
/* Preemption is on, might recurse, so make it atomic. */
if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
(void *)where) == NULL) {
LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
l->l_pfaillock = failed;
}
}
LOCKSTAT_EXIT(lsflag);
return true;
}
/*
* Return true if preemption is explicitly disabled.
*/
bool
kpreempt_disabled(void)
{
const lwp_t *l = curlwp;
return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
(l->l_flag & LW_IDLE) != 0 || (l->l_pflag & LP_INTR) != 0 ||
cpu_kpreempt_disabled();
}
/*
* Disable kernel preemption.
*/
void
kpreempt_disable(void)
{
KPREEMPT_DISABLE(curlwp);
}
/*
* Reenable kernel preemption.
*/
void
kpreempt_enable(void)
{
KPREEMPT_ENABLE(curlwp);
}
/*
* Compute the amount of time during which the current lwp was running.
*
* - update l_rtime unless it's an idle lwp.
*/
void
updatertime(lwp_t *l, const struct bintime *now)
{
static bool backwards = false;
if (__predict_false(l->l_flag & LW_IDLE))
return;
if (__predict_false(bintimecmp(now, &l->l_stime, <)) && !backwards) {
char caller[128];
#ifdef DDB
db_symstr(caller, sizeof(caller),
(db_expr_t)(intptr_t)__builtin_return_address(0),
DB_STGY_PROC);
#else
snprintf(caller, sizeof(caller), "%p",
__builtin_return_address(0));
#endif
backwards = true;
printf("WARNING: lwp %ld (%s%s%s) flags 0x%x:"
" timecounter went backwards"
" from (%jd + 0x%016"PRIx64"/2^64) sec"
" to (%jd + 0x%016"PRIx64"/2^64) sec"
" in %s\n",
(long)l->l_lid,
l->l_proc->p_comm,
l->l_name ? " " : "",
l->l_name ? l->l_name : "",
l->l_pflag,
(intmax_t)l->l_stime.sec, l->l_stime.frac,
(intmax_t)now->sec, now->frac,
caller);
}
/* rtime += now - stime */
bintime_add(&l->l_rtime, now);
bintime_sub(&l->l_rtime, &l->l_stime);
}
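/*
 * The "rtime += now - stime" update above is 64.64 fixed-point arithmetic:
 * struct bintime holds whole seconds plus a 64-bit binary fraction, and
 * add/subtract must carry or borrow between the two fields. A standalone
 * sketch of that carry logic with a local stand-in type (the real helpers
 * live in <sys/time.h>):
 */
#if 0	/* illustrative sketch only; not compiled */
#include <stdint.h>

struct mybintime {
	int64_t  sec;
	uint64_t frac;		/* units of 1/2^64 second */
};

static void
mybintime_add(struct mybintime *bt, const struct mybintime *bt2)
{
	uint64_t ofrac = bt->frac;

	bt->frac += bt2->frac;
	if (bt->frac < ofrac)		/* fraction wrapped: carry a second */
		bt->sec++;
	bt->sec += bt2->sec;
}

static void
mybintime_sub(struct mybintime *bt, const struct mybintime *bt2)
{
	uint64_t ofrac = bt->frac;

	bt->frac -= bt2->frac;
	if (bt->frac > ofrac)		/* fraction wrapped: borrow a second */
		bt->sec--;
	bt->sec -= bt2->sec;
}
#endif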
/*
* Select the next LWP to run on the current CPU.
*/
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
lwp_t *newl;
/*
* Let sched_nextlwp() select the LWP to run the CPU next.
* If no LWP is runnable, select the idle LWP.
*
* On arrival here LWPs on a run queue are locked by spc_mutex which
* is currently held. Idle LWPs are always locked by spc_lwplock,
* which may or may not be held here. On exit from this code block,
* in all cases newl is locked by spc_lwplock.
*/
newl = sched_nextlwp();
if (newl != NULL) {
sched_dequeue(newl);
KASSERT(lwp_locked(newl, spc->spc_mutex));
KASSERT(newl->l_cpu == ci);
newl->l_stat = LSONPROC;
newl->l_pflag |= LP_RUNNING;
newl->l_boostpri = PRI_NONE;
spc->spc_curpriority = lwp_eprio(newl);
spc->spc_flags &= ~(SPCF_SWITCHCLEAR | SPCF_IDLE);
lwp_setlock(newl, spc->spc_lwplock);
} else {
/*
* The idle LWP does not get set to LSONPROC, because
* otherwise it screws up the output from top(1) etc.
*/
newl = ci->ci_data.cpu_idlelwp;
newl->l_pflag |= LP_RUNNING;
spc->spc_curpriority = PRI_IDLE;
spc->spc_flags = (spc->spc_flags & ~SPCF_SWITCHCLEAR) |
SPCF_IDLE;
}
/*
* Only clear want_resched if there are no pending (slow) software
* interrupts. We can do this without an atomic, because no new
* LWPs can appear in the queue due to our hold on spc_mutex, and
* the update to ci_want_resched will become globally visible before
* the release of spc_mutex becomes globally visible.
*/
if (ci->ci_data.cpu_softints == 0)
ci->ci_want_resched = 0;
return newl;
}
/*
* The machine independent parts of context switch.
*
* NOTE: l->l_cpu is not changed in this routine, because an LWP never
* changes its own l_cpu (that would screw up curcpu on many ports and could
* cause all kinds of other evil stuff). l_cpu is always changed by some
* other actor, when it's known the LWP is not running (the LP_RUNNING flag
* is checked under lock).
*/
void
mi_switch(lwp_t *l)
{
struct cpu_info *ci;
struct schedstate_percpu *spc;
struct lwp *newl;
kmutex_t *lock;
int oldspl;
struct bintime bt;
bool returning;
KASSERT(lwp_locked(l, NULL));
KASSERT(kpreempt_disabled());
KASSERT(mutex_owned(curcpu()->ci_schedstate.spc_mutex));
KASSERTMSG(l->l_blcnt == 0, "kernel_lock leaked");
kstack_check_magic(l);
binuptime(&bt);
KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
KASSERT((l->l_pflag & LP_RUNNING) != 0);
KASSERT(l->l_cpu == curcpu() || l->l_stat == LSRUN);
ci = curcpu();
spc = &ci->ci_schedstate;
returning = false;
newl = NULL;
/*
* If we have been asked to switch to a specific LWP, then there
* is no need to inspect the run queues. If a soft interrupt is
* blocking, then return to the interrupted thread without adjusting
* VM context or its start time: neither have been changed in order
* to take the interrupt.
*/
if (l->l_switchto != NULL) {
if ((l->l_pflag & LP_INTR) != 0) {
returning = true;
softint_block(l);
if ((l->l_pflag & LP_TIMEINTR) != 0)
updatertime(l, &bt);
}
newl = l->l_switchto;
l->l_switchto = NULL;
}
#ifndef __HAVE_FAST_SOFTINTS
else if (ci->ci_data.cpu_softints != 0) {
/* There are pending soft interrupts, so pick one. */
newl = softint_picklwp();
newl->l_stat = LSONPROC;
newl->l_pflag |= LP_RUNNING;
}
#endif /* !__HAVE_FAST_SOFTINTS */
/*
* If on the CPU and we have gotten this far, then we must yield.
*/
if (l->l_stat == LSONPROC && l != newl) {
KASSERT(lwp_locked(l, spc->spc_lwplock));
KASSERT((l->l_flag & LW_IDLE) == 0);
l->l_stat = LSRUN;
lwp_setlock(l, spc->spc_mutex);
sched_enqueue(l);
sched_preempted(l);
/*
* Handle migration. Note that "migrating LWP" may
* be reset here, if interrupt/preemption happens
* early in idle LWP.
*/
if (l->l_target_cpu != NULL && (l->l_pflag & LP_BOUND) == 0) {
KASSERT((l->l_pflag & LP_INTR) == 0);
spc->spc_migrating = l;
}
}
/* Pick new LWP to run. */
if (newl == NULL) {
newl = nextlwp(ci, spc);
}
/* Items that must be updated with the CPU locked. */
if (!returning) {
/* Count time spent in current system call */
SYSCALL_TIME_SLEEP(l);
updatertime(l, &bt);
/* Update the new LWP's start time. */
newl->l_stime = bt;
/*
* ci_curlwp changes when a fast soft interrupt occurs.
* We use ci_onproc to keep track of which kernel or
* user thread is running 'underneath' the software
* interrupt. This is important for time accounting,
* itimers and forcing user threads to preempt (aston).
*/
ci->ci_onproc = newl;
}
/*
* Preemption related tasks. Must be done holding spc_mutex. Clear
* l_dopreempt without an atomic - it's only ever set non-zero by
* sched_resched_cpu() which also holds spc_mutex, and only ever
* cleared by the LWP itself (us) with atomics when not under lock.
*/
l->l_dopreempt = 0;
if (__predict_false(l->l_pfailaddr != 0)) {
LOCKSTAT_FLAG(lsflag);
LOCKSTAT_ENTER(lsflag);
LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
1, l->l_pfailtime, l->l_pfailaddr);
LOCKSTAT_EXIT(lsflag);
l->l_pfailtime = 0;
l->l_pfaillock = 0;
l->l_pfailaddr = 0;
}
if (l != newl) {
struct lwp *prevlwp;
/* Release all locks, but leave the current LWP locked */
if (l->l_mutex == spc->spc_mutex) {
/*
* Drop spc_lwplock, if the current LWP has been moved
* to the run queue (it is now locked by spc_mutex).
*/
mutex_spin_exit(spc->spc_lwplock);
} else {
/*
* Otherwise, drop the spc_mutex, we are done with the
* run queues.
*/
mutex_spin_exit(spc->spc_mutex);
}
/* We're down to only one lock, so do debug checks. */
LOCKDEBUG_BARRIER(l->l_mutex, 1);
/* Count the context switch. */
CPU_COUNT(CPU_COUNT_NSWTCH, 1);
if ((l->l_pflag & LP_PREEMPTING) != 0) {
l->l_ru.ru_nivcsw++;
l->l_pflag &= ~LP_PREEMPTING;
} else {
l->l_ru.ru_nvcsw++;
}
/*
* Increase the count of spin-mutexes before the release
* of the last lock - we must remain at IPL_SCHED after
* releasing the lock.
*/
KASSERTMSG(ci->ci_mtx_count == -1,
"%s: cpu%u: ci_mtx_count (%d) != -1 "
"(block with spin-mutex held)",
__func__, cpu_index(ci), ci->ci_mtx_count);
oldspl = MUTEX_SPIN_OLDSPL(ci);
ci->ci_mtx_count = -2;
/* Update status for lwpctl, if present. */
if (l->l_lwpctl != NULL) {
l->l_lwpctl->lc_curcpu = (l->l_stat == LSZOMB ?
LWPCTL_CPU_EXITED : LWPCTL_CPU_NONE);
}
/*
* If curlwp is a soft interrupt LWP, there's nobody on the
* other side to unlock - we're returning into an assembly
* trampoline. Unlock now. This is safe because this is a
* kernel LWP and is bound to current CPU: the worst anyone
* else will do to it, is to put it back onto this CPU's run
* queue (and the CPU is busy here right now!).
*/
if (returning) {
/* Keep IPL_SCHED after this; MD code will fix up. */
l->l_pflag &= ~LP_RUNNING;
lwp_unlock(l);
} else {
/* A normal LWP: save old VM context. */
pmap_deactivate(l);
}
/*
* If DTrace has set the active vtime enum to anything
* other than INACTIVE (0), then it should have set the
* function to call.
*/
if (__predict_false(dtrace_vtime_active)) {
(*dtrace_vtime_switch_func)(newl);
}
/*
* We must ensure not to come here from inside a read section.
*/
KASSERT(pserialize_not_in_read_section());
/* Switch to the new LWP. */
#ifdef MULTIPROCESSOR
KASSERT(curlwp == ci->ci_curlwp);
#endif
KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
prevlwp = cpu_switchto(l, newl, returning);
ci = curcpu();
#ifdef MULTIPROCESSOR
KASSERT(curlwp == ci->ci_curlwp);
#endif
KASSERTMSG(l == curlwp, "l %p curlwp %p prevlwp %p",
l, curlwp, prevlwp);
KASSERT(prevlwp != NULL);
KASSERT(l->l_cpu == ci);
KASSERT(ci->ci_mtx_count == -2);
/*
* Immediately mark the previous LWP as no longer running
* and unlock (to keep lock wait times short as possible).
* We'll still be at IPL_SCHED afterwards. If a zombie,
* don't touch after clearing LP_RUNNING as it could be
* reaped by another CPU. Issue a memory barrier to ensure
* this.
*
* atomic_store_release matches atomic_load_acquire in
* lwp_free.
*/
KASSERT((prevlwp->l_pflag & LP_RUNNING) != 0);
lock = prevlwp->l_mutex;
if (__predict_false(prevlwp->l_stat == LSZOMB)) {
atomic_store_release(&prevlwp->l_pflag,
prevlwp->l_pflag & ~LP_RUNNING);
} else {
prevlwp->l_pflag &= ~LP_RUNNING;
}
mutex_spin_exit(lock);
/*
* Switched away - we have new curlwp.
* Restore VM context and IPL.
*/
pmap_activate(l);
pcu_switchpoint(l);
/* Update status for lwpctl, if present. */
if (l->l_lwpctl != NULL) {
l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
l->l_lwpctl->lc_pctr++;
}
/*
* Normalize the spin mutex count and restore the previous
* SPL. Note that, unless the caller disabled preemption,
* we can be preempted at any time after this splx().
*/
KASSERT(l->l_cpu == ci);
KASSERT(ci->ci_mtx_count == -1);
ci->ci_mtx_count = 0;
splx(oldspl);
} else {
/* Nothing to do - just unlock and return. */
mutex_spin_exit(spc->spc_mutex);
l->l_pflag &= ~LP_PREEMPTING;
lwp_unlock(l);
}
KASSERT(l == curlwp);
KASSERT(l->l_stat == LSONPROC || (l->l_flag & LW_IDLE) != 0);
SYSCALL_TIME_WAKEUP(l);
LOCKDEBUG_BARRIER(NULL, 1);
}
/*
* setrunnable: change LWP state to be runnable, placing it on the run queue.
*
* Call with the process and LWP locked. Will return with the LWP unlocked.
*/
void
setrunnable(struct lwp *l)
{
struct proc *p = l->l_proc;
struct cpu_info *ci;
kmutex_t *oldlock;
KASSERT((l->l_flag & LW_IDLE) == 0);
KASSERT((l->l_flag & LW_DBGSUSPEND) == 0);
KASSERT(mutex_owned(p->p_lock));
KASSERT(lwp_locked(l, NULL));
KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);
switch (l->l_stat) {
case LSSTOP:
/*
* If we're being traced (possibly because someone attached us
* while we were stopped), check for a signal from the debugger.
*/
if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xsig != 0)
signotify(l);
p->p_nrlwps++;
break;
case LSSUSPENDED:
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
l->l_flag &= ~LW_WSUSPEND;
p->p_nrlwps++;
cv_broadcast(&p->p_lwpcv);
break;
case LSSLEEP:
KASSERT(l->l_wchan != NULL);
break;
case LSIDL:
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
break;
default:
panic("setrunnable: lwp %p state was %d", l, l->l_stat);
}
/*
* If the LWP was sleeping, start it again.
*/
if (l->l_wchan != NULL) {
l->l_stat = LSSLEEP;
/* lwp_unsleep() will release the lock. */
lwp_unsleep(l, true);
return;
}
/*
* If the LWP is still on the CPU, mark it as LSONPROC. It may be
* about to call mi_switch(), in which case it will yield.
*/
if ((l->l_pflag & LP_RUNNING) != 0) {
l->l_stat = LSONPROC;
l->l_slptime = 0;
lwp_unlock(l);
return;
}
/*
* Look for a CPU to run.
* Set the LWP runnable.
*/
ci = sched_takecpu(l);
l->l_cpu = ci;
spc_lock(ci);
oldlock = lwp_setlock(l, l->l_cpu->ci_schedstate.spc_mutex);
sched_setrunnable(l);
l->l_stat = LSRUN;
l->l_slptime = 0;
sched_enqueue(l);
sched_resched_lwp(l, true);
/* SPC & LWP now unlocked. */
mutex_spin_exit(oldlock);
}
/*
* suspendsched:
*
* Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
*/
void
suspendsched(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct lwp *l;
struct proc *p;
/*
* We do this by process in order not to violate the locking rules.
*/
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
mutex_enter(p->p_lock);
if ((p->p_flag & PK_SYSTEM) != 0) {
mutex_exit(p->p_lock);
continue;
}
if (p->p_stat != SSTOP) {
if (p->p_stat != SZOMB && p->p_stat != SDEAD) {
p->p_pptr->p_nstopchild++;
p->p_waited = 0;
}
p->p_stat = SSTOP;
}
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (l == curlwp)
continue;
lwp_lock(l);
/*
* Set LW_WREBOOT so that the LWP will suspend itself
* when it tries to return to user mode. We want to
* get as many LWPs as possible to
* the user / kernel boundary, so that they will
* release any locks that they hold.
*/
l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
if (l->l_stat == LSSLEEP &&
(l->l_flag & LW_SINTR) != 0) {
/* setrunnable() will release the lock. */
setrunnable(l);
continue;
}
lwp_unlock(l);
}
mutex_exit(p->p_lock);
}
mutex_exit(&proc_lock);
/*
* Kick all CPUs to make them preempt any LWPs running in user mode.
* They'll trap into the kernel and suspend themselves in userret().
*
* Unusually, we don't hold any other scheduler object locked, which
* would keep preemption off for sched_resched_cpu(), so disable it
* explicitly.
*/
kpreempt_disable();
for (CPU_INFO_FOREACH(cii, ci)) {
spc_lock(ci);
sched_resched_cpu(ci, PRI_KERNEL, true);
/* spc now unlocked */
}
kpreempt_enable();
}
/*
* sched_unsleep:
*
* This is called when the LWP has not been awoken normally but instead
* interrupted: for example, if the sleep timed out. Because of this,
* it's not a valid action for running or idle LWPs.
*/
static void
sched_unsleep(struct lwp *l, bool cleanup)
{
lwp_unlock(l);
panic("sched_unsleep");
}
static void
sched_changepri(struct lwp *l, pri_t pri)
{
struct schedstate_percpu *spc;
struct cpu_info *ci;
KASSERT(lwp_locked(l, NULL));
ci = l->l_cpu;
spc = &ci->ci_schedstate;
if (l->l_stat == LSRUN) {
KASSERT(lwp_locked(l, spc->spc_mutex));
sched_dequeue(l);
l->l_priority = pri;
sched_enqueue(l);
sched_resched_lwp(l, false);
} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
/* On priority drop, only evict realtime LWPs. */
KASSERT(lwp_locked(l, spc->spc_lwplock));
l->l_priority = pri;
spc_lock(ci);
sched_resched_cpu(ci, spc->spc_maxpriority, true);
/* spc now unlocked */
} else {
l->l_priority = pri;
}
}
static void
sched_lendpri(struct lwp *l, pri_t pri)
{
struct schedstate_percpu *spc;
struct cpu_info *ci;
KASSERT(lwp_locked(l, NULL));
ci = l->l_cpu;
spc = &ci->ci_schedstate;
if (l->l_stat == LSRUN) {
KASSERT(lwp_locked(l, spc->spc_mutex));
sched_dequeue(l);
l->l_inheritedprio = pri;
l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
sched_enqueue(l);
sched_resched_lwp(l, false);
} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
/* On priority drop, only evict realtime LWPs. */
KASSERT(lwp_locked(l, spc->spc_lwplock));
l->l_inheritedprio = pri;
l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
spc_lock(ci);
sched_resched_cpu(ci, spc->spc_maxpriority, true);
/* spc now unlocked */
} else {
l->l_inheritedprio = pri;
l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
}
}
struct lwp *
syncobj_noowner(wchan_t wchan)
{
return NULL;
}
/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;
/*
* Constants for averages over 1, 5 and 15 minutes when sampling at
* 5 second intervals.
*/
static const fixpt_t cexp[ ] = {
0.9200444146293232 * FSCALE, /* exp(-1/12) */
0.9834714538216174 * FSCALE, /* exp(-1/60) */
0.9944598480048967 * FSCALE, /* exp(-1/180) */
};
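/*
 * Both ccpu and cexp[] are decay factors stored as fixed-point fractions
 * scaled by FSCALE; multiplying by one and shifting right by FSHIFT applies
 * a single decay step. Applied once per second, ccpu = exp(-1/20) leaves
 * roughly exp(-3), i.e. about 5%, of p_pctcpu after 60 steps, which is
 * where the "decay 95% in 60 seconds" figure comes from. A standalone
 * sketch of that arithmetic, assuming the customary FSHIFT of 11:
 */
#if 0	/* illustrative sketch only; not compiled */
#include <stdio.h>
#include <stdint.h>

#define MY_FSHIFT	11
#define MY_FSCALE	(1 << MY_FSHIFT)

int
main(void)
{
	const uint64_t my_ccpu = (uint64_t)(0.95122942450071400909 * MY_FSCALE);
	uint64_t pctcpu = MY_FSCALE;	/* start at "100%" */
	int i;

	for (i = 0; i < 60; i++)
		pctcpu = (pctcpu * my_ccpu) >> MY_FSHIFT;
	/* Roughly 5% of the starting value survives 60 decay steps. */
	printf("%.1f%%\n", 100.0 * pctcpu / MY_FSCALE);
	return 0;
}
#endif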
/*
* sched_pstats:
*
* => Update process statistics and check CPU resource allocation.
* => Call scheduler-specific hook to eventually adjust LWP priorities.
* => Compute load average of a quantity on 1, 5 and 15 minute intervals.
*/
void
sched_pstats(void)
{
struct loadavg *avg = &averunnable;
const int clkhz = (stathz != 0 ? stathz : hz);
static bool backwardslwp = false;
static bool backwardsproc = false;
static u_int lavg_count = 0;
struct proc *p;
int nrun;
sched_pstats_ticks++;
if (++lavg_count >= 5) {
lavg_count = 0;
nrun = 0;
}
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
struct lwp *l;
struct rlimit *rlim;
time_t runtm;
int sig;
/* Increment sleep time (if sleeping), ignore overflow. */
mutex_enter(p->p_lock);
runtm = p->p_rtime.sec;
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
fixpt_t lpctcpu;
u_int lcpticks;
if (__predict_false((l->l_flag & LW_IDLE) != 0))
continue;
lwp_lock(l);
if (__predict_false(l->l_rtime.sec < 0) &&
!backwardslwp) {
backwardslwp = true;
printf("WARNING: lwp %ld (%s%s%s): "
"negative runtime: "
"(%jd + 0x%016"PRIx64"/2^64) sec\n",
(long)l->l_lid,
l->l_proc->p_comm,
l->l_name ? " " : "",
l->l_name ? l->l_name : "",
(intmax_t)l->l_rtime.sec,
l->l_rtime.frac);
}
runtm += l->l_rtime.sec;
l->l_swtime++;
sched_lwp_stats(l);
/* For load average calculation. */
if (__predict_false(lavg_count == 0) &&
(l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
switch (l->l_stat) {
case LSSLEEP:
if (l->l_slptime > 1) {
break;
}
/* FALLTHROUGH */
case LSRUN:
case LSONPROC:
case LSIDL:
nrun++;
}
}
lwp_unlock(l);
l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
if (l->l_slptime != 0)
continue;
lpctcpu = l->l_pctcpu;
lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
lpctcpu += ((FSCALE - ccpu) *
(lcpticks * FSCALE / clkhz)) >> FSHIFT;
l->l_pctcpu = lpctcpu;
}
/* Calculating p_pctcpu only for ps(1) */
p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
if (__predict_false(runtm < 0)) {
if (!backwardsproc) {
backwardsproc = true;
printf("WARNING: pid %ld (%s): "
"negative runtime; "
"monotonic clock has gone backwards\n",
(long)p->p_pid, p->p_comm);
}
mutex_exit(p->p_lock);
continue;
}
/*
* Check if the process exceeds its CPU resource allocation.
* If over the hard limit, kill it with SIGKILL.
* If over the soft limit, send SIGXCPU and raise
* the soft limit a little.
*/
rlim = &p->p_rlimit[RLIMIT_CPU];
sig = 0;
if (__predict_false(runtm >= rlim->rlim_cur)) {
if (runtm >= rlim->rlim_max) {
sig = SIGKILL;
log(LOG_NOTICE,
"pid %d, command %s, is killed: %s\n",
p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
uprintf("pid %d, command %s, is killed: %s\n",
p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
} else {
sig = SIGXCPU;
if (rlim->rlim_cur < rlim->rlim_max)
rlim->rlim_cur += 5;
}
}
mutex_exit(p->p_lock);
if (__predict_false(sig)) {
KASSERT((p->p_flag & PK_SYSTEM) == 0);
psignal(p, sig);
}
}
/* Load average calculation. */
if (__predict_false(lavg_count == 0)) {
int i;
CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
for (i = 0; i < __arraycount(cexp); i++) {
avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
}
}
/* Lightning bolt. */
cv_broadcast(&lbolt);
mutex_exit(&proc_lock);
}
/* $NetBSD: uipc_accf.c,v 1.13 2014/02/25 18:30:11 pooka Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 2000 Paycounter, Inc.
* Copyright (c) 2005 Robert N. M. Watson
* Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_accf.c,v 1.13 2014/02/25 18:30:11 pooka Exp $");
#define ACCEPT_FILTER_MOD
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/rwlock.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/queue.h>
#include <sys/once.h>
#include <sys/atomic.h>
#include <sys/module.h>
static krwlock_t accept_filter_lock;
static LIST_HEAD(, accept_filter) accept_filtlsthd =
LIST_HEAD_INITIALIZER(&accept_filtlsthd);
/*
* Names of Accept filter sysctl objects
*/
static struct sysctllog *ctllog;
static void
sysctl_net_inet_accf_setup(void)
{
sysctl_createv(&ctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET, CTL_EOL);
sysctl_createv(&ctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "accf",
SYSCTL_DESCR("Accept filters"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET, SO_ACCEPTFILTER, CTL_EOL);
}
int
accept_filt_add(struct accept_filter *filt)
{
struct accept_filter *p;
accept_filter_init();
rw_enter(&accept_filter_lock, RW_WRITER);
LIST_FOREACH(p, &accept_filtlsthd, accf_next) {
if (strcmp(p->accf_name, filt->accf_name) == 0) {
rw_exit(&accept_filter_lock);
return EEXIST;
}
}
LIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next);
rw_exit(&accept_filter_lock);
return 0;
}
int
accept_filt_del(struct accept_filter *p)
{
rw_enter(&accept_filter_lock, RW_WRITER);
if (p->accf_refcnt != 0) {
rw_exit(&accept_filter_lock);
return EBUSY;
}
LIST_REMOVE(p, accf_next);
rw_exit(&accept_filter_lock);
return 0;
}
struct accept_filter *
accept_filt_get(char *name)
{
struct accept_filter *p;
char buf[32];
u_int gen;
do {
rw_enter(&accept_filter_lock, RW_READER);
LIST_FOREACH(p, &accept_filtlsthd, accf_next) {
if (strcmp(p->accf_name, name) == 0) {
atomic_inc_uint(&p->accf_refcnt);
break;
}
}
rw_exit(&accept_filter_lock);
if (p != NULL) {
break;
}
/* Try to autoload a module to satisfy the request. */
strcpy(buf, "accf_");
strlcat(buf, name, sizeof(buf));
gen = module_gen;
(void)module_autoload(buf, MODULE_CLASS_ANY);
} while (gen != module_gen);
return p;
}
/*
* Accept filter initialization routine.
* This should be called only once.
*/
static int
accept_filter_init0(void)
{
rw_init(&accept_filter_lock);
sysctl_net_inet_accf_setup();
return 0;
}
/*
* Initialization routine: This can also be replaced with
* accept_filt_generic_mod_event for attaching a new accept filter.
*/
void
accept_filter_init(void)
{
static ONCE_DECL(accept_filter_init_once);
RUN_ONCE(&accept_filter_init_once, accept_filter_init0);
}
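/*
 * RUN_ONCE() with a static ONCE_DECL, as used above, is the standard idiom
 * for lazy, thread-safe one-time initialization. A hedged sketch of the
 * same idiom in some other, invented subsystem:
 */
#if 0	/* illustrative sketch only; not compiled */
#include <sys/once.h>

static int
mysubsys_init0(void)
{

	/* ...set up locks, lists, sysctl nodes... */
	return 0;
}

void
mysubsys_init(void)
{
	static ONCE_DECL(mysubsys_once);

	RUN_ONCE(&mysubsys_once, mysubsys_init0);
}
#endif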
int
accept_filt_getopt(struct socket *so, struct sockopt *sopt)
{
struct accept_filter_arg afa;
int error;
KASSERT(solocked(so));
if ((so->so_options & SO_ACCEPTCONN) == 0) {
error = EINVAL;
goto out;
}
if ((so->so_options & SO_ACCEPTFILTER) == 0) {
error = EINVAL;
goto out;
}
memset(&afa, 0, sizeof(afa));
strcpy(afa.af_name, so->so_accf->so_accept_filter->accf_name);
if (so->so_accf->so_accept_filter_str != NULL)
strcpy(afa.af_arg, so->so_accf->so_accept_filter_str);
error = sockopt_set(sopt, &afa, sizeof(afa));
out:
return error;
}
/*
* Simple delete case, with socket locked.
*/
int
accept_filt_clear(struct socket *so)
{
struct accept_filter_arg afa;
struct accept_filter *afp;
struct socket *so2, *next;
struct so_accf *af;
KASSERT(solocked(so));
if ((so->so_options & SO_ACCEPTCONN) == 0) {
return EINVAL;
}
if (so->so_accf != NULL) {
/* Break in-flight processing. */
for (so2 = TAILQ_FIRST(&so->so_q0); so2 != NULL; so2 = next) {
next = TAILQ_NEXT(so2, so_qe);
if (so2->so_upcall == NULL) {
continue;
}
so2->so_upcall = NULL;
so2->so_upcallarg = NULL;
so2->so_options &= ~SO_ACCEPTFILTER;
so2->so_rcv.sb_flags &= ~SB_UPCALL;
soisconnected(so2);
}
af = so->so_accf;
afp = af->so_accept_filter;
if (afp != NULL && afp->accf_destroy != NULL) {
(*afp->accf_destroy)(so);
}
if (af->so_accept_filter_str != NULL) {
kmem_free(af->so_accept_filter_str,
sizeof(afa.af_name));
}
kmem_free(af, sizeof(*af));
so->so_accf = NULL;
atomic_dec_uint(&afp->accf_refcnt);
}
so->so_options &= ~SO_ACCEPTFILTER;
return 0;
}
/*
* setsockopt() for accept filters. Called with the socket unlocked,
* will always return it locked.
*/
int
accept_filt_setopt(struct socket *so, const struct sockopt *sopt)
{
struct accept_filter_arg afa;
struct accept_filter *afp;
struct so_accf *newaf;
int error;
accept_filter_init();
if (sopt == NULL || sopt->sopt_size == 0) {
solock(so);
return accept_filt_clear(so);
}
/*
* Pre-allocate any memory we may need later to avoid blocking at
* untimely moments. This does not optimize for invalid arguments.
*/
error = sockopt_get(sopt, &afa, sizeof(afa));
if (error) {
solock(so);
return error;
}
afa.af_name[sizeof(afa.af_name)-1] = '\0';
afa.af_arg[sizeof(afa.af_arg)-1] = '\0';
afp = accept_filt_get(afa.af_name);
if (afp == NULL) {
solock(so);
return ENOENT;
}
/*
* Allocate the new accept filter instance storage. We may
* have to free it again later if we fail to attach it. If
* attached properly, 'newaf' is NULLed to avoid a free()
* while in use.
*/
newaf = kmem_zalloc(sizeof(*newaf), KM_SLEEP);
if (afp->accf_create != NULL && afa.af_name[0] != '\0') {
/*
* FreeBSD did a variable-size allocation here
* with the actual string length from afa.af_name
* but it is so short, why bother tracking it?
* XXX as others have noted, this is an API mistake;
* XXX accept_filter_arg should have a mandatory namelen.
* XXX (but it's a bit too late to fix that now)
*/
newaf->so_accept_filter_str =
kmem_alloc(sizeof(afa.af_name), KM_SLEEP);
strcpy(newaf->so_accept_filter_str, afa.af_name);
}
/*
* Require a listen socket; don't try to replace an existing filter
* without first removing it.
*/
solock(so);
if ((so->so_options & SO_ACCEPTCONN) == 0 || so->so_accf != NULL) {
error = EINVAL;
goto out;
}
/*
* Invoke the accf_create() method of the filter if required. The
* socket lock is held over this call, so create methods for filters
* shouldn't block.
*/
if (afp->accf_create != NULL) {
newaf->so_accept_filter_arg =
(*afp->accf_create)(so, afa.af_arg);
if (newaf->so_accept_filter_arg == NULL) {
error = EINVAL;
goto out;
}
}
newaf->so_accept_filter = afp;
so->so_accf = newaf;
so->so_options |= SO_ACCEPTFILTER;
newaf = NULL;
out:
if (newaf != NULL) {
if (newaf->so_accept_filter_str != NULL)
kmem_free(newaf->so_accept_filter_str,
sizeof(afa.af_name));
kmem_free(newaf, sizeof(*newaf));
atomic_dec_uint(&afp->accf_refcnt);
}
return error;
}
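/*
 * Illustrative userland sketch (not part of the original source): the
 * setsockopt(2) path above is normally reached by a server enabling an
 * accept filter on its listening socket.  The filter name "dataready"
 * is assumed to be provided by a loaded accept filter module; error
 * handling is omitted for brevity.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	static void
 *	set_accept_filter(int s)
 *	{
 *		struct accept_filter_arg afa;
 *
 *		memset(&afa, 0, sizeof(afa));
 *		strlcpy(afa.af_name, "dataready", sizeof(afa.af_name));
 *		(void)setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER,
 *		    &afa, sizeof(afa));
 *	}
 */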
/* $NetBSD: radix.c,v 1.49 2020/10/18 13:07:31 gson Exp $ */
/*
* Copyright (c) 1988, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)radix.c 8.6 (Berkeley) 10/17/95
*/
/*
* Routines to build and maintain radix trees for routing lookups.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: radix.c,v 1.49 2020/10/18 13:07:31 gson Exp $");
#ifndef _NET_RADIX_H_
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/kmem.h>
#ifdef _KERNEL
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#endif
#include <sys/systm.h>
#include <sys/malloc.h>
#define M_DONTWAIT M_NOWAIT
#include <sys/domain.h>
#else
#include <stdlib.h>
#endif
#include <sys/syslog.h>
#include <net/radix.h>
#endif
typedef void (*rn_printer_t)(void *, const char *fmt, ...);
int max_keylen;
struct radix_mask *rn_mkfreelist;
struct radix_node_head *mask_rnhead;
static char *addmask_key;
static const char normal_chars[] =
{0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, -1};
static char *rn_zeros, *rn_ones;
#define rn_masktop (mask_rnhead->rnh_treetop)
static int rn_satisfies_leaf(const char *, struct radix_node *, int);
static int rn_lexobetter(const void *, const void *);
static struct radix_mask *rn_new_radix_mask(struct radix_node *,
struct radix_mask *);
static struct radix_node *rn_walknext(struct radix_node *, rn_printer_t,
void *);
static struct radix_node *rn_walkfirst(struct radix_node *, rn_printer_t,
void *);
static void rn_nodeprint(struct radix_node *, rn_printer_t, void *,
const char *);
#define SUBTREE_OPEN "[ "
#define SUBTREE_CLOSE " ]"
#ifdef RN_DEBUG
static void rn_treeprint(struct radix_node_head *, rn_printer_t, void *);
#endif /* RN_DEBUG */
/*
* The data structure for the keys is a radix tree with one way
* branching removed. The index rn_b at an internal node n represents a bit
* position to be tested. The tree is arranged so that all descendants
* of a node n have keys whose bits all agree up to position rn_b - 1.
* (We say the index of n is rn_b.)
*
* There is at least one descendant which has a one bit at position rn_b,
* and at least one with a zero there.
*
* A route is determined by a pair of key and mask. We require that the
* bit-wise logical AND of the key and mask be the key.
* We define the index of the route associated with the mask to be
* the first bit number in the mask where 0 occurs (with bit number 0
* representing the highest order bit).
*
* We say a mask is normal if every bit is 0 past the index of the mask.
* If a node n has a descendant (k, m) with index(m) == index(n) == rn_b,
* and m is a normal mask, then the route applies to every descendant of n.
* If index(m) < rn_b, this implies that the last few bits of k
* before bit b are all 0 (and hence the same is true of every descendant
* of n), so the route applies to all descendants of the node as well.
*
* Similar logic shows that a non-normal mask m such that
* index(m) <= index(n) could potentially apply to many children of n.
* Thus, for each non-host route, we attach its mask to a list at an internal
* node as high in the tree as we can go.
*
* The present version of the code makes use of normal routes in short-
* circuiting an explicit mask and compare operation when testing whether
* a key satisfies a normal route, and also in remembering the unique leaf
* that governs a subtree.
*/
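/*
 * Illustrative example (not part of the original source): consider a
 * mask whose data bytes are 0xff 0xff 0xf0 0x00.  The first 0 bit is at
 * bit position 20 (bit 0 being the highest order bit), so index(m) = 20.
 * Every bit past position 20 is also 0, so the mask is normal in the
 * sense above.  A mask such as 0xff 0x00 0xff 0x00 is non-normal,
 * because a 1 bit appears after its first 0 bit (index 8).
 */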
struct radix_node *
rn_search(
const void *v_arg,
struct radix_node *head)
{
const u_char * const v = v_arg;
struct radix_node *x;
for (x = head; x->rn_b >= 0;) {
if (x->rn_bmask & v[x->rn_off])
x = x->rn_r;
else
x = x->rn_l;
}
return x;
}
struct radix_node *
rn_search_m(
const void *v_arg,
struct radix_node *head,
const void *m_arg)
{
struct radix_node *x;
const u_char * const v = v_arg;
const u_char * const m = m_arg;
for (x = head; x->rn_b >= 0;) {
if ((x->rn_bmask & m[x->rn_off]) &&
(x->rn_bmask & v[x->rn_off]))
x = x->rn_r;
else
x = x->rn_l;
}
return x;
}
int
rn_refines(
const void *m_arg,
const void *n_arg)
{
const char *m = m_arg;
const char *n = n_arg;
const char *lim = n + *(const u_char *)n;
const char *lim2 = lim;
int longer = (*(const u_char *)n++) - (int)(*(const u_char *)m++);
int masks_are_equal = 1;
if (longer > 0)
lim -= longer;
while (n < lim) {
if (*n & ~(*m))
return 0;
if (*n++ != *m++)
masks_are_equal = 0;
}
while (n < lim2)
if (*n++)
return 0;
if (masks_are_equal && (longer < 0))
for (lim2 = m - longer; m < lim2; )
if (*m++)
return 1;
return !masks_are_equal;
}
struct radix_node *
rn_lookup(
const void *v_arg,
const void *m_arg,
struct radix_node_head *head)
{
struct radix_node *x;
const char *netmask = NULL;
if (m_arg) {
if ((x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_off)) == 0)
return NULL;
netmask = x->rn_key;
}
x = rn_match(v_arg, head);
if (x != NULL && netmask != NULL) {
while (x != NULL && x->rn_mask != netmask)
x = x->rn_dupedkey;
}
return x;
}
static int
rn_satisfies_leaf(
const char *trial,
struct radix_node *leaf,
int skip)
{
const char *cp = trial;
const char *cp2 = leaf->rn_key;
const char *cp3 = leaf->rn_mask;
const char *cplim;
int length = uimin(*(const u_char *)cp, *(const u_char *)cp2);
if (cp3 == 0)
cp3 = rn_ones;
else
length = uimin(length, *(const u_char *)cp3);
cplim = cp + length; cp3 += skip; cp2 += skip;
for (cp += skip; cp < cplim; cp++, cp2++, cp3++)
if ((*cp ^ *cp2) & *cp3)
return 0;
return 1;
}
struct radix_node *
rn_match(
const void *v_arg,
struct radix_node_head *head)
{
const char * const v = v_arg;
struct radix_node *t = head->rnh_treetop;
struct radix_node *top = t;
struct radix_node *x;
struct radix_node *saved_t;
const char *cp = v;
const char *cp2;
const char *cplim;
int off = t->rn_off;
int vlen = *(const u_char *)cp;
int matched_off;
int test, b, rn_b;
/*
* Open code rn_search(v, top) to avoid overhead of extra
* subroutine call.
*/
for (; t->rn_b >= 0; ) {
if (t->rn_bmask & cp[t->rn_off])
t = t->rn_r;
else
t = t->rn_l;
}
/*
* See if we match exactly as a host destination
* or at least learn how many bits match, for normal mask finesse.
*
* It doesn't hurt us to limit how many bytes to check
* to the length of the mask, since if it matches we had a genuine
* match and the leaf we have is the most specific one anyway;
* if it didn't match with a shorter length it would fail
* with a long one. This wins big for class B&C netmasks which
* are probably the most common case...
*/
if (t->rn_mask)
vlen = *(const u_char *)t->rn_mask;
cp += off; cp2 = t->rn_key + off; cplim = v + vlen;
for (; cp < cplim; cp++, cp2++)
if (*cp != *cp2)
goto on1;
/*
* This extra grot is in case we are explicitly asked
* to look up the default. Ugh!
*/
if ((t->rn_flags & RNF_ROOT) && t->rn_dupedkey)
t = t->rn_dupedkey;
return t;
on1:
test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */
for (b = 7; (test >>= 1) > 0;)
b--;
matched_off = cp - v;
b += matched_off << 3;
rn_b = -1 - b;
/*
* If there is a host route in a duped-key chain, it will be first.
*/
if ((saved_t = t)->rn_mask == 0)
t = t->rn_dupedkey;
for (; t; t = t->rn_dupedkey)
/*
* Even if we don't match exactly as a host,
* we may match if the leaf we wound up at is
* a route to a net.
*/
if (t->rn_flags & RNF_NORMAL) {
if (rn_b <= t->rn_b)
return t;
} else if (rn_satisfies_leaf(v, t, matched_off))
return t;
t = saved_t;
/* start searching up the tree */
do {
struct radix_mask *m;
t = t->rn_p;
m = t->rn_mklist;
if (m) {
/*
* If non-contiguous masks ever become important
* we can restore the masking and open coding of
* the search and satisfaction test and put the
* calculation of "off" back before the "do".
*/
do {
if (m->rm_flags & RNF_NORMAL) {
if (rn_b <= m->rm_b)
return m->rm_leaf;
} else {
off = uimin(t->rn_off, matched_off);
x = rn_search_m(v, t, m->rm_mask);
while (x && x->rn_mask != m->rm_mask)
x = x->rn_dupedkey;
if (x && rn_satisfies_leaf(v, x, off))
return x;
}
m = m->rm_mklist;
} while (m);
}
} while (t != top);
return NULL;
}
static void
rn_nodeprint(struct radix_node *rn, rn_printer_t printer, void *arg,
const char *delim)
{
(*printer)(arg, "%s(%s%p: p<%p> l<%p> r<%p>)",
delim, ((void *)rn == arg) ? "*" : "", rn, rn->rn_p,
rn->rn_l, rn->rn_r);
}
#ifdef RN_DEBUG
int rn_debug = 1;
static void
rn_dbg_print(void *arg, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
vlog(LOG_DEBUG, fmt, ap);
va_end(ap);
}
static void
rn_treeprint(struct radix_node_head *h, rn_printer_t printer, void *arg)
{
struct radix_node *dup, *rn;
const char *delim;
if (printer == NULL)
return;
rn = rn_walkfirst(h->rnh_treetop, printer, arg);
for (;;) {
/* Process leaves */
delim = "";
for (dup = rn; dup != NULL; dup = dup->rn_dupedkey) {
if ((dup->rn_flags & RNF_ROOT) != 0)
continue;
rn_nodeprint(dup, printer, arg, delim);
delim = ", ";
}
rn = rn_walknext(rn, printer, arg);
if (rn->rn_flags & RNF_ROOT)
return;
}
/* NOTREACHED */
}
#define traverse(__head, __rn) rn_treeprint((__head), rn_dbg_print, (__rn))
#endif /* RN_DEBUG */
struct radix_node *
rn_newpair(
const void *v,
int b,
struct radix_node nodes[2])
{
struct radix_node *tt = nodes;
struct radix_node *t = tt + 1;
t->rn_b = b; t->rn_bmask = 0x80 >> (b & 7);
t->rn_l = tt; t->rn_off = b >> 3;
tt->rn_b = -1; tt->rn_key = v; tt->rn_p = t;
tt->rn_flags = t->rn_flags = RNF_ACTIVE;
return t;
}
struct radix_node *
rn_insert(
const void *v_arg,
struct radix_node_head *head,
int *dupentry,
struct radix_node nodes[2])
{
struct radix_node *top = head->rnh_treetop;
struct radix_node *t = rn_search(v_arg, top);
struct radix_node *tt;
const char *v = v_arg;
int head_off = top->rn_off;
int vlen = *((const u_char *)v);
const char *cp = v + head_off;
int b;
/*
* Find first bit at which v and t->rn_key differ
*/
{
const char *cp2 = t->rn_key + head_off;
const char *cplim = v + vlen;
int cmp_res;
while (cp < cplim)
if (*cp2++ != *cp++)
goto on1;
*dupentry = 1;
return t;
on1:
*dupentry = 0;
cmp_res = (cp[-1] ^ cp2[-1]) & 0xff;
for (b = (cp - v) << 3; cmp_res; b--)
cmp_res >>= 1;
}
{
struct radix_node *p, *x = top;
cp = v;
do {
p = x;
if (cp[x->rn_off] & x->rn_bmask)
x = x->rn_r;
else x = x->rn_l;
} while (b > (unsigned) x->rn_b); /* x->rn_b < b && x->rn_b >= 0 */
#ifdef RN_DEBUG
if (rn_debug)
log(LOG_DEBUG, "%s: Going In:\n", __func__), traverse(head, p);
#endif
t = rn_newpair(v_arg, b, nodes); tt = t->rn_l;
if ((cp[p->rn_off] & p->rn_bmask) == 0)
p->rn_l = t;
else
p->rn_r = t;
x->rn_p = t; t->rn_p = p; /* frees x, p as temp vars below */
if ((cp[t->rn_off] & t->rn_bmask) == 0) {
t->rn_r = x;
} else {
t->rn_r = tt; t->rn_l = x;
}
#ifdef RN_DEBUG
if (rn_debug) {
log(LOG_DEBUG, "%s: Coming Out:\n", __func__),
traverse(head, p);
}
#endif /* RN_DEBUG */
}
return tt;
}
struct radix_node *
rn_addmask(
const void *n_arg,
int search,
int skip)
{
const char *netmask = n_arg;
const char *cp;
const char *cplim;
struct radix_node *x;
struct radix_node *saved_x;
int b = 0, mlen, j;
int maskduplicated, m0, isnormal;
static int last_zeroed = 0;
if ((mlen = *(const u_char *)netmask) > max_keylen)
mlen = max_keylen;
if (skip == 0)
skip = 1;
if (mlen <= skip)
return mask_rnhead->rnh_nodes;
if (skip > 1)
memmove(addmask_key + 1, rn_ones + 1, skip - 1);
if ((m0 = mlen) > skip)
memmove(addmask_key + skip, netmask + skip, mlen - skip);
/*
* Trim trailing zeroes.
*/
for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;)
cp--;
mlen = cp - addmask_key;
if (mlen <= skip) {
if (m0 >= last_zeroed)
last_zeroed = mlen;
return mask_rnhead->rnh_nodes;
}
if (m0 < last_zeroed)
memset(addmask_key + m0, 0, last_zeroed - m0);
*addmask_key = last_zeroed = mlen;
x = rn_search(addmask_key, rn_masktop);
if (memcmp(addmask_key, x->rn_key, mlen) != 0)
x = 0;
if (x || search)
return x;
R_Malloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x));
if ((saved_x = x) == NULL)
return NULL;
memset(x, 0, max_keylen + 2 * sizeof (*x));
cp = netmask = (void *)(x + 2);
memmove(x + 2, addmask_key, mlen);
x = rn_insert(cp, mask_rnhead, &maskduplicated, x);
if (maskduplicated) {
log(LOG_ERR, "rn_addmask: mask impossibly already in tree\n");
Free(saved_x);
return x;
}
/*
* Calculate index of mask, and check for normalcy.
*/
cplim = netmask + mlen; isnormal = 1;
for (cp = netmask + skip; (cp < cplim) && *(const u_char *)cp == 0xff;)
cp++;
if (cp != cplim) {
for (j = 0x80; (j & *cp) != 0; j >>= 1)
b++;
if (*cp != normal_chars[b] || cp != (cplim - 1))
isnormal = 0;
}
b += (cp - netmask) << 3;
x->rn_b = -1 - b;
if (isnormal)
x->rn_flags |= RNF_NORMAL;
return x;
}
static int /* XXX: arbitrary ordering for non-contiguous masks */
rn_lexobetter(
const void *m_arg,
const void *n_arg)
{
const u_char *mp = m_arg;
const u_char *np = n_arg;
const u_char *lim;
if (*mp > *np)
return 1; /* not really, but need to check longer one first */
if (*mp == *np)
for (lim = mp + *mp; mp < lim;)
if (*mp++ > *np++)
return 1;
return 0;
}
static struct radix_mask *
rn_new_radix_mask(
struct radix_node *tt,
struct radix_mask *next)
{
struct radix_mask *m;
MKGet(m);
if (m == NULL) {
log(LOG_ERR, "Mask for route not entered\n");
return NULL;
}
memset(m, 0, sizeof(*m));
m->rm_b = tt->rn_b;
m->rm_flags = tt->rn_flags;
if (tt->rn_flags & RNF_NORMAL)
m->rm_leaf = tt;
else
m->rm_mask = tt->rn_mask;
m->rm_mklist = next;
tt->rn_mklist = m;
return m;
}
struct radix_node *
rn_addroute(
const void *v_arg,
const void *n_arg,
struct radix_node_head *head,
struct radix_node treenodes[2])
{
const char *v = v_arg, *netmask = n_arg;
struct radix_node *t, *x = NULL, *tt;
struct radix_node *saved_tt, *top = head->rnh_treetop;
short b = 0, b_leaf = 0;
int keyduplicated;
const char *mmask;
struct radix_mask *m, **mp;
/*
* In dealing with non-contiguous masks, there may be
* many different routes which have the same mask.
* We will find it useful to have a unique pointer to
* the mask to speed avoiding duplicate references at
* nodes and possibly save time in calculating indices.
*/
if (netmask != NULL) {
if ((x = rn_addmask(netmask, 0, top->rn_off)) == NULL)
return NULL;
b_leaf = x->rn_b;
b = -1 - x->rn_b;
netmask = x->rn_key;
}
/*
* Deal with duplicated keys: attach node to previous instance
*/
saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
if (keyduplicated) {
for (t = tt; tt != NULL; t = tt, tt = tt->rn_dupedkey) {
if (tt->rn_mask == netmask)
return NULL;
if (netmask == NULL ||
(tt->rn_mask != NULL &&
(b_leaf < tt->rn_b || /* index(netmask) > node */
rn_refines(netmask, tt->rn_mask) ||
rn_lexobetter(netmask, tt->rn_mask))))
break;
}
/*
* If the mask is not duplicated, we wouldn't
* find it among possible duplicate key entries
* anyway, so the above test doesn't hurt.
*
* We sort the masks for a duplicated key the same way as
* in a masklist -- most specific to least specific.
* This may require the unfortunate nuisance of relocating
* the head of the list.
*
* We also reverse, or doubly link the list through the
* parent pointer.
*/
if (tt == saved_tt) {
struct radix_node *xx = x;
/* link in at head of list */
(tt = treenodes)->rn_dupedkey = t;
tt->rn_flags = t->rn_flags;
tt->rn_p = x = t->rn_p;
t->rn_p = tt;
if (x->rn_l == t)
x->rn_l = tt;
else
x->rn_r = tt;
saved_tt = tt;
x = xx;
} else {
(tt = treenodes)->rn_dupedkey = t->rn_dupedkey;
t->rn_dupedkey = tt;
tt->rn_p = t;
if (tt->rn_dupedkey)
tt->rn_dupedkey->rn_p = tt;
}
tt->rn_key = v;
tt->rn_b = -1;
tt->rn_flags = RNF_ACTIVE;
}
/*
* Put mask in tree.
*/
if (netmask != NULL) {
tt->rn_mask = netmask;
tt->rn_b = x->rn_b;
tt->rn_flags |= x->rn_flags & RNF_NORMAL;
}
t = saved_tt->rn_p;
if (keyduplicated)
goto on2;
b_leaf = -1 - t->rn_b;
if (t->rn_r == saved_tt)
x = t->rn_l;
else
x = t->rn_r;
/* Promote general routes from below */
if (x->rn_b < 0) {
for (mp = &t->rn_mklist; x != NULL; x = x->rn_dupedkey) {
if (x->rn_mask != NULL && x->rn_b >= b_leaf &&
x->rn_mklist == NULL) {
*mp = m = rn_new_radix_mask(x, NULL);
if (m != NULL)
mp = &m->rm_mklist;
}
}
} else if (x->rn_mklist != NULL) {
/*
* Skip over masks whose index is > that of new node
*/
for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist)
if (m->rm_b >= b_leaf)
break;
t->rn_mklist = m;
*mp = NULL;
}
on2:
/* Add new route to highest possible ancestor's list */
if (netmask == NULL || b > t->rn_b)
return tt; /* can't lift at all */
b_leaf = tt->rn_b;
do {
x = t;
t = t->rn_p;
} while (b <= t->rn_b && x != top);
/*
* Search through routes associated with node to
* insert new route according to index.
* Need same criteria as when sorting dupedkeys to avoid
* double loop on deletion.
*/
for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist) {
if (m->rm_b < b_leaf)
continue;
if (m->rm_b > b_leaf)
break;
if (m->rm_flags & RNF_NORMAL) {
mmask = m->rm_leaf->rn_mask;
if (tt->rn_flags & RNF_NORMAL) {
log(LOG_ERR, "Non-unique normal route,"
" mask not entered\n");
return tt;
}
} else
mmask = m->rm_mask;
if (mmask == netmask) {
m->rm_refs++;
tt->rn_mklist = m;
return tt;
}
if (rn_refines(netmask, mmask) || rn_lexobetter(netmask, mmask))
break;
}
*mp = rn_new_radix_mask(tt, *mp);
return tt;
}
struct radix_node *
rn_delete1(
const void *v_arg,
const void *netmask_arg,
struct radix_node_head *head,
struct radix_node *rn)
{
struct radix_node *t, *p, *x, *tt;
struct radix_mask *m, *saved_m, **mp;
struct radix_node *dupedkey, *saved_tt, *top;
const char *v, *netmask;
int b, head_off, vlen;
v = v_arg;
netmask = netmask_arg;
x = head->rnh_treetop;
tt = rn_search(v, x);
head_off = x->rn_off;
vlen = *(const u_char *)v;
saved_tt = tt;
top = x;
if (tt == NULL ||
memcmp(v + head_off, tt->rn_key + head_off, vlen - head_off) != 0)
return NULL;
/*
* Delete our route from mask lists.
*/
if (netmask != NULL) {
if ((x = rn_addmask(netmask, 1, head_off)) == NULL)
return NULL;
netmask = x->rn_key;
while (tt->rn_mask != netmask)
if ((tt = tt->rn_dupedkey) == NULL)
return NULL;
}
if (tt->rn_mask == NULL || (saved_m = m = tt->rn_mklist) == NULL)
goto on1;
if (tt->rn_flags & RNF_NORMAL) {
if (m->rm_leaf != tt || m->rm_refs > 0) {
log(LOG_ERR, "rn_delete: inconsistent annotation\n");
return NULL; /* dangling ref could cause disaster */
}
} else {
if (m->rm_mask != tt->rn_mask) {
log(LOG_ERR, "rn_delete: inconsistent annotation\n");
goto on1;
}
if (--m->rm_refs >= 0)
goto on1;
}
b = -1 - tt->rn_b;
t = saved_tt->rn_p;
if (b > t->rn_b)
goto on1; /* Wasn't lifted at all */
do {
x = t;
t = t->rn_p;
} while (b <= t->rn_b && x != top);
for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist) {
if (m == saved_m) {
*mp = m->rm_mklist;
MKFree(m);
break;
}
}
if (m == NULL) {
log(LOG_ERR, "rn_delete: couldn't find our annotation\n");
if (tt->rn_flags & RNF_NORMAL)
return NULL; /* Dangling ref to us */
}
on1:
/*
* Eliminate us from tree
*/
if (tt->rn_flags & RNF_ROOT)
return NULL;
#ifdef RN_DEBUG
if (rn_debug)
log(LOG_DEBUG, "%s: Going In:\n", __func__), traverse(head, tt);
#endif
t = tt->rn_p;
dupedkey = saved_tt->rn_dupedkey;
if (dupedkey != NULL) {
/*
* Here, tt is the deletion target, and
* saved_tt is the head of the dupedkey chain.
*/
if (tt == saved_tt) {
x = dupedkey;
x->rn_p = t;
if (t->rn_l == tt)
t->rn_l = x;
else
t->rn_r = x;
} else {
/* find node in front of tt on the chain */
for (x = p = saved_tt;
p != NULL && p->rn_dupedkey != tt;)
p = p->rn_dupedkey;
if (p != NULL) {
p->rn_dupedkey = tt->rn_dupedkey;
if (tt->rn_dupedkey != NULL)
tt->rn_dupedkey->rn_p = p;
} else
log(LOG_ERR, "rn_delete: couldn't find us\n");
}
t = tt + 1;
if (t->rn_flags & RNF_ACTIVE) {
*++x = *t;
p = t->rn_p;
if (p->rn_l == t)
p->rn_l = x;
else
p->rn_r = x;
x->rn_l->rn_p = x;
x->rn_r->rn_p = x;
}
goto out;
}
if (t->rn_l == tt)
x = t->rn_r;
else
x = t->rn_l;
p = t->rn_p;
if (p->rn_r == t)
p->rn_r = x;
else
p->rn_l = x;
x->rn_p = p;
/*
* Demote routes attached to us.
*/
if (t->rn_mklist == NULL)
;
else if (x->rn_b >= 0) {
for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist)
;
*mp = t->rn_mklist;
} else {
/* If there are any key,mask pairs in a sibling
duped-key chain, some subset will appear sorted
in the same order attached to our mklist */
for (m = t->rn_mklist;
m != NULL && x != NULL;
x = x->rn_dupedkey) {
if (m == x->rn_mklist) {
struct radix_mask *mm = m->rm_mklist;
x->rn_mklist = NULL;
if (--(m->rm_refs) < 0)
MKFree(m);
m = mm;
}
}
if (m != NULL) {
log(LOG_ERR, "rn_delete: Orphaned Mask %p at %p\n",
m, x);
}
}
/*
* We may be holding an active internal node in the tree.
*/
x = tt + 1;
if (t != x) {
*t = *x;
t->rn_l->rn_p = t;
t->rn_r->rn_p = t;
p = x->rn_p;
if (p->rn_l == x)
p->rn_l = t;
else
p->rn_r = t;
}
out:
#ifdef RN_DEBUG
if (rn_debug) {
log(LOG_DEBUG, "%s: Coming Out:\n", __func__),
traverse(head, tt);
}
#endif /* RN_DEBUG */
tt->rn_flags &= ~RNF_ACTIVE;
tt[1].rn_flags &= ~RNF_ACTIVE;
return tt;
}
struct radix_node *
rn_delete(
const void *v_arg,
const void *netmask_arg,
struct radix_node_head *head)
{
return rn_delete1(v_arg, netmask_arg, head, NULL);
}
static struct radix_node *
rn_walknext(struct radix_node *rn, rn_printer_t printer, void *arg)
{
/* If at right child go back up, otherwise, go right */
while (rn->rn_p->rn_r == rn && (rn->rn_flags & RNF_ROOT) == 0) {
if (printer != NULL)
(*printer)(arg, SUBTREE_CLOSE);
rn = rn->rn_p;
}
if (printer)
rn_nodeprint(rn->rn_p, printer, arg, "");
/* Find the next *leaf* since next node might vanish, too */
for (rn = rn->rn_p->rn_r; rn->rn_b >= 0;) {
if (printer != NULL)
(*printer)(arg, SUBTREE_OPEN);
rn = rn->rn_l;
}
return rn;
}
static struct radix_node *
rn_walkfirst(struct radix_node *rn, rn_printer_t printer, void *arg)
{
/* First time through node, go left */
while (rn->rn_b >= 0) {
if (printer != NULL)
(*printer)(arg, SUBTREE_OPEN);
rn = rn->rn_l;
}
return rn;
}
int
rn_walktree(
struct radix_node_head *h,
int (*f)(struct radix_node *, void *),
void *w)
{
int error;
struct radix_node *base, *next, *rn;
/*
* This gets complicated because we may delete the node
* while applying the function f to it, so we need to calculate
* the successor node in advance.
*/
rn = rn_walkfirst(h->rnh_treetop, NULL, NULL);
for (;;) {
base = rn;
next = rn_walknext(rn, NULL, NULL);
/* Process leaves */
while ((rn = base) != NULL) {
base = rn->rn_dupedkey;
if (!(rn->rn_flags & RNF_ROOT) && (error = (*f)(rn, w)))
return error;
}
rn = next;
if (rn->rn_flags & RNF_ROOT)
return 0;
}
/* NOTREACHED */
}
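/*
 * Illustrative sketch (not part of the original source): rn_walktree()
 * applies a caller-supplied function to every leaf.  A hypothetical
 * callback that simply counts the leaves of an already-initialized head
 * "rnh" could look like this; a non-zero return from the callback
 * aborts the walk and propagates that value to the caller.
 *
 *	static int
 *	count_leaf(struct radix_node *rn, void *arg)
 *	{
 *		unsigned int *countp = arg;
 *
 *		(*countp)++;
 *		return 0;
 *	}
 *
 *	unsigned int count = 0;
 *	(void)rn_walktree(rnh, count_leaf, &count);
 */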
struct radix_node *
rn_search_matched(struct radix_node_head *h,
int (*matcher)(struct radix_node *, void *), void *w)
{
bool matched;
struct radix_node *base, *next, *rn;
/*
* This gets complicated because we may delete the node
* while applying the matcher function to it, so we need to calculate
* the successor node in advance.
*/
rn = rn_walkfirst(h->rnh_treetop, NULL, NULL);
for (;;) {
base = rn;
next = rn_walknext(rn, NULL, NULL);
/* Process leaves */
while ((rn = base) != NULL) {
base = rn->rn_dupedkey;
if (!(rn->rn_flags & RNF_ROOT)) {
matched = (*matcher)(rn, w);
if (matched)
return rn;
}
}
rn = next;
if (rn->rn_flags & RNF_ROOT)
return NULL;
}
/* NOTREACHED */
}
struct delayinit {
void **head;
int off;
SLIST_ENTRY(delayinit) entries;
};
static SLIST_HEAD(, delayinit) delayinits = SLIST_HEAD_INITIALIZER(delayheads);
static int radix_initialized;
/*
* Record a radix tree head to be initialized once radix itself has been
* initialized. Only for bootstrap.
* Assume that no concurrency protection is necessary at this stage.
*/
void
rn_delayedinit(void **head, int off)
{
struct delayinit *di;
if (radix_initialized)
return;
di = kmem_alloc(sizeof(*di), KM_SLEEP);
di->head = head;
di->off = off;
SLIST_INSERT_HEAD(&delayinits, di, entries);
}
int
rn_inithead(void **head, int off)
{
struct radix_node_head *rnh;
if (*head != NULL)
return 1;
R_Malloc(rnh, struct radix_node_head *, sizeof (*rnh));
if (rnh == NULL)
return 0;
*head = rnh;
return rn_inithead0(rnh, off);
}
int
rn_inithead0(struct radix_node_head *rnh, int off)
{
struct radix_node *t;
struct radix_node *tt;
struct radix_node *ttt;
memset(rnh, 0, sizeof(*rnh));
t = rn_newpair(rn_zeros, off, rnh->rnh_nodes);
ttt = rnh->rnh_nodes + 2;
t->rn_r = ttt;
t->rn_p = t;
tt = t->rn_l;
tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE;
tt->rn_b = -1 - off;
*ttt = *tt;
ttt->rn_key = rn_ones;
rnh->rnh_addaddr = rn_addroute;
rnh->rnh_deladdr = rn_delete;
rnh->rnh_matchaddr = rn_match;
rnh->rnh_lookup = rn_lookup;
rnh->rnh_treetop = t;
return 1;
}
void
rn_init(void)
{
char *cp, *cplim;
struct delayinit *di;
#ifdef _KERNEL
struct domain *dp;
if (radix_initialized)
panic("radix already initialized");
radix_initialized = 1;
DOMAIN_FOREACH(dp) {
if (dp->dom_maxrtkey > max_keylen)
max_keylen = dp->dom_maxrtkey;
}
#endif
if (max_keylen == 0) {
#ifndef _KERNEL
log(LOG_ERR,
"rn_init: radix functions require max_keylen be set\n");
#endif
return;
}
R_Malloc(rn_zeros, char *, 3 * max_keylen);
if (rn_zeros == NULL)
panic("rn_init");
memset(rn_zeros, 0, 3 * max_keylen);
rn_ones = cp = rn_zeros + max_keylen;
addmask_key = cplim = rn_ones + max_keylen;
while (cp < cplim)
*cp++ = -1;
if (rn_inithead((void *)&mask_rnhead, 0) == 0)
panic("rn_init 2");
while ((di = SLIST_FIRST(&delayinits)) != NULL) {
if (!rn_inithead(di->head, di->off))
panic("delayed rn_inithead failed");
SLIST_REMOVE_HEAD(&delayinits, entries);
kmem_free(di, sizeof(*di));
}
}
/* $NetBSD: compat_50_quota.c,v 1.4 2022/09/21 07:15:24 dholland Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: compat_50_quota.c,v 1.4 2022/09/21 07:15:24 dholland Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/module.h>
#include <sys/namei.h>
#include <sys/param.h>
#include <sys/quota.h>
#include <sys/quotactl.h>
#include <sys/systm.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <sys/vfs_syscalls.h>
#include <sys/vnode.h>
#include <ufs/ufs/quota1.h>
static const struct syscall_package vfs_syscalls_50_quota_syscalls[] = {
{ SYS_compat_50_quotactl, 0, (sy_call_t *)compat_50_sys_quotactl },
{ 0, 0, NULL }
};
/* ARGSUSED */
int
compat_50_sys_quotactl(struct lwp *l, const struct compat_50_sys_quotactl_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
syscallarg(int) cmd;
syscallarg(int) uid;
syscallarg(void *) arg;
} */
struct vnode *vp;
struct mount *mp;
int q1cmd;
int idtype;
char *qfile;
struct dqblk dqblk;
struct quotakey key;
struct quotaval blocks, files;
struct quotastat qstat;
int error;
error = namei_simple_user(SCARG(uap, path),
NSM_FOLLOW_TRYEMULROOT, &vp);
if (error != 0)
return (error);
mp = vp->v_mount;
q1cmd = SCARG(uap, cmd);
idtype = quota_idtype_from_ufs(q1cmd & SUBCMDMASK);
if (idtype == -1) {
return EINVAL;
}
switch ((q1cmd & ~SUBCMDMASK) >> SUBCMDSHIFT) {
case Q_QUOTAON:
qfile = PNBUF_GET();
error = copyinstr(SCARG(uap, arg), qfile, PATH_MAX, NULL);
if (error != 0) {
PNBUF_PUT(qfile);
break;
}
error = vfs_quotactl_quotaon(mp, idtype, qfile);
PNBUF_PUT(qfile);
break;
case Q_QUOTAOFF:
error = vfs_quotactl_quotaoff(mp, idtype);
break;
case Q_GETQUOTA:
key.qk_idtype = idtype;
key.qk_id = SCARG(uap, uid);
key.qk_objtype = QUOTA_OBJTYPE_BLOCKS;
error = vfs_quotactl_get(mp, &key, &blocks);
if (error) {
break;
}
key.qk_objtype = QUOTA_OBJTYPE_FILES;
error = vfs_quotactl_get(mp, &key, &files);
if (error) {
break;
}
quotavals_to_dqblk(&blocks, &files, &dqblk);
error = copyout(&dqblk, SCARG(uap, arg), sizeof(dqblk));
break;
case Q_SETQUOTA:
error = copyin(SCARG(uap, arg), &dqblk, sizeof(dqblk));
if (error) {
break;
}
dqblk_to_quotavals(&dqblk, &blocks, &files);
key.qk_idtype = idtype;
key.qk_id = SCARG(uap, uid);
key.qk_objtype = QUOTA_OBJTYPE_BLOCKS;
error = vfs_quotactl_put(mp, &key, &blocks);
if (error) {
break;
}
key.qk_objtype = QUOTA_OBJTYPE_FILES;
error = vfs_quotactl_put(mp, &key, &files);
break;
case Q_SYNC:
/*
* not supported but used only to see if quota is supported,
* emulate with stat
*
* XXX should probably be supported
*/
(void)idtype; /* not used */
error = vfs_quotactl_stat(mp, &qstat);
break;
case Q_SETUSE:
default:
error = EOPNOTSUPP;
break;
}
vrele(vp);
return error;
}
MODULE(MODULE_CLASS_EXEC, compat_50_quota, "compat_50,ufs");
static int
compat_50_quota_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return syscall_establish(NULL, vfs_syscalls_50_quota_syscalls);
case MODULE_CMD_FINI:
return syscall_disestablish(NULL, vfs_syscalls_50_quota_syscalls);
default:
return ENOTTY;
}
}
/* $NetBSD: hash.h,v 1.8 2014/09/05 05:46:15 matt Exp $ */
/*-
* Copyright (c) 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Luke Mewburn.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_HASH_H_
#define _SYS_HASH_H_
#include <sys/types.h>
#ifdef __HAVE_MACHINE_HASH_H
#include <machine/hash.h>
#endif
#ifndef __HAVE_HASH32_BUF /* not overridden by MD hash */
#define HASH32_BUF_INIT 5381
/*
* uint32_t
* hash32_buf(const void *bf, size_t len, uint32_t hash)
* return a 32 bit hash of the binary buffer buf (size len),
* seeded with an initial hash value of hash (usually HASH32_BUF_INIT).
*/
static __inline uint32_t
hash32_buf(const void *bf, size_t len, uint32_t hash)
{
const uint8_t *s = (const uint8_t *)bf;
while (len-- != 0) /* "nemesi": k=257, r=r*257 */
hash = hash * 257 + *s++;
return (hash * 257);
}
#endif /* __HAVE_HASH32_BUF */
#ifndef __HAVE_HASH32_STR /* not overridden by MD hash */
#define HASH32_STR_INIT 5381
/*
* uint32_t
* hash32_str(const void *bf, uint32_t hash)
* return a 32 bit hash of NUL terminated ASCII string buf,
* seeded with an initial hash value of hash (usually HASH32_STR_INIT).
*/
static __inline uint32_t
hash32_str(const void *bf, uint32_t hash)
{
const uint8_t *s = (const uint8_t *)bf;
uint8_t c;
while ((c = *s++) != 0)
hash = hash * 33 + c; /* "perl": k=33, r=r+r/32 */
return (hash + (hash >> 5));
}
/*
* uint32_t
* hash32_strn(const void *bf, size_t len, uint32_t hash)
* return a 32 bit hash of NUL terminated ASCII string buf up to
* a maximum of len bytes,
* seeded with an initial hash value of hash (usually HASH32_STR_INIT).
*/
static __inline uint32_t
hash32_strn(const void *bf, size_t len, uint32_t hash)
{
const uint8_t *s = (const uint8_t *)bf;
uint8_t c;
while ((c = *s++) != 0 && len-- != 0)
hash = hash * 33 + c; /* "perl": k=33, r=r+r/32 */
return (hash + (hash >> 5));
}
#endif /* __HAVE_HASH32_STR */
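/*
 * Illustrative sketch (not part of the original source): these hashes
 * are typically reduced to a bucket index for a power-of-two sized
 * table.  "namecache" and NC_HASHSIZE below are hypothetical names used
 * only for the example.
 *
 *	#define NC_HASHSIZE	128		// must be a power of two
 *	struct namecache *nc_hashtbl[NC_HASHSIZE];
 *
 *	static struct namecache **
 *	nc_bucket(const char *name)
 *	{
 *		uint32_t h = hash32_str(name, HASH32_STR_INIT);
 *
 *		return &nc_hashtbl[h & (NC_HASHSIZE - 1)];
 *	}
 */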
__BEGIN_DECLS
uint32_t murmurhash2(const void *, size_t, uint32_t);
__END_DECLS
#endif /* !_SYS_HASH_H_ */
/* $NetBSD: kern_fork.c,v 1.230 2023/02/25 08:22:00 skrll Exp $ */
/*-
* Copyright (c) 1999, 2001, 2004, 2006, 2007, 2008, 2019
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_fork.c 8.8 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_fork.c,v 1.230 2023/02/25 08:22:00 skrll Exp $");
#include "opt_ktrace.h"
#include "opt_dtrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/ras.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/acct.h>
#include <sys/ktrace.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/syscallargs.h>
#include <sys/uidinfo.h>
#include <sys/sdt.h>
#include <sys/ptrace.h>
/*
* DTrace SDT provider definitions
*/
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE3(proc, kernel, , create,
"struct proc *", /* new process */
"struct proc *", /* parent process */
"int" /* flags */);
u_int nprocs __cacheline_aligned = 1; /* process 0 */
/*
* Number of ticks to sleep if fork() would fail due to process hitting
* limits. Exported in milliseconds to userland via sysctl.
*/
int forkfsleep = 0;
int
sys_fork(struct lwp *l, const void *v, register_t *retval)
{
return fork1(l, 0, SIGCHLD, NULL, 0, NULL, NULL, retval);
}
/*
* vfork(2) system call compatible with 4.4BSD (i.e. BSD with Mach VM).
* Address space is not shared, but parent is blocked until child exit.
*/
int
sys_vfork(struct lwp *l, const void *v, register_t *retval)
{
return fork1(l, FORK_PPWAIT, SIGCHLD, NULL, 0, NULL, NULL,
retval);
}
/*
* New vfork(2) system call for NetBSD, which implements original 3BSD vfork(2)
* semantics. Address space is shared, and parent is blocked until child exit.
*/
int
sys___vfork14(struct lwp *l, const void *v, register_t *retval)
{
return fork1(l, FORK_PPWAIT|FORK_SHAREVM, SIGCHLD, NULL, 0,
NULL, NULL, retval);
}
/*
* Linux-compatible __clone(2) system call.
*/
int
sys___clone(struct lwp *l, const struct sys___clone_args *uap,
register_t *retval)
{
/* {
syscallarg(int) flags;
syscallarg(void *) stack;
} */
int flags, sig;
/*
* We don't support the CLONE_PTRACE flag.
*/
if (SCARG(uap, flags) & (CLONE_PTRACE))
return EINVAL;
/*
* Linux enforces CLONE_VM with CLONE_SIGHAND, do same.
*/
if (SCARG(uap, flags) & CLONE_SIGHAND
&& (SCARG(uap, flags) & CLONE_VM) == 0)
return EINVAL;
flags = 0;
if (SCARG(uap, flags) & CLONE_VM)
flags |= FORK_SHAREVM;
if (SCARG(uap, flags) & CLONE_FS)
flags |= FORK_SHARECWD;
if (SCARG(uap, flags) & CLONE_FILES)
flags |= FORK_SHAREFILES;
if (SCARG(uap, flags) & CLONE_SIGHAND)
flags |= FORK_SHARESIGS;
if (SCARG(uap, flags) & CLONE_VFORK)
flags |= FORK_PPWAIT;
sig = SCARG(uap, flags) & CLONE_CSIGNAL;
if (sig < 0 || sig >= _NSIG)
return EINVAL;
/*
* Note that the Linux API does not provide a portable way of
* specifying the stack area; the caller must know if the stack
* grows up or down. So, we pass a stack size of 0, so that the
* code that makes this adjustment is a noop.
*/
return fork1(l, flags, sig, SCARG(uap, stack), 0,
NULL, NULL, retval);
}
/*
* Print the 'table full' message once per 10 seconds.
*/
static struct timeval fork_tfmrate = { 10, 0 };
/*
* Check if a process is traced and shall inform about FORK events.
*/
static inline bool
tracefork(struct proc *p, int flags)
{
return (p->p_slflag & (PSL_TRACEFORK|PSL_TRACED)) ==
(PSL_TRACEFORK|PSL_TRACED) && (flags & FORK_PPWAIT) == 0;
}
/*
* Check if a process is traced and shall inform about VFORK events.
*/
static inline bool
tracevfork(struct proc *p, int flags)
{
return (p->p_slflag & (PSL_TRACEVFORK|PSL_TRACED)) ==
(PSL_TRACEVFORK|PSL_TRACED) && (flags & FORK_PPWAIT) != 0;
}
/*
* Check if a process is traced and shall inform about VFORK_DONE events.
*/
static inline bool
tracevforkdone(struct proc *p, int flags)
{
return (p->p_slflag & (PSL_TRACEVFORK_DONE|PSL_TRACED)) ==
(PSL_TRACEVFORK_DONE|PSL_TRACED) && (flags & FORK_PPWAIT);
}
/*
* General fork call. Note that another LWP in the process may call exec()
* or exit() while we are forking. It's safe to continue here, because
* neither operation will complete until all LWPs have exited the process.
*/
int
fork1(struct lwp *l1, int flags, int exitsig, void *stack, size_t stacksize,
void (*func)(void *), void *arg, register_t *retval)
{
struct proc *p1, *p2, *parent;
struct plimit *p1_lim;
uid_t uid;
struct lwp *l2;
int count;
vaddr_t uaddr;
int tnprocs;
int error = 0;
p1 = l1->l_proc;
uid = kauth_cred_getuid(l1->l_cred);
tnprocs = atomic_inc_uint_nv(&nprocs);
/*
* Although process entries are dynamically created, we still keep
* a global limit on the maximum number we will create.
*/
if (__predict_false(tnprocs >= maxproc))
error = -1;
else
error = kauth_authorize_process(l1->l_cred,
KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL);
if (error) {
static struct timeval lasttfm;
atomic_dec_uint(&nprocs);
if (ratecheck(&lasttfm, &fork_tfmrate))
tablefull("proc", "increase kern.maxproc or NPROC");
if (forkfsleep)
kpause("forkmx", false, forkfsleep, NULL);
return EAGAIN;
}
/*
* Enforce limits.
*/
count = chgproccnt(uid, 1);
if (__predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) {
if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT,
p1, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
&p1->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != 0) {
(void)chgproccnt(uid, -1);
atomic_dec_uint(&nprocs);
if (forkfsleep)
kpause("forkulim", false, forkfsleep, NULL);
return EAGAIN;
}
}
/*
* Allocate virtual address space for the U-area now, while it
* is still easy to abort the fork operation if we're out of
* kernel virtual address space.
*/
uaddr = uvm_uarea_alloc();
if (__predict_false(uaddr == 0)) {
(void)chgproccnt(uid, -1);
atomic_dec_uint(&nprocs);
return ENOMEM;
}
/* Allocate new proc. */
p2 = proc_alloc();
if (p2 == NULL) {
/* We were unable to allocate a process ID. */
uvm_uarea_free(uaddr);
mutex_enter(p1->p_lock);
uid = kauth_cred_getuid(p1->p_cred);
(void)chgproccnt(uid, -1);
mutex_exit(p1->p_lock);
atomic_dec_uint(&nprocs);
return EAGAIN;
}
/*
* We are now committed to the fork. From here on, we may
* block on resources, but resource allocation may NOT fail.
*/
/*
* Make a proc table entry for the new process.
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
*/
memset(&p2->p_startzero, 0,
(unsigned) ((char *)&p2->p_endzero - (char *)&p2->p_startzero));
memcpy(&p2->p_startcopy, &p1->p_startcopy,
(unsigned) ((char *)&p2->p_endcopy - (char *)&p2->p_startcopy));
TAILQ_INIT(&p2->p_sigpend.sp_info);
LIST_INIT(&p2->p_lwps);
LIST_INIT(&p2->p_sigwaiters);
/*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
* Inherit flags we want to keep. The flags related to SIGCHLD
* handling are important in order to keep a consistent behaviour
* for the child after the fork. If we are a 32-bit process, the
* child will be too.
*/
p2->p_flag =
p1->p_flag & (PK_SUGID | PK_NOCLDWAIT | PK_CLDSIGIGN | PK_32);
p2->p_emul = p1->p_emul;
p2->p_execsw = p1->p_execsw;
if (flags & FORK_SYSTEM) {
/*
* Mark it as a system process. Set P_NOCLDWAIT so that
* children are reparented to init(8) when they exit.
* init(8) can easily wait them out for us.
*/
p2->p_flag |= (PK_SYSTEM | PK_NOCLDWAIT);
}
mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
rw_init(&p2->p_reflock);
cv_init(&p2->p_waitcv, "wait");
cv_init(&p2->p_lwpcv, "lwpwait");
/*
* Share a lock between the processes if they are to share signal
* state: we must synchronize access to it.
*/
if (flags & FORK_SHARESIGS) {
p2->p_lock = p1->p_lock;
mutex_obj_hold(p1->p_lock);
} else
p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
kauth_proc_fork(p1, p2);
p2->p_raslist = NULL;
#if defined(__HAVE_RAS)
ras_fork(p1, p2);
#endif
/* bump references to the text vnode (for procfs) */
p2->p_textvp = p1->p_textvp;
if (p2->p_textvp)
vref(p2->p_textvp);
if (p1->p_path)
p2->p_path = kmem_strdupsize(p1->p_path, NULL, KM_SLEEP);
else
p2->p_path = NULL;
if (flags & FORK_SHAREFILES)
fd_share(p2);
else if (flags & FORK_CLEANFILES)
p2->p_fd = fd_init(NULL);
else
p2->p_fd = fd_copy();
/* XXX racy */
p2->p_mqueue_cnt = p1->p_mqueue_cnt;
if (flags & FORK_SHARECWD)
cwdshare(p2);
else
p2->p_cwdi = cwdinit();
/*
* Note: p_limit (rlimit stuff) is copy-on-write, so normally
* we just need to increase pl_refcnt.
*/
p1_lim = p1->p_limit;
if (!p1_lim->pl_writeable) {
lim_addref(p1_lim);
p2->p_limit = p1_lim;
} else {
p2->p_limit = lim_copy(p1_lim);
}
if (flags & FORK_PPWAIT) {
/* Mark ourselves as waiting for a child. */
p2->p_lflag = PL_PPWAIT;
l1->l_vforkwaiting = true;
p2->p_vforklwp = l1;
} else {
p2->p_lflag = 0;
l1->l_vforkwaiting = false;
}
p2->p_sflag = 0;
p2->p_slflag = 0;
parent = (flags & FORK_NOWAIT) ? initproc : p1;
p2->p_pptr = parent;
p2->p_ppid = parent->p_pid;
LIST_INIT(&p2->p_children);
p2->p_aio = NULL;
#ifdef KTRACE
/*
* Copy traceflag and tracefile if enabled.
* If not inherited, these were zeroed above.
*/
if (p1->p_traceflag & KTRFAC_INHERIT) {
mutex_enter(&ktrace_lock);
p2->p_traceflag = p1->p_traceflag;
if ((p2->p_tracep = p1->p_tracep) != NULL)
ktradref(p2);
mutex_exit(&ktrace_lock);
}
#endif
/*
* Create signal actions for the child process.
*/
p2->p_sigacts = sigactsinit(p1, flags & FORK_SHARESIGS);
mutex_enter(p1->p_lock);
p2->p_sflag |=
(p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP));
sched_proc_fork(p1, p2);
mutex_exit(p1->p_lock);
p2->p_stflag = p1->p_stflag;
/*
* p_stats.
* Copy parts of p_stats, and zero out the rest.
*/
p2->p_stats = pstatscopy(p1->p_stats);
/*
* Set up the new process address space.
*/
uvm_proc_fork(p1, p2, (flags & FORK_SHAREVM) ? true : false);
/*
* Finish creating the child process.
* It will return through a different path later.
*/
lwp_create(l1, p2, uaddr, (flags & FORK_PPWAIT) ? LWP_VFORK : 0,
stack, stacksize, (func != NULL) ? func : child_return, arg, &l2,
l1->l_class, &l1->l_sigmask, &l1->l_sigstk);
/*
* Inherit l_private from the parent.
* Note that we cannot use lwp_setprivate() here since that
* also sets the CPU TLS register, which is incorrect if the
* process has changed that without letting the kernel know.
*/
l2->l_private = l1->l_private;
/*
* If emulation has a process fork hook, call it now.
*/
if (p2->p_emul->e_proc_fork)
(*p2->p_emul->e_proc_fork)(p2, l1, flags);
/*
* ...and finally, any other random fork hooks that subsystems
* might have registered.
*/
doforkhooks(p2, p1);
SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);
/*
* It's now safe for the scheduler and other processes to see the
* child process.
*/
mutex_enter(&proc_lock);
if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT)
p2->p_lflag |= PL_CONTROLT;
LIST_INSERT_HEAD(&parent->p_children, p2, p_sibling);
p2->p_exitsig = exitsig; /* signal for parent on exit */
/*
* Trace fork(2) and vfork(2)-like events on demand in a debugger.
*/
if (tracefork(p1, flags) || tracevfork(p1, flags)) {
proc_changeparent(p2, p1->p_pptr);
SET(p2->p_slflag, PSL_TRACEDCHILD);
}
p2->p_oppid = p1->p_pid; /* Remember the original parent id. */
LIST_INSERT_AFTER(p1, p2, p_pglist);
LIST_INSERT_HEAD(&allproc, p2, p_list);
p2->p_trace_enabled = trace_is_enabled(p2);
#ifdef __HAVE_SYSCALL_INTERN
(*p2->p_emul->e_syscall_intern)(p2);
#endif
/*
* Update stats now that we know the fork was successful.
*/
KPREEMPT_DISABLE(l1);
CPU_COUNT(CPU_COUNT_FORKS, 1);
if (flags & FORK_PPWAIT)
CPU_COUNT(CPU_COUNT_FORKS_PPWAIT, 1);
if (flags & FORK_SHAREVM)
CPU_COUNT(CPU_COUNT_FORKS_SHAREVM, 1);
KPREEMPT_ENABLE(l1);
if (ktrpoint(KTR_EMUL))
p2->p_traceflag |= KTRFAC_TRC_EMUL;
/*
* Notify any interested parties about the new process.
*/
if (!SLIST_EMPTY(&p1->p_klist)) {
mutex_exit(&proc_lock);
knote_proc_fork(p1, p2);
mutex_enter(&proc_lock);
}
/*
* Make child runnable, set start time, and add to run queue except
* if the parent requested the child to start in SSTOP state.
*/
mutex_enter(p2->p_lock);
/*
* Start profiling.
*/
if ((p2->p_stflag & PST_PROFIL) != 0) {
mutex_spin_enter(&p2->p_stmutex);
startprofclock(p2);
mutex_spin_exit(&p2->p_stmutex);
}
getmicrotime(&p2->p_stats->p_start);
p2->p_acflag = AFORK;
lwp_lock(l2);
KASSERT(p2->p_nrlwps == 1);
KASSERT(l2->l_stat == LSIDL);
if (p2->p_sflag & PS_STOPFORK) {
p2->p_nrlwps = 0;
p2->p_stat = SSTOP;
p2->p_waited = 0;
p1->p_nstopchild++;
l2->l_stat = LSSTOP;
KASSERT(l2->l_wchan == NULL);
lwp_unlock(l2);
} else {
p2->p_nrlwps = 1;
p2->p_stat = SACTIVE;
setrunnable(l2);
/* LWP now unlocked */
}
/*
* Return child pid to parent process,
* marking us as parent via retval[1].
*/
if (retval != NULL) {
retval[0] = p2->p_pid;
retval[1] = 0;
}
mutex_exit(p2->p_lock);
/*
* Let the parent know that we are tracing its child.
*/
if (tracefork(p1, flags) || tracevfork(p1, flags)) {
mutex_enter(p1->p_lock);
eventswitch(TRAP_CHLD,
tracefork(p1, flags) ? PTRACE_FORK : PTRACE_VFORK,
retval[0]);
mutex_enter(&proc_lock);
}
/*
* Preserve synchronization semantics of vfork. If waiting for
* child to exec or exit, sleep until it clears p_vforkwaiting.
*/
while (l1->l_vforkwaiting)
cv_wait(&l1->l_waitcv, &proc_lock);
/*
* Let the parent know that we are tracing its child.
*/
if (tracevforkdone(p1, flags)) {
mutex_enter(p1->p_lock);
eventswitch(TRAP_CHLD, PTRACE_VFORK_DONE, retval[0]);
} else
mutex_exit(&proc_lock);
return 0;
}
/*
* MI code executed in each newly spawned process before returning to userland.
*/
void
child_return(void *arg)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) ==
(PSL_TRACED|PSL_TRACEDCHILD)) {
eventswitchchild(p, TRAP_CHLD,
ISSET(p->p_lflag, PL_PPWAIT) ? PTRACE_VFORK : PTRACE_FORK);
}
md_child_return(l);
/*
* Return SYS_fork for all fork types, including vfork(2) and clone(2).
*
* This approach simplifies the code and avoids extra locking.
*/
ktrsysret(SYS_fork, 0, 0);
}
/* $NetBSD: kern_condvar.c,v 1.63 2023/11/02 10:31:55 martin Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel condition variable implementation.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.63 2023/11/02 10:31:55 martin Exp $");
#include <sys/param.h>
#include <sys/condvar.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/lwp.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
/*
* Accessors for the private contents of the kcondvar_t data type.
*
* cv_opaque[0] sleepq_t
* cv_opaque[1] description for ps(1)
*
* cv_opaque[0] is protected by the interlock passed to cv_wait() (enqueue
* only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
* and dequeue).
*
* cv_opaque[1] (the wmesg) is static and does not change throughout the life
* of the CV.
*/
#define CV_SLEEPQ(cv) ((sleepq_t *)(cv)->cv_opaque)
#define CV_WMESG(cv) ((const char *)(cv)->cv_opaque[1])
#define CV_SET_WMESG(cv, v) (cv)->cv_opaque[1] = __UNCONST(v)
#define CV_DEBUG_P(cv) (CV_WMESG(cv) != nodebug)
#define CV_RA ((uintptr_t)__builtin_return_address(0))
static void cv_unsleep(lwp_t *, bool);
static inline void cv_wakeup_one(kcondvar_t *);
static inline void cv_wakeup_all(kcondvar_t *);
syncobj_t cv_syncobj = {
.sobj_name = "cv",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = cv_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
static const char deadcv[] = "deadcv";
/*
* cv_init:
*
* Initialize a condition variable for use.
*/
void
cv_init(kcondvar_t *cv, const char *wmesg)
{
KASSERT(wmesg != NULL);
CV_SET_WMESG(cv, wmesg);
sleepq_init(CV_SLEEPQ(cv));
}
/*
* cv_destroy:
*
* Tear down a condition variable.
*/
void
cv_destroy(kcondvar_t *cv)
{
sleepq_destroy(CV_SLEEPQ(cv));
#ifdef DIAGNOSTIC
KASSERT(cv_is_valid(cv));
KASSERT(!cv_has_waiters(cv));
CV_SET_WMESG(cv, deadcv);
#endif
}
/*
* cv_enter:
*
* Look up and lock the sleep queue corresponding to the given
* condition variable, and increment the number of waiters.
*/
static inline int
cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, bool catch_p)
{
sleepq_t *sq;
kmutex_t *mp;
int nlocks;
KASSERT(cv_is_valid(cv));
KASSERT(!cpu_intr_p());
KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);
mp = sleepq_hashlock(cv);
sq = CV_SLEEPQ(cv);
nlocks = sleepq_enter(sq, l, mp);
sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj, catch_p);
mutex_exit(mtx);
KASSERT(cv_has_waiters(cv));
return nlocks;
}
/*
* cv_unsleep:
*
* Remove an LWP from the condition variable and sleep queue. This
* is called when the LWP has not been awoken normally but instead
* interrupted: for example, when a signal is received. Must be
* called with the LWP locked. Will unlock if "unlock" is true.
*/
static void
cv_unsleep(lwp_t *l, bool unlock)
{
kcondvar_t *cv __diagused;
cv = (kcondvar_t *)(uintptr_t)l->l_wchan;
KASSERT(l->l_wchan == (wchan_t)cv);
KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
KASSERT(cv_is_valid(cv));
KASSERT(cv_has_waiters(cv));
sleepq_unsleep(l, unlock);
}
/*
* cv_wait:
*
* Wait non-interruptibly on a condition variable until awoken.
*/
void
cv_wait(kcondvar_t *cv, kmutex_t *mtx)
{
lwp_t *l = curlwp;
int nlocks;
KASSERT(mutex_owned(mtx));
nlocks = cv_enter(cv, mtx, l, false);
(void)sleepq_block(0, false, &cv_syncobj, nlocks);
mutex_enter(mtx);
}
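/*
 * Illustrative use of cv_wait() (hypothetical "sc" from the sketch above):
 * the condition must always be re-tested in a loop, because a wakeup may be
 * spurious or intended for another waiter.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (sc->sc_busy)
 *		cv_wait(&sc->sc_cv, &sc->sc_lock);
 *	sc->sc_busy = true;
 *	mutex_exit(&sc->sc_lock);
 */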
/*
* cv_wait_sig:
*
* Wait on a condition variable until awoken or a signal is received.
* Will also return early if the process is exiting. Returns zero if
* awoken normally, ERESTART if a signal was received and the system
* call is restartable, or EINTR otherwise.
*/
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx)
{
lwp_t *l = curlwp;
int error, nlocks;
KASSERT(mutex_owned(mtx));
nlocks = cv_enter(cv, mtx, l, true);
error = sleepq_block(0, true, &cv_syncobj, nlocks);
mutex_enter(mtx);
return error;
}
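/*
 * Illustrative use of cv_wait_sig() (hypothetical "sc"): an interruptible
 * wait must check the return value and back out on EINTR/ERESTART so the
 * signal can be delivered or the system call restarted.
 *
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (sc->sc_busy && error == 0)
 *		error = cv_wait_sig(&sc->sc_cv, &sc->sc_lock);
 *	if (error == 0)
 *		sc->sc_busy = true;
 *	mutex_exit(&sc->sc_lock);
 *	return error;
 */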
/*
* cv_timedwait:
*
* Wait on a condition variable until awoken or the specified timeout
* expires. Returns zero if awoken normally or EWOULDBLOCK if the
* timeout expired.
*
* timo is a timeout in ticks. timo = 0 specifies an infinite timeout.
*/
int
cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
lwp_t *l = curlwp;
int error, nlocks;
KASSERT(mutex_owned(mtx));
nlocks = cv_enter(cv, mtx, l, false);
error = sleepq_block(timo, false, &cv_syncobj, nlocks);
mutex_enter(mtx);
return error;
}
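/*
 * Illustrative use of cv_timedwait() (hypothetical "sc"), converting
 * milliseconds to ticks with mstohz(9). EWOULDBLOCK only means the ticks
 * ran out; the condition may have become true concurrently, so it is
 * re-tested before giving up.
 *
 *	int error;
 *
 *	mutex_enter(&sc->sc_lock);
 *	if (sc->sc_busy)
 *		(void)cv_timedwait(&sc->sc_cv, &sc->sc_lock, mstohz(100));
 *	error = sc->sc_busy ? ETIMEDOUT : 0;
 *	mutex_exit(&sc->sc_lock);
 */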
/*
* cv_timedwait_sig:
*
* Wait on a condition variable until awoken, the specified timeout
* expires, or a signal is received. Will also return early if the process is
* exiting. Returns zero if awoken normally, EWOULDBLOCK if the
* timeout expires, ERESTART if a signal was received and the system
* call is restartable, or EINTR otherwise.
*
* timo is a timeout in ticks. timo = 0 specifies an infinite timeout.
*/
int
cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
lwp_t *l = curlwp;
int error, nlocks;
KASSERT(mutex_owned(mtx));
nlocks = cv_enter(cv, mtx, l, true);
error = sleepq_block(timo, true, &cv_syncobj, nlocks);
mutex_enter(mtx);
return error;
}
/*
* Given a number of seconds, sec, and 2^64ths of a second, frac, we
* want a number of ticks for a timeout:
*
* timo = hz*(sec + frac/2^64)
* = hz*sec + hz*frac/2^64
* = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
* = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
*
* where frachi is the high 32 bits of frac and fraclo is the
* low 32 bits.
*
* We assume hz < INT_MAX/2 < UINT32_MAX, so
*
* hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
*
* since fraclo < 2^32.
*
* We clamp the result at INT_MAX/2 for a timeout in ticks, since we
* can't represent timeouts higher than INT_MAX in cv_timedwait, and
* spurious wakeup is OK. Moreover, we don't want to wrap around,
* because we compute end - start in ticks in order to compute the
* remaining timeout, and that difference cannot wrap around, so we use
* a timeout less than INT_MAX. Using INT_MAX/2 provides plenty of
* margin for paranoia and will exceed most waits in practice by far.
*/
static unsigned
bintime2timo(const struct bintime *bt)
{
KASSERT(hz < INT_MAX/2);
CTASSERT(INT_MAX/2 < UINT32_MAX);
if (bt->sec > ((INT_MAX/2)/hz))
return INT_MAX/2;
if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
return INT_MAX/2;
return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
}
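/*
 * Worked example of the conversion above, assuming hz = 100 (10 ms ticks):
 * for a 2.5 second timeout, bt->sec = 2 and bt->frac = 2^63, so
 * frachi = bt->frac >> 32 = 2^31 and
 *
 *	timo = hz*sec + (hz*frachi >> 32)
 *	     = 100*2 + (100*2^31 >> 32)
 *	     = 200 + 50
 *	     = 250 ticks.
 */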
/*
* timo is in units of ticks. We want units of seconds and 2^64ths of
* a second. We know hz = 1 sec/tick, and 2^64 = 1 sec/(2^64th of a
* second), from which we can conclude 2^64 / hz = 1 (2^64th of a
* second)/tick. So for the fractional part, we compute
*
* frac = rem * 2^64 / hz
* = ((rem * 2^32) / hz) * 2^32
*
* Using truncating integer division instead of real division will
* leave us with only about 32 bits of precision, which means about
* 1/4-nanosecond resolution, which is good enough for our purposes.
*/
static struct bintime
timo2bintime(unsigned timo)
{
return (struct bintime) {
.sec = timo / hz,
.frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
};
}
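/*
 * Worked example of the reverse conversion, again assuming hz = 100:
 * timo = 250 ticks gives
 *
 *	sec  = 250 / 100 = 2
 *	frac = ((250 % 100) * 2^32 / 100) << 32
 *	     = 2^31 << 32
 *	     = 2^63,
 *
 * i.e. 2.5 seconds, inverting the bintime2timo() example above exactly;
 * in general the truncating division loses only the sub-quarter-nanosecond
 * part noted in the comment.
 */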
/*
* cv_timedwaitbt:
*
* Wait on a condition variable until awoken or the specified
* timeout expires. Returns zero if awoken normally or
* EWOULDBLOCK if the timeout expires.
*
* On entry, bt is a timeout in bintime. cv_timedwaitbt subtracts
* the time slept, so on exit, bt is the time remaining after
* sleeping, possibly negative if the complete time has elapsed.
* No infinite timeout; use cv_wait instead.
*
* epsilon is a requested maximum error in timeout (excluding
* spurious wakeups). Currently not used, will be used in the
* future to choose between low- and high-resolution timers.
* Actual wakeup time will be somewhere in [t, t + max(e, r) + s)
* where r is the finest resolution of clock available and s is
* scheduling delays for scheduler overhead and competing threads.
* Time is measured by the interrupt source implementing the
* timeout, not by another timecounter.
*/
int
cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
const struct bintime *epsilon __diagused)
{
struct bintime slept;
unsigned start, end;
int timo;
int error;
KASSERTMSG(bt->sec >= 0, "negative timeout");
KASSERTMSG(epsilon != NULL, "specify maximum requested delay");
/* If there's nothing left to wait for, time out. */
if (bt->sec == 0 && bt->frac == 0)
return EWOULDBLOCK;
/* Convert to ticks, but clamp to be >=1. */
timo = bintime2timo(bt);
KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
if (timo == 0)
timo = 1;
/*
* getticks() is technically int, but nothing special happens on
* overflow, so we assume two's-complement wraparound and just treat
* it as unsigned.
*/
start = getticks();
error = cv_timedwait(cv, mtx, timo);
end = getticks();
/*
* Set bt to the time remaining, or zero, whichever is larger. We
* do not fail with EWOULDBLOCK here because this may have been
* an explicit wakeup, so the caller needs to check before they
* give up or else cv_signal would be lost.
*/
slept = timo2bintime(end - start);
if (bintimecmp(bt, &slept, <=)) {
bt->sec = 0;
bt->frac = 0;
} else {
/* bt := bt - slept */
bintime_sub(bt, &slept);
}
return error;
}
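/*
 * Illustrative use of cv_timedwaitbt() (hypothetical "sc"; the 5 second
 * budget and zero epsilon are arbitrary choices): because bt is updated in
 * place with the time remaining, the caller can loop on the condition
 * without recomputing the deadline, and must still re-test the condition
 * when EWOULDBLOCK is returned.
 *
 *	struct bintime bt = { .sec = 5, .frac = 0 };
 *	const struct bintime epsilon = { .sec = 0, .frac = 0 };
 *	int error = 0;
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (sc->sc_busy && error != EWOULDBLOCK)
 *		error = cv_timedwaitbt(&sc->sc_cv, &sc->sc_lock, &bt,
 *		    &epsilon);
 *	error = sc->sc_busy ? ETIMEDOUT : 0;
 *	mutex_exit(&sc->sc_lock);
 */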
/*
* cv_timedwaitbt_sig:
*
* Wait on a condition variable until awoken, the specified
* timeout expires, or interrupted by a signal. Returns zero if
* awoken normally, EWOULDBLOCK if the timeout expires, or
* EINTR/ERESTART if interrupted by a signal.
*
* On entry, bt is a timeout in bintime. cv_timedwaitbt_sig
* subtracts the time slept, so on exit, bt is the time remaining
* after sleeping. No infinite timeout; use cv_wait_sig instead.
*
* epsilon is a requested maximum error in timeout (excluding
* spurious wakeups). Currently not used, will be used in the
* future to choose between low- and high-resolution timers.
*/
int
cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
const struct bintime *epsilon __diagused)
{
struct bintime slept;
unsigned start, end;
int timo;
int error;
KASSERTMSG(bt->sec >= 0, "negative timeout");
KASSERTMSG(epsilon != NULL, "specify maximum requested delay");
/* If there's nothing left to wait for, time out. */
if (bt->sec == 0 && bt->frac == 0)
return EWOULDBLOCK;
/* Convert to ticks, but clamp to be >=1. */
timo = bintime2timo(bt);
KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
if (timo == 0)
timo = 1;
/*
* getticks() is technically int, but nothing special happens on
* overflow, so we assume two's-complement wraparound and just treat
* it as unsigned.
*/
start = getticks();
error = cv_timedwait_sig(cv, mtx, timo);
end = getticks();
/*
* Set bt to the time remaining, or zero, whichever is larger. We
* do not fail with EWOULDBLOCK here because this may have been
* an explicit wakeup, so the caller needs to check before they
* give up or else cv_signal would be lost.
*/
slept = timo2bintime(end - start);
if (bintimecmp(bt, &slept, <=)) {
bt->sec = 0;
bt->frac = 0;
} else {
/* bt := bt - slept */
bintime_sub(bt, &slept);
}
return error;
}
/*
* cv_signal:
*
* Wake the highest priority LWP waiting on a condition variable. Must
* be called with the interlocking mutex held or just after it has been
* released (so the awoken LWP will see the changed condition).
*/
void
cv_signal(kcondvar_t *cv)
{
KASSERT(cv_is_valid(cv));
if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
/*
* The compiler usually turns this into a tail call (a plain jmp),
* because the arguments are the same and there are no locals.
*/
cv_wakeup_one(cv);
}
}
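/*
 * Illustrative use of cv_signal() (hypothetical "sc"): the condition is
 * changed under the interlock before exactly one waiter is woken.
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_busy = false;
 *	cv_signal(&sc->sc_cv);
 *	mutex_exit(&sc->sc_lock);
 */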
/*
* cv_wakeup_one:
*
* Slow path for cv_signal(). Deliberately marked __noinline to
* prevent the compiler from pulling it into cv_signal(), which adds
* extra prologue and epilogue code.
*/
static __noinline void
cv_wakeup_one(kcondvar_t *cv)
{
sleepq_t *sq;
kmutex_t *mp;
lwp_t *l;
mp = sleepq_hashlock(cv);
sq = CV_SLEEPQ(cv);
if (__predict_true((l = LIST_FIRST(sq)) != NULL)) {
KASSERT(l->l_sleepq == sq);
KASSERT(l->l_mutex == mp);
KASSERT(l->l_wchan == cv);
sleepq_remove(sq, l, true);
}
mutex_spin_exit(mp);
}
/*
* cv_broadcast:
*
* Wake all LWPs waiting on a condition variable. Must be called with
* the interlocking mutex held or just after it has been released (so
* the awoken LWP will see the changed condition).
*/
void
cv_broadcast(kcondvar_t *cv)
{
KASSERT(cv_is_valid(cv));
if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
/*
* The compiler usually turns this into a tail call (a plain jmp),
* because the arguments are the same and there are no locals.
*/
cv_wakeup_all(cv);
}
}
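/*
 * Illustrative use of cv_broadcast() (hypothetical "sc"): broadcast is the
 * right call when every waiter may now be able to proceed, for example when
 * the object being waited on is torn down.
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_dying = true;
 *	cv_broadcast(&sc->sc_cv);
 *	mutex_exit(&sc->sc_lock);
 */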
/*
* cv_wakeup_all:
*
* Slow path for cv_broadcast(). Deliberately marked __noinline to
* prevent the compiler from pulling it into cv_broadcast(), which adds
* extra prologue and epilogue code.
*/
static __noinline void
cv_wakeup_all(kcondvar_t *cv)
{
sleepq_t *sq;
kmutex_t *mp;
lwp_t *l;
mp = sleepq_hashlock(cv);
sq = CV_SLEEPQ(cv);
while ((l = LIST_FIRST(sq)) != NULL) {
KASSERT(l->l_sleepq == sq);
KASSERT(l->l_mutex == mp);
KASSERT(l->l_wchan == cv);
sleepq_remove(sq, l, true);
}
mutex_spin_exit(mp);
}
/*
* cv_has_waiters:
*
* For diagnostic assertions: return non-zero if a condition
* variable has waiters.
*/
bool
cv_has_waiters(kcondvar_t *cv)
{
return !LIST_EMPTY(CV_SLEEPQ(cv));
}
/*
* cv_is_valid:
*
* For diagnostic assertions: return non-zero if a condition
* variable appears to be valid. No locks need be held.
*/
bool
cv_is_valid(kcondvar_t *cv)
{
return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL;
}